In [1]:
import numpy as np
import pandas as pd

In [6]:
# Load the raw dataset from the local data directory.
train_data = pd.read_csv(filepath_or_buffer='data/adult.csv')


In [7]:
# Check whether the two target classes are balanced.
train_data.loc[:, 'income'].value_counts()

<=50K    37155
>50K     11687
Name: income, dtype: int64

In [8]:
# Summarise the whole frame: column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [9]:
# Count the distinct values found in every column.
train_data.nunique(axis=0)

age                   74
workclass              9
fnlwgt             28523
education             16
educational-num       16
marital-status         7
occupation            15
relationship           6
race                   5
gender                 2
capital-gain         123
capital-loss          99
hours-per-week        96
native-country        42
income                 2
dtype: int64

In [10]:
# 'education' and 'educational-num' each have 16 distinct values and appear to
# carry the same information, so only the numeric column is kept.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
train_data = train_data.drop(columns=['education'], errors='ignore')


In [11]:
# Count the missing (NaN) entries in each column.
train_data.isna().sum()

age                0
workclass          0
fnlwgt             0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [14]:
# Some cells contain a literal '?', which marks a missing value but is not
# detected by isnull(); convert those cells to real NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
train_data = train_data.replace('?', np.nan)


In [15]:
# Re-count missing entries now that '?' has been converted to NaN.
train_data.isna().sum()

age                   0
workclass          2799
fnlwgt                0
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
capital-gain          0
capital-loss          0
hours-per-week        0
native-country      857
income                0
dtype: int64

In [17]:
# Work on an independent copy for the scikit-learn pipeline so that
# train_data itself is left unmodified by the steps below.
sk_train = train_data.copy(deep=True)


In [21]:
# Replace missing values with the most frequent value of each column.

cols = ['workclass', 'occupation', 'native-country']

for col in cols:
    # value_counts() skips NaN and returns counts sorted in descending order,
    # so the first index entry is already the most frequent category.
    most_frequent = sk_train[col].value_counts().index[0]
    # Assign the filled column back to the frame. Calling
    # fillna(..., inplace=True) on the column selection is a chained in-place
    # operation: it emits a FutureWarning in pandas 2.x and leaves sk_train
    # unchanged once Copy-on-Write is enabled.
    sk_train[col] = sk_train[col].fillna(most_frequent)
    print('All the missing values in column', col, 'are replaced with', most_frequent)

All the missing values in column workclass are replaced with Private
All the missing values in column occupation are replaced with Prof-specialty
All the missing values in column native-country are replaced with United-States


In [22]:
# Convert the two categorical income classes into integer codes.

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
sk_train['income'] = le.fit_transform(sk_train['income'])

# Keep both lookup directions: class name -> code and code -> class name.
class_codes = le.transform(le.classes_)
to_label = {name: code for name, code in zip(le.classes_, class_codes)}
to_class = {code: name for name, code in to_label.items()}
print(to_label)

{'<=50K': 0, '>50K': 1}


In [24]:
# Ordinally encode the remaining categorical (object-dtype) columns.

from sklearn.preprocessing import OrdinalEncoder

categorical_cols = sk_train.select_dtypes(include=["object"]).columns

# Categories not seen during fit are mapped to -1 at transform time.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

encoded_values = enc.fit_transform(sk_train[categorical_cols])
sk_train[categorical_cols] = encoded_values


In [25]:
# Inspect dtypes and non-null counts after imputation and encoding.
sk_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              48842 non-null  int64  
 1   workclass        48842 non-null  float64
 2   fnlwgt           48842 non-null  int64  
 3   educational-num  48842 non-null  int64  
 4   marital-status   48842 non-null  float64
 5   occupation       48842 non-null  float64
 6   relationship     48842 non-null  float64
 7   race             48842 non-null  float64
 8   gender           48842 non-null  float64
 9   capital-gain     48842 non-null  int64  
 10  capital-loss     48842 non-null  int64  
 11  hours-per-week   48842 non-null  int64  
 12  native-country   48842 non-null  float64
 13  income           48842 non-null  int64  
dtypes: float64(7), int64(7)
memory usage: 5.2 MB


In [26]:
# Separate the target vector from the feature matrix.
sk_label = sk_train.loc[:, 'income']
sk_train = sk_train.drop('income', axis='columns')

## Scikit-learn

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Depth and leaf-size limits constrain the tree; class_weight gives more weight
# to class 1 ('>50K'), the minority class.
# random_state is fixed because scikit-learn permutes the candidate features at
# every split, so ties between equally good splits would otherwise be broken
# differently from run to run.
clf = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=50,
    min_samples_leaf=8,
    class_weight={0: 0.4, 1: 0.6},
    random_state=42,
)
clf.fit(sk_train, sk_label)

In [28]:
# Report the size of the fitted tree.
print('-- Details of the acheived Decision Tree --')
print('Depth:', clf.get_depth())
print('Number of leaves:', clf.get_n_leaves())

# List the features from most to least important.
print('\n--Feature importances --')
ranked_importances = sorted(
    zip(clf.feature_names_in_, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, score in ranked_importances:
    print(name, '\t', score)

-- Details of the acheived Decision Tree --
Depth: 10
Number of leaves: 220

--Feature importances --
relationship 	 0.4518263049589671
educational-num 	 0.1919357156262058
capital-gain 	 0.18509090877180215
capital-loss 	 0.051091839712875516
age 	 0.0495258986705502
hours-per-week 	 0.04156813132311751
occupation 	 0.009914979456598484
workclass 	 0.009015096325050743
fnlwgt 	 0.006309790816457513
gender 	 0.0019926412059697437
native-country 	 0.000968548439475521
race 	 0.0007601446929297824
marital-status 	 0.0


## H2O library

In [29]:
# Connect to the H2O cluster; h2o.init() first checks whether an instance is
# already running on localhost.
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,21 mins 35 secs
H2O_cluster_timezone:,Asia/Tehran
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.4
H2O_cluster_version_age:,2 years and 12 days !!!
H2O_cluster_name:,H2O_from_python_hamed_fuhgrk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.373 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [32]:
# Upload the pandas frame to H2O. Note that this is train_data (with '?'
# replaced by NaN and 'education' dropped), not the imputed and encoded sk_train.
hf_train = h2o.H2OFrame(python_obj=train_data)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [33]:
# Every column except the target is used as a predictor.
TARGET = 'income'
features = [column for column in hf_train.columns if column != TARGET]

In [34]:
from h2o.estimators import H2ORandomForestEstimator

# A one-tree forest trained on all rows (ntrees=1, sample_rate=1), with depth and
# minimum-rows limits matching the scikit-learn tree (max_depth=10, min_rows=8).
# NOTE(review): mtries=-2 is presumably "use all columns at each split" - confirm
# against the H2O DRF documentation.
# NOTE(review): with sample_rate=1 there are presumably no out-of-bag rows to
# score on, which would explain NaN training metrics - confirm.
# seed is fixed so that the 5-fold assignment and class balancing are
# reproducible between runs.
model = H2ORandomForestEstimator(ntrees=1, sample_rate=1, mtries=-2, max_depth=10,
                                 min_rows=8, min_split_improvement=1e-4,
                                 balance_classes=True, nfolds=5, seed=42)

model.train(x=features, y=TARGET, training_frame=hf_train)

drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  DRF_model_python_1723740011878_19


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,1.0,1.0,4311.0,10.0,10.0,10.0,324.0,324.0,324.0




ModelMetricsBinomial: drf
** Reported on train data. **

MSE: NaN
RMSE: NaN
LogLoss: NaN
Mean Per-Class Error: NaN
AUC: NaN
AUCPR: NaN
Gini: NaN

ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.09966627650294724
RMSE: 0.3156996618670144
LogLoss: 0.4730936259851968
Mean Per-Class Error: 0.18451886329445527
AUC: 0.9015570406117387
AUCPR: 0.775584356659505
Gini: 0.8031140812234774

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3650238563653718: 


Unnamed: 0,Unnamed: 1,<=50K,>50K,Error,Rate
0,<=50K,31687.0,5468.0,0.1472,(5468.0/37155.0)
1,>50K,2593.0,9094.0,0.2219,(2593.0/11687.0)
2,Total,34280.0,14562.0,0.165,(8061.0/48842.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.365024,0.692903,140.0
1,max f2,0.133865,0.785061,266.0
2,max f0point5,0.626279,0.728195,70.0
3,max accuracy,0.523881,0.857152,93.0
4,max precision,0.996918,0.969242,1.0
5,max recall,0.0,1.0,399.0
6,max specificity,1.0,0.997685,0.0
7,max absolute_mcc,0.437944,0.592052,110.0
8,max min_per_class_accuracy,0.309383,0.82207,161.0
9,max mean_per_class_accuracy,0.259269,0.824713,188.0



Gains/Lift Table: Avg response rate: 23.93 %, avg score: 23.88 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.055895,1.0,4.047522,4.047522,0.968498,1.0,0.968498,1.0,0.226234,0.226234,304.752182,304.752182,0.22392
1,2,0.10319,0.754394,3.225743,3.670873,0.771861,0.792598,0.878373,0.904941,0.152563,0.378797,222.574296,267.087318,0.362299
2,3,0.154478,0.577084,2.752749,3.366049,0.658683,0.671992,0.805434,0.8276,0.141183,0.519979,175.274897,236.604864,0.480469
3,4,0.207281,0.502169,2.156836,3.058012,0.516092,0.531826,0.731727,0.752254,0.113887,0.633867,115.683592,205.801233,0.560768
4,5,0.300766,0.358073,1.577029,2.597688,0.377354,0.396185,0.621579,0.641579,0.147429,0.781295,57.702931,159.768773,0.631679
5,6,0.401622,0.196708,1.068973,2.213794,0.255786,0.268996,0.529721,0.548016,0.107812,0.889108,6.89725,121.379441,0.640823
6,7,0.501003,0.066682,0.560495,1.885837,0.134116,0.12804,0.451246,0.464707,0.055703,0.94481,-43.950517,88.583707,0.583406
7,8,0.600631,0.01792,0.261091,1.616338,0.062474,0.040697,0.38676,0.394376,0.026012,0.970822,-73.890902,61.633835,0.486634
8,9,0.717722,0.008075,0.08696,1.36683,0.020808,0.011531,0.327057,0.331917,0.010182,0.981005,-91.304045,36.682994,0.346097
9,10,0.817821,0.001713,0.031628,1.203406,0.007568,0.004098,0.287953,0.291793,0.003166,0.98417,-96.837197,20.340609,0.218675




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.831139,0.009321,0.829742,0.822022,0.827953,0.829089,0.84689
1,auc,0.901254,0.004068,0.902525,0.895616,0.898943,0.902992,0.906196
2,err,0.168861,0.009321,0.170258,0.177978,0.172047,0.170911,0.15311
3,err_count,1650.2,106.08817,1667.0,1757.0,1684.0,1671.0,1472.0
4,f0point5,0.643581,0.016472,0.634519,0.631303,0.637499,0.642467,0.672114
5,f1,0.693529,0.0056,0.693171,0.686977,0.689299,0.697994,0.700204
6,f2,0.752444,0.013593,0.763771,0.753419,0.750261,0.764026,0.730743
7,lift_top_group,4.050265,0.054941,4.137688,3.997333,4.024526,4.067439,4.02434
8,logloss,0.472981,0.032052,0.478759,0.504129,0.497574,0.424779,0.459666
9,max_per_class_error,0.201979,0.026815,0.180592,0.194653,0.202732,0.184544,0.247373



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
0,,2024-08-15 21:32:08,1.267 sec,0.0,,,,,,



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,relationship,5315.946777,1.0,0.504398
1,capital-gain,1534.770264,0.288711,0.145625
2,occupation,1152.582886,0.216816,0.109362
3,educational-num,919.391296,0.17295,0.087236
4,age,558.908875,0.105138,0.053032
5,capital-loss,380.115356,0.071505,0.036067
6,hours-per-week,336.503174,0.063301,0.031929
7,native-country,120.873344,0.022738,0.011469
8,workclass,97.427864,0.018327,0.009244
9,marital-status,43.811687,0.008242,0.004157




## Comparing the results

In [35]:
from sklearn.metrics import f1_score

# Ground-truth labels in their original string form, shared by both evaluations.
y_true = train_data['income']

# clf predicts integer codes; map them back to the class strings via to_class.
sklearn_train_pr = clf.predict(sk_train)
sklearn_train_pr = [to_class[p] for p in sklearn_train_pr]


h2o_train_pr = model.predict(hf_train)

# f1_score is documented as f1_score(y_true, y_pred, ...), so the ground truth
# goes first. Binary F1 is symmetric in the two arguments, so the reported
# numbers do not change, but the documented order stays correct if the metric
# is later swapped for an asymmetric one such as precision or recall.
print('Scikit-learn performance on training data:',
      f1_score(y_true, sklearn_train_pr, pos_label='>50K'))

print('H2O performance on training data:',
      f1_score(y_true, h2o_train_pr.as_data_frame()['predict'], pos_label='>50K'))


drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
Scikit-learn performance on training data: 0.7090796277145812
H2O performance on training data: 0.6996821301884185
