In [11]:
import pandas as pd
import h2o
from h2o.estimators import H2ORandomForestEstimator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

In [1]:
# download the dataset from Kaggle
!kaggle datasets download -d uciml/adult-census-income -p ../Data

Dataset URL: https://www.kaggle.com/datasets/uciml/adult-census-income
License(s): CC0-1.0
Downloading adult-census-income.zip to ../Data
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 21.7MB/s]


In [2]:
# unzip the dataset
!unzip ../Data/adult-census-income.zip -d ../Data

Archive:  ../Data/adult-census-income.zip
  inflating: ../Data/adult.csv       


In [26]:
income = pd.read_csv("../Data/adult.csv")
income

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [27]:
# replacing '?' with NA values
print(income.isnull().sum())
print(income[income == '?'].count())

income.replace('?', pd.NA, inplace = True)

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
income            0
dtype: int64
age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64


In [28]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,13 mins 07 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_unknownUser_p2hamm
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.168 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


In [30]:
# splitting the dataset and converting DataFrames to H2OFrame
train_data, test_data = train_test_split(income, test_size = 0.2)

train_data_hf = h2o.H2OFrame(train_data)
test_data_hf = h2o.H2OFrame(test_data)

X = train_data_hf.columns
y = "income"
X.remove(y)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [31]:
# training the model
model = H2ORandomForestEstimator(ntrees = 1, sample_rate = 1, mtries = -2, max_depth = 10, min_rows = 1, balance_classes = True, min_split_improvement = 1e-5,  nfolds = 5)
model.train(x = X, y = y, training_frame = train_data_hf)

drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,1.0,1.0,4469.0,10.0,10.0,10.0,333.0,333.0,333.0

Unnamed: 0,<=50K,>50K,Error,Rate
<=50K,16480.0,3274.0,0.1657,(3274.0/19754.0)
>50K,1289.0,5005.0,0.2048,(1289.0/6294.0)
Total,17769.0,8279.0,0.1752,(4563.0/26048.0)

metric,threshold,value,idx
max f1,0.3545849,0.6868867,143.0
max f2,0.150403,0.7744332,237.0
max f0point5,0.6285135,0.7140293,64.0
max accuracy,0.5353363,0.8513897,83.0
max precision,0.9053929,0.9255867,10.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9935709,0.0
max absolute_mcc,0.4322128,0.5816042,113.0
max min_per_class_accuracy,0.32391,0.817151,155.0
max mean_per_class_accuracy,0.23663,0.8194811,189.0

group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
1,0.0603885,1.0,3.8044091,3.8044091,0.9192626,1.0,0.9192626,1.0,0.2297426,0.2297426,280.4409128,280.4409128,0.2233135
2,0.1131757,0.7517014,3.1332545,3.4913706,0.7570909,0.7867937,0.8436228,0.9005568,0.1653956,0.3951382,213.3254528,249.1370602,0.3718012
3,0.1500307,0.6297739,2.7719627,3.3146481,0.6697917,0.6977731,0.8009212,0.850743,0.1021608,0.497299,177.1962716,231.4648091,0.4579146
4,0.2088452,0.5264411,2.1368073,2.9829473,0.5163185,0.550518,0.7207721,0.7661943,0.1256752,0.6229743,113.680732,198.2947345,0.5460784
5,0.3033246,0.357284,1.5874791,2.5482875,0.3835839,0.4196169,0.6157448,0.6582426,0.1499841,0.7729584,58.7479133,154.8287521,0.619268
6,0.4065187,0.1745806,1.0238587,2.1613138,0.2473958,0.2628656,0.5222401,0.5578768,0.1056562,0.8786146,2.3858701,116.1313806,0.6225145
7,0.5012669,0.0648113,0.5567248,1.8580186,0.1345219,0.1074263,0.4489546,0.4727338,0.0527486,0.9313632,-44.3275194,85.8018589,0.5671332
8,0.6110258,0.0103592,0.1881815,1.5580648,0.0454704,0.0296958,0.3764765,0.3931506,0.0206546,0.9520178,-81.1818537,55.806481,0.4496385
9,0.7009367,0.0052828,0.0795194,1.368408,0.0192143,0.0071055,0.3306496,0.3436316,0.0071497,0.9591675,-92.0480568,36.8408042,0.3405079
10,0.8254377,0.000772,0.0178661,1.1647055,0.004317,0.0013637,0.2814288,0.2920073,0.0022243,0.9613918,-98.2133942,16.4705532,0.1792717

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8265784,0.0131698,0.8339943,0.8306053,0.8178840,0.8087242,0.8416845
aic,,0.0,,,,,
auc,0.8829654,0.0042683,0.8861682,0.8831471,0.8794089,0.8780554,0.8880474
err,0.1734216,0.0131698,0.1660057,0.1693948,0.1821160,0.1912758,0.1583155
err_count,903.4,68.06835,879.0,890.0,945.0,991.0,812.0
f0point5,0.6394253,0.0205847,0.6513385,0.6386908,0.6224350,0.6173623,0.6673001
f1,0.6877865,0.0059607,0.6910369,0.6926796,0.6793349,0.6836898,0.6921911
f2,0.7450415,0.0183130,0.7358886,0.7566385,0.7476845,0.7659848,0.7190109
lift_top_group,3.8033624,0.1180203,3.8105273,3.8454459,3.6109412,3.817426,3.9324713
loglikelihood,,0.0,,,,,

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_pr_auc,training_lift,training_classification_error
,2024-11-03 20:58:26,3.923 sec,0.0,,,,,,

variable,relative_importance,scaled_importance,percentage
relationship,2781.8457031,1.0,0.4743501
capital.gain,844.5419922,0.3035905,0.1440082
occupation,670.0756226,0.2408745,0.1142588
education,522.8447876,0.1879489,0.0891536
age,330.9204102,0.1189571,0.0564273
hours.per.week,209.1131744,0.0751707,0.0356572
capital.loss,145.1512299,0.052178,0.0247507
native.country,127.5263519,0.0458424,0.0217453
workclass,76.2504044,0.02741,0.0130019
fnlwgt,61.7528648,0.0221985,0.0105299


In [32]:
# predicting on the train and test sets
y_train_pred = model.predict(train_data_hf)
y_train_pred = y_train_pred.as_data_frame()
y_train = train_data['income'].values
y_train_pred = y_train_pred['predict'].values

y_test_pred = model.predict(test_data_hf)
y_test_pred = y_test_pred.as_data_frame()
y_test = test_data['income'].values
y_test_pred = y_test_pred['predict'].values

drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%





drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%





In [33]:
# model evaluatation
f1_train = f1_score(y_train, y_train_pred, average = 'weighted')
f1_test = f1_score(y_test, y_test_pred, average = 'weighted')

print("f1 Score on Training Set:", f1_train)
print("f1 Score on Test Set:", f1_test)

f1 Score on Training Set: 0.8670005971676996
f1 Score on Test Set: 0.8561516789853941
