In [58]:
import pandas as pd
import numpy as np

In [59]:
import warnings
warnings.filterwarnings('ignore')

Adult income dataset http://archive.ics.uci.edu/ml/datasets/Adult <br>
Data https://www.kaggle.com/uciml/adult-census-income or 'data/adult.csv'<br>
Missing attribute values denoted by " ?"

In [60]:
df = pd.read_csv('data/adult.csv')

In [61]:
df = df.replace('?', np.nan)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      30718 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  31978 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [62]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


### Impute missing values

In [63]:
columns = ['workclass','occupation', 'native.country']

In [64]:
print(df['workclass'].mode())
print(df['occupation'].mode())
print(df['native.country'].mode())

0    Private
dtype: object
0    Prof-specialty
dtype: object
0    United-States
dtype: object


In [65]:
for i in columns:
    m = df[i].mode()[0]
    df[i] = df[i].replace(np.nan, m)

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


### Check classes proportions

In [67]:
np.unique(df['income'], return_counts=True)

(array(['<=50K', '>50K'], dtype=object), array([24720,  7841], dtype=int64))

In [68]:
24720/32561

0.7591904425539756

In [69]:
# Classes are unbalanced: 76% and 24%

### Data preprocessing

In [70]:
from sklearn import preprocessing
enc = preprocessing.OneHotEncoder(dtype = 'int32')

In [71]:
label_columns = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country']

In [72]:
df_label_columns = df[label_columns]

In [73]:
enc.fit(df_label_columns)
enc.categories_

[array(['Federal-gov', 'Local-gov', 'Never-worked', 'Private',
        'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'],
       dtype=object),
 array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
        'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
        'Masters', 'Preschool', 'Prof-school', 'Some-college'],
       dtype=object),
 array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
       dtype=object),
 array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
        'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
        'Other-service', 'Priv-house-serv', 'Prof-specialty',
        'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving'],
       dtype=object),
 array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
        'Unmarried', 'Wife'], dtype=object),
 array(['Amer-Indian-Eskimo', 'Asian-Pa

In [74]:
enc_label_columns = enc.transform(df_label_columns).toarray()

In [75]:
enc_label_columns.shape

(32561, 99)

In [76]:
all_columns = df.columns
all_columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income'],
      dtype='object')

In [77]:
int_columns = list(set(all_columns) - set(label_columns))
int_columns

['income',
 'age',
 'education.num',
 'capital.loss',
 'hours.per.week',
 'fnlwgt',
 'capital.gain']

In [78]:
X = df[int_columns].drop('income', axis = 1).values

In [79]:
X = np.concatenate((X, enc_label_columns), axis = 1)

In [80]:
print(X.shape)

(32561, 105)


In [81]:
y = df['income'].values
print(y.shape)

(32561,)


### Splitting data into train and test

In [82]:
Rand = 1

In [83]:
from sklearn.model_selection import train_test_split
# Split to train and test
# 75% and 25% by default
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, random_state=Rand)
print(Xtrain.shape, Xtest.shape)

(24420, 105) (8141, 105)


In [84]:
print(np.unique(ytrain, return_counts = True))
print(np.unique(ytest, return_counts = True))

(array(['<=50K', '>50K'], dtype=object), array([18539,  5881], dtype=int64))
(array(['<=50K', '>50K'], dtype=object), array([6181, 1960], dtype=int64))


### Modeling

In [85]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Let's our target metric be f1_macro, as it's often used for imbalanced classes

### RF with default hyperparameters

In [86]:
rfc = RandomForestClassifier(random_state = Rand, n_jobs = -1) # n_jobs=-1 means using all processors
rfc.fit(Xtrain, ytrain)

RandomForestClassifier(n_jobs=-1, random_state=1)

In [87]:
ytrain_pred = rfc.predict(Xtrain)
print(confusion_matrix(ytrain, ytrain_pred))

target_names = ['<=50K', '>50K']
print(classification_report(ytrain, ytrain_pred, target_names=target_names))

[[18539     0]
 [    1  5880]]
              precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00     18539
        >50K       1.00      1.00      1.00      5881

    accuracy                           1.00     24420
   macro avg       1.00      1.00      1.00     24420
weighted avg       1.00      1.00      1.00     24420



In [88]:
ytest_pred = rfc.predict(Xtest)
print(confusion_matrix(ytest, ytest_pred))

target_names = ['<=50K', '>50K']
print(classification_report(ytest, ytest_pred, target_names=target_names))

[[5719  462]
 [ 705 1255]]
              precision    recall  f1-score   support

       <=50K       0.89      0.93      0.91      6181
        >50K       0.73      0.64      0.68      1960

    accuracy                           0.86      8141
   macro avg       0.81      0.78      0.80      8141
weighted avg       0.85      0.86      0.85      8141



In [100]:
# We observe overfitting. Let's try gridsearch for 2 hyperparameters: 
# max_depth (to deal with ovefitting) and class_weight (to balance classes)

### Grid search for class_weight value 

In [101]:
param_grid = {'class_weight':[None, 'balanced', 'balanced_subsample'], 'max_depth':[20, 30, 40]}
grid = GridSearchCV(RandomForestClassifier(random_state = Rand, n_jobs=-1), 
                    param_grid, scoring='f1_macro', cv=5) 

In [102]:
grid.fit(Xtrain, ytrain)

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=1),
             param_grid={'class_weight': [None, 'balanced',
                                          'balanced_subsample'],
                         'max_depth': [20, 30, 40]},
             scoring='f1_macro')

In [103]:
print(grid.best_score_)
print(grid.best_params_)

0.7976626781342532
{'class_weight': 'balanced_subsample', 'max_depth': 30}


In [104]:
best_rfc = grid.best_estimator_

In [105]:
ytest_pred = best_rfc.predict(Xtest)
print(confusion_matrix(ytest, ytest_pred))
print(classification_report(ytest, ytest_pred))

[[5487  694]
 [ 555 1405]]
              precision    recall  f1-score   support

       <=50K       0.91      0.89      0.90      6181
        >50K       0.67      0.72      0.69      1960

    accuracy                           0.85      8141
   macro avg       0.79      0.80      0.80      8141
weighted avg       0.85      0.85      0.85      8141



In [99]:
# Conclusion: with respect to the chosen metric, the obtained models are equivalent.
# One can try LogisticRegression and GradientBoosting