In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [2]:
df = pd.read_csv('adult.csv')
df.replace('?','Unknown', inplace = True)
df.replace('-','_', regex= True, inplace= True)
df.columns = df.columns.str.replace('.','_', regex = True)
encoder = LabelEncoder()
df.income = encoder.fit_transform(df.income)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,90,Unknown,77053,HS_grad,9,Widowed,Unknown,Not_in_family,White,Female,0,4356,40,United_States,0
1,82,Private,132870,HS_grad,9,Widowed,Exec_managerial,Not_in_family,White,Female,0,4356,18,United_States,0
2,66,Unknown,186061,Some_college,10,Widowed,Unknown,Unmarried,Black,Female,0,4356,40,United_States,0
3,54,Private,140359,7th_8th,4,Divorced,Machine_op_inspct,Unmarried,White,Female,0,3900,40,United_States,0
4,41,Private,264663,Some_college,10,Separated,Prof_specialty,Own_child,White,Female,0,3900,40,United_States,0


In [3]:
X = df.drop('income', axis = 1).copy()
y = df['income'].copy()

In [4]:
cat = [var for var in X.columns if X[var].dtype == 'O']
to_fill = []
for col in X[cat]:
    to_fill.append(X[col].drop_duplicates())
semi_final = pd.DataFrame(to_fill).T
base = X[:1]
ref_df = pd.concat([semi_final, base], axis = 0)
ref_df

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,Unknown,HS_grad,Widowed,Unknown,Not_in_family,White,Female,United_States,,,,,,
1,Private,,,Exec_managerial,,,,,,,,,,
2,,Some_college,,,Unmarried,Black,,,,,,,,
3,,7th_8th,Divorced,Machine_op_inspct,,,,,,,,,,
4,,,Separated,Prof_specialty,Own_child,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3804,,,,,,,,Laos,,,,,,
4460,,,,,,,,Thailand,,,,,,
5581,,,,,,,,Outlying_US(Guam_USVI_etc),,,,,,
8874,Never_worked,,,,,,,,,,,,,


In [93]:
for i in df[cat]:
    print(i, df[i].unique(),'total',df[i].nunique())
    print('***********************************')

workclass ['Unknown' 'Private' 'State_gov' 'Federal_gov' 'Self_emp_not_inc'
 'Self_emp_inc' 'Local_gov' 'Without_pay' 'Never_worked'] total 9
***********************************
education ['HS_grad' 'Some_college' '7th_8th' '10th' 'Doctorate' 'Prof_school'
 'Bachelors' 'Masters' '11th' 'Assoc_acdm' 'Assoc_voc' '1st_4th' '5th_6th'
 '12th' '9th' 'Preschool'] total 16
***********************************
marital_status ['Widowed' 'Divorced' 'Separated' 'Never_married' 'Married_civ_spouse'
 'Married_spouse_absent' 'Married_AF_spouse'] total 7
***********************************
occupation ['Unknown' 'Exec_managerial' 'Machine_op_inspct' 'Prof_specialty'
 'Other_service' 'Adm_clerical' 'Craft_repair' 'Transport_moving'
 'Handlers_cleaners' 'Sales' 'Farming_fishing' 'Tech_support'
 'Protective_serv' 'Armed_Forces' 'Priv_house_serv'] total 15
***********************************
relationship ['Not_in_family' 'Unmarried' 'Own_child' 'Other_relative' 'Husband' 'Wife'] total 6
********************

In [5]:
ref_df.to_csv('ref.csv')

In [6]:
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, stratify=y)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [7]:
# random test
clf = xgb.XGBClassifier(use_label_encoder=False)
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
def runner(model):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    print(cm)
    print(classification_report(y_test, preds))

In [9]:
runner(clf)

[[5804  377]
 [ 677 1283]]
              precision    recall  f1-score   support

           0       0.90      0.94      0.92      6181
           1       0.77      0.65      0.71      1960

    accuracy                           0.87      8141
   macro avg       0.83      0.80      0.81      8141
weighted avg       0.87      0.87      0.87      8141



In [23]:
params = {
    'n_estimators': [100,90],
    'max_depth': [5,6,4],
    'learning_rate' : [0.1,0.2,0.3],
    'colsample_bytree' : [1,0.9,0.8],
    'gamma' : [0,0.25],
    'scale_pos_weight': [25,0]
}

In [24]:
param_search = GridSearchCV( estimator = xgb.XGBClassifier(), param_grid= params, cv = 3, n_jobs = -1,
                            scoring = 'accuracy',verbose = 1)

In [25]:
param_search.fit(X_train, y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits






GridSearchCV(cv=3,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs...
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, va

In [26]:
param_search.best_params_

{'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 5,
 'n_estimators': 100,
 'scale_pos_weight': 0}

In [27]:
clf = xgb.XGBClassifier(use_label_encoder=False, colsample_bytree = 1, gamma= 0, learning_rate = 0.1, max_depth = 5,
                       n_estimators = 100, scale_pos_weight =0)

In [28]:
runner(clf)

[[6181    0]
 [1960    0]]
              precision    recall  f1-score   support

           0       0.76      1.00      0.86      6181
           1       0.00      0.00      0.00      1960

    accuracy                           0.76      8141
   macro avg       0.38      0.50      0.43      8141
weighted avg       0.58      0.76      0.66      8141



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
over = RandomOverSampler()
X_over , y_over = over.fit_sample(X, y)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_over, y_over, random_state = 42, stratify=y_over)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [84]:
clf = xgb.XGBClassifier(use_label_encoder=False, n_estimators = 700, learning_rate = 0.3)

In [85]:
runner(clf)

[[5393  787]
 [ 313 5867]]
              precision    recall  f1-score   support

           0       0.95      0.87      0.91      6180
           1       0.88      0.95      0.91      6180

    accuracy                           0.91     12360
   macro avg       0.91      0.91      0.91     12360
weighted avg       0.91      0.91      0.91     12360



In [14]:
from tensorflow import keras

In [69]:
# creating the model 
ann = keras.models.Sequential([
    keras.layers.Input(shape = X_train.shape[1], name= 'input_layer'),
    keras.layers.Dense(units = 200, activation = 'relu'),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(units = 100, activation = 'relu'),
    keras.layers.Dense(units = 50, activation = 'sigmoid'),
    keras.layers.Dense(units = 25, activation = 'sigmoid'),
    keras.layers.Dense(units = 1, activation= 'sigmoid')
])
ann.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 200)               21800     
_________________________________________________________________
dropout_4 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 100)               20100     
_________________________________________________________________
dense_17 (Dense)             (None, 50)                5050      
_________________________________________________________________
dense_18 (Dense)             (None, 25)                1275      
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 26        
Total params: 48,251
Trainable params: 48,251
Non-trainable params: 0
__________________________________________________

In [70]:
ann.compile(loss= 'binary_crossentropy', metrics = ['accuracy'], optimizer = 'adam')
s_best = keras.callbacks.ModelCheckpoint('best.h5', save_best_only=True)
e_stop = keras.callbacks.EarlyStopping(patience= 15, restore_best_weights=True)

In [71]:
ann.fit(X_train, y_train, validation_split=0.1, epochs = 200, verbose = 1, callbacks=[s_best, e_stop], batch_size = 32)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200


<tensorflow.python.keras.callbacks.History at 0x238ecb554c0>

In [72]:
ann.evaluate(X_test, y_test)



[0.3268395662307739, 0.8653721809387207]

In [73]:
confusion_matrix(y_test,ann.predict_classes(X_test))

array([[5107, 1073],
       [ 591, 5589]], dtype=int64)

In [74]:
print(classification_report(y_test,ann.predict_classes(X_test)))

              precision    recall  f1-score   support

           0       0.90      0.83      0.86      6180
           1       0.84      0.90      0.87      6180

    accuracy                           0.87     12360
   macro avg       0.87      0.87      0.87     12360
weighted avg       0.87      0.87      0.87     12360



In [86]:
import pickle
with open('xgb_adult_income.pkl','wb') as h:
    pickle.dump(clf, h)