# Model 8.1: Neural Network (sklearn)

## Import the libraries and cleaning of the dataset

In [3]:
### Importing the libraries
import numpy as np
import pandas as pd


#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config


from sklearn.neural_network import MLPClassifier


#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn. preprocessing import StandardScaler


pd.set_option("display.max_columns",None)

In [4]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
train_df.head()

Unnamed: 0,id,occ_code_level2,age,stock_dividends,mig_chg_msa,tax_filer_stat,det_hh_summ,mig_prev_sunbelt,hisp_origin,education,wage_per_hour,capital_losses,vet_question,own_or_self,country_self,mig_move_reg,high_income,hs_college,class_worker,mig_same,unemp_reason,state_prev_res,ind_code_level2,race,country_mother,capital_gains,sex,ind_code_level1,citizenship,union_member,fam_under_18,marital_stat,region_prev_res,mig_chg_reg,country_father,occ_code_level1,full_or_part_emp,weeks_worked,det_hh_fam_stat,num_emp,vet_benefits
0,1,0,42.0,0.0,,Nonfiler,Householder,,All other,11th grade,0.0,,Not in universe,0,United-States,,0,Not in universe,Not in universe,Not in universe under 1 year old,Not in universe,Not in universe,0,Black,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Not in universe,,United-States,Not in universe,Not in labor force,0.0,Householder,0,2
1,2,18,56.0,,,,Householder,,All other,High school graduate,0.0,,Not in universe,2,United-States,,1,Not in universe,Self-employed-incorporated,Not in universe under 1 year old,,Not in universe,32,,United-States,,Male,Wholesale trade,Native- Born in the United States,Not in universe,Not in universe,Married-civilian spouse present,Not in universe,,United-States,Sales,Full-time schedules,,Householder,1,2
2,3,26,26.0,,,Joint both under 65,Householder,,All other,High school graduate,0.0,,Not in universe,0,Haiti,,0,Not in universe,Private,Not in universe under 1 year old,,Not in universe,41,,Haiti,,,Hospital services,Foreign born- Not a citizen of U S,Not in universe,Not in universe,,Not in universe,,Haiti,Adm support including clerical,Full-time schedules,,Householder,3,2
3,4,0,67.0,,MSA to MSA,Joint one under 65 & one 65+,Householder,No,All other,,0.0,,No,0,United-States,Same county,0,Not in universe,Not in universe,No,,North Carolina,0,Black,United-States,0.0,,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Northeast,Same county,United-States,Not in universe,Children or Armed Forces,0.0,Householder,0,1
4,5,0,,,Nonmover,Nonfiler,Child under 18 never married,Not in universe,All other,Children,0.0,0.0,Not in universe,0,United-States,Nonmover,0,Not in universe,Not in universe,Yes,Not in universe,Not in universe,0,White,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Mother only present,,Not in universe,Nonmover,United-States,Not in universe,Children or Armed Forces,,Child <18 never marr not in subfamily,0,0


In [5]:
#removing id for train_df
train_df.drop("id", axis = 1, inplace = True)

In [6]:
# Transforming object into categories 
# for train
for i in train_df.columns:
    if train_df[i].dtypes == "object":
        train_df[i] = train_df[i].astype("category")
# For test
for i in test_df.columns:
    if test_df[i].dtypes == "object":
        test_df[i] = test_df[i].astype("category")

In [7]:
X= train_df.drop("high_income", axis = 1)
y = train_df["high_income"]

In [8]:
# Grouping our features that are categories in one vector
# Same for numeric

categorical_features = [i for i in X.columns if X[i].dtype.name == "category"]
numerical_features = [i for i in X.columns if X[i].dtype.name != "category"]

In [9]:
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", MLPClassifier(random_state=1))]
)


# score = cross_val_score(clf, X, y, cv=KFold(n_splits=5, shuffle=True, random_state=1),scoring = "accuracy",n_jobs = -1, verbose=2).mean()
# print(score)

In [9]:
# Max. 10 layers
# 5-20 neurons
# Best was (3, 6) - (8, 2) - (3, 2) with around 0.856
# (3, 6, 4) with 0.857
# (3, 2, 3) with 0.8576
# (4, 3, 3) with 0.858
# Try: 2 to 8 layers, 2 to 6 neurons each layer

In [14]:
# Create the list of all combinations of layers/neurons for the GridSearch
# 
list_combinations_layers_2 = []
list_combinations_layers_3 = []

# 2 layers
for i in np.arange(2, 10):
    for j in np.arange(2, 10):
        list_combinations_layers_2.append((i, j))

# 3 layers
for i in np.arange(3, 5):
    for j in np.arange(2, 5):
        for k in np.arange(2, 5):
            list_combinations_layers_3.append((i, j, k))        
        
display(list_combinations_layers_2)
display(list_combinations_layers_3)

[(2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (4, 2),
 (4, 3),
 (4, 4),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (5, 2),
 (5, 3),
 (5, 4),
 (5, 5),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (6, 2),
 (6, 3),
 (6, 4),
 (6, 5),
 (6, 6),
 (6, 7),
 (6, 8),
 (6, 9),
 (7, 2),
 (7, 3),
 (7, 4),
 (7, 5),
 (7, 6),
 (7, 7),
 (7, 8),
 (7, 9),
 (8, 2),
 (8, 3),
 (8, 4),
 (8, 5),
 (8, 6),
 (8, 7),
 (8, 8),
 (8, 9),
 (9, 2),
 (9, 3),
 (9, 4),
 (9, 5),
 (9, 6),
 (9, 7),
 (9, 8),
 (9, 9)]

[(3, 3, 3),
 (3, 3, 4),
 (3, 3, 5),
 (3, 3, 6),
 (3, 4, 3),
 (3, 4, 4),
 (3, 4, 5),
 (3, 4, 6),
 (3, 5, 3),
 (3, 5, 4),
 (3, 5, 5),
 (3, 5, 6),
 (3, 6, 3),
 (3, 6, 4),
 (3, 6, 5),
 (3, 6, 6),
 (4, 3, 3),
 (4, 3, 4),
 (4, 3, 5),
 (4, 3, 6),
 (4, 4, 3),
 (4, 4, 4),
 (4, 4, 5),
 (4, 4, 6),
 (4, 5, 3),
 (4, 5, 4),
 (4, 5, 5),
 (4, 5, 6),
 (4, 6, 3),
 (4, 6, 4),
 (4, 6, 5),
 (4, 6, 6),
 (5, 3, 3),
 (5, 3, 4),
 (5, 3, 5),
 (5, 3, 6),
 (5, 4, 3),
 (5, 4, 4),
 (5, 4, 5),
 (5, 4, 6),
 (5, 5, 3),
 (5, 5, 4),
 (5, 5, 5),
 (5, 5, 6),
 (5, 6, 3),
 (5, 6, 4),
 (5, 6, 5),
 (5, 6, 6),
 (6, 3, 3),
 (6, 3, 4),
 (6, 3, 5),
 (6, 3, 6),
 (6, 4, 3),
 (6, 4, 4),
 (6, 4, 5),
 (6, 4, 6),
 (6, 5, 3),
 (6, 5, 4),
 (6, 5, 5),
 (6, 5, 6),
 (6, 6, 3),
 (6, 6, 4),
 (6, 6, 5),
 (6, 6, 6)]

## For 2 Layers Only: First Try

In [11]:
hyper_param_2 = {
    "classifier__hidden_layer_sizes": list_combinations_layers_2, 
              "classifier__activation": ['relu'],
              "classifier__solver": ['adam'],
              "classifier__alpha": [0.07],
              "classifier__learning_rate": ['adaptive'],
              "classifier__max_iter": [500],
}

hyper_param_2

{'classifier__hidden_layer_sizes': [(2, 2),
  (2, 3),
  (2, 4),
  (2, 5),
  (2, 6),
  (2, 7),
  (2, 8),
  (2, 9),
  (3, 2),
  (3, 3),
  (3, 4),
  (3, 5),
  (3, 6),
  (3, 7),
  (3, 8),
  (3, 9),
  (4, 2),
  (4, 3),
  (4, 4),
  (4, 5),
  (4, 6),
  (4, 7),
  (4, 8),
  (4, 9),
  (5, 2),
  (5, 3),
  (5, 4),
  (5, 5),
  (5, 6),
  (5, 7),
  (5, 8),
  (5, 9),
  (6, 2),
  (6, 3),
  (6, 4),
  (6, 5),
  (6, 6),
  (6, 7),
  (6, 8),
  (6, 9),
  (7, 2),
  (7, 3),
  (7, 4),
  (7, 5),
  (7, 6),
  (7, 7),
  (7, 8),
  (7, 9),
  (8, 2),
  (8, 3),
  (8, 4),
  (8, 5),
  (8, 6),
  (8, 7),
  (8, 8),
  (8, 9),
  (9, 2),
  (9, 3),
  (9, 4),
  (9, 5),
  (9, 6),
  (9, 7),
  (9, 8),
  (9, 9)],
 'classifier__activation': ['relu'],
 'classifier__solver': ['adam'],
 'classifier__alpha': [0.05, 0.07],
 'classifier__learning_rate': ['adaptive'],
 'classifier__max_iter': [500]}

In [12]:
nn_grid_search_2_cv = GridSearchCV(estimator = clf, param_grid=hyper_param_2, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [13]:
nn_grid_search_2_cv.fit(X, y)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer_num',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['occ_code_level2',
                                                                          'age',
                                                                          'stock_dividends',
                                                                          'wage_per_hour',
                 

In [25]:
cv_res_2 = pd.DataFrame(nn_grid_search_2_cv.cv_results_)
pd.set_option("display.max_rows", None)
cv_res_2.sort_values(by="mean_test_score", ascending = False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
124,23.356496,4.37946,0.114901,0.036194,relu,0.07,"(9, 6)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.864311,0.862388,0.860361,0.861093,0.857431,0.861117,0.002279,1
115,24.245143,2.764926,0.089091,0.007372,relu,0.07,"(8, 5)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.863853,0.865318,0.856973,0.857797,0.860452,0.860879,0.003271,2
85,14.458806,2.865559,0.087635,0.008237,relu,0.07,"(4, 7)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.863395,0.862113,0.858346,0.860635,0.858804,0.860659,0.001918,3
117,24.330876,6.371064,0.089167,0.009662,relu,0.07,"(8, 7)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862296,0.864402,0.858713,0.858346,0.85917,0.860586,0.002367,4
98,19.456142,6.487702,0.082803,0.00563,relu,0.07,"(6, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.861655,0.862205,0.859995,0.858987,0.859811,0.860531,0.001205,5
51,24.36964,3.331783,0.098046,0.01352,relu,0.05,"(8, 5)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862022,0.864585,0.857522,0.859537,0.858163,0.860366,0.002614,6
112,18.753976,4.657214,0.08755,0.009357,relu,0.07,"(8, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.86193,0.861289,0.858713,0.858987,0.860361,0.860256,0.001255,7
90,19.502033,5.313578,0.08853,0.005282,relu,0.07,"(5, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862754,0.863212,0.855691,0.858529,0.861093,0.860256,0.002811,8
120,23.323514,6.193535,0.091468,0.013306,relu,0.07,"(9, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.861014,0.865318,0.857248,0.859262,0.858438,0.860256,0.002812,9
42,24.981552,2.707485,0.088823,0.014187,relu,0.05,"(7, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.865409,0.862205,0.857888,0.85679,0.858896,0.860238,0.003157,10


In [18]:
#nn_grid_search_2_cv.best_params_

In [32]:
mean_scores = nn_grid_search_2_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = nn_grid_search_2_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res_2[cv_res_2["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
123,20.093515,4.633449,0.091102,0.00801,relu,0.07,"(9, 5)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862022,0.864585,0.858621,0.85972,0.855599,0.860109,0.003046,15


## For 2 Layers Only: Further Tuning

In [23]:
list_combinations_layers_2_bis = [(9,6), (4,7), (8,7), (8,5), (5,4), (9,2), (7,4), (5,9), (9,5), (8,2), (6,4), (8,9), (4,4), (7,5)]
list_combinations_layers_2_bis

[(9, 6),
 (4, 7),
 (8, 7),
 (8, 5),
 (5, 4),
 (9, 2),
 (7, 4),
 (5, 9),
 (9, 5),
 (8, 2),
 (6, 4),
 (8, 9),
 (4, 4),
 (7, 5)]

In [24]:
hyper_param_2_bis = {
    "classifier__hidden_layer_sizes": list_combinations_layers_2, 
              "classifier__activation": ['relu'],
              "classifier__solver": ['adam', 'sgd'],
              "classifier__alpha": [0.05, 0.07, 0.1, 0.15],
              "classifier__learning_rate": ['adaptive'],
              "classifier__max_iter": [500],
}

hyper_param_2_bis

{'classifier__hidden_layer_sizes': [(9, 6),
  (4, 7),
  (8, 7),
  (8, 5),
  (5, 4),
  (9, 2),
  (7, 4),
  (5, 9),
  (9, 5),
  (8, 2),
  (6, 4),
  (8, 9),
  (4, 4),
  (7, 5)],
 'classifier__activation': ['relu'],
 'classifier__solver': ['adam', 'sgd'],
 'classifier__alpha': [0.05, 0.07, 0.1, 0.15],
 'classifier__learning_rate': ['adaptive'],
 'classifier__max_iter': [500]}

In [25]:
nn_grid_search_2_bis_cv = GridSearchCV(estimator = clf, param_grid=hyper_param_2_bis, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [26]:
nn_grid_search_2_bis_cv.fit(X, y)

Fitting 5 folds for each of 112 candidates, totalling 560 fits




GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer_num',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['occ_code_level2',
                                                                          'age',
                                                                          'stock_dividends',
                                                                          'wage_per_hour',
                 

In [27]:
cv_res_2_bis = pd.DataFrame(nn_grid_search_2_bis_cv.cv_results_)
pd.set_option("display.max_rows", None)
cv_res_2_bis.sort_values(by="mean_test_score", ascending = False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
75,48.224739,10.300063,0.089994,0.008707,relu,0.1,"(8, 2)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.864494,0.864585,0.862558,0.859354,0.861185,0.862435,0.001997,1
7,43.324375,3.821641,0.094835,0.015218,relu,0.05,"(8, 5)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.86193,0.863578,0.861551,0.862009,0.861917,0.862197,0.000708,2
35,45.631303,4.516217,0.090557,0.004387,relu,0.07,"(8, 5)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.862571,0.863761,0.86146,0.861368,0.861643,0.86216,0.000908,3
47,57.769066,9.106674,0.094443,0.006186,relu,0.07,"(8, 2)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.863029,0.863578,0.862192,0.859628,0.862009,0.862087,0.001354,4
67,61.525934,8.427042,0.08684,0.010787,relu,0.1,"(9, 2)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.865043,0.862846,0.860727,0.86146,0.859811,0.861977,0.001826,5
91,45.414496,4.205337,0.087752,0.009657,relu,0.15,"(8, 5)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.862571,0.86367,0.860819,0.861276,0.861368,0.861941,0.001041,6
11,58.092817,6.151161,0.088214,0.00699,relu,0.05,"(9, 2)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.864311,0.863487,0.861734,0.862009,0.858163,0.861941,0.002113,7
63,44.565576,4.401295,0.098359,0.006002,relu,0.1,"(8, 5)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.862479,0.86312,0.861185,0.861093,0.861368,0.861849,0.000807,8
19,55.821003,7.258525,0.084959,0.004683,relu,0.05,"(8, 2)",adaptive,500,sgd,"{'classifier__activation': 'relu', 'classifier...",0.862479,0.863578,0.862284,0.859995,0.860727,0.861813,0.001286,9
88,14.232385,3.84924,0.083856,0.005873,relu,0.15,"(8, 7)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.863761,0.859733,0.861643,0.86146,0.862375,0.861794,0.001311,10


In [28]:
mean_scores = nn_grid_search_2_bis_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = nn_grid_search_2_bis_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res_2_bis[cv_res_2_bis["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
84,23.896105,3.810573,0.095188,0.018649,relu,0.15,"(9, 6)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.867973,0.862388,0.857888,0.860727,0.858896,0.861574,0.003552,16


## For 2 Layers Only: Last Tuning

In [36]:
list_combinations_layers_2_ter = [(9,6), (4,7), (8,7), (8,5), (5,4), (9,2), (7,4), (5,9), (9,5), (8,2), (6,4), (8,9), (4,4), (7,5)]
list_combinations_layers_2_ter

[(9, 6),
 (4, 7),
 (8, 7),
 (8, 5),
 (5, 4),
 (9, 2),
 (7, 4),
 (5, 9),
 (9, 5),
 (8, 2),
 (6, 4),
 (8, 9),
 (4, 4),
 (7, 5)]

In [37]:
hyper_param_2_ter = {
    "classifier__hidden_layer_sizes": list_combinations_layers_2, 
              "classifier__activation": ['relu'],
              "classifier__solver": ['adam'],
              "classifier__alpha": [0.15, 0.2],
              "classifier__learning_rate": ['adaptive'],
              "classifier__max_iter": [700],
}

hyper_param_2_ter

{'classifier__hidden_layer_sizes': [(9, 6),
  (4, 7),
  (8, 7),
  (8, 5),
  (5, 4),
  (9, 2),
  (7, 4),
  (5, 9),
  (9, 5),
  (8, 2),
  (6, 4),
  (8, 9),
  (4, 4),
  (7, 5)],
 'classifier__activation': ['relu'],
 'classifier__solver': ['adam'],
 'classifier__alpha': [0.15, 0.2],
 'classifier__learning_rate': ['adaptive'],
 'classifier__max_iter': [700]}

In [38]:
nn_grid_search_2_ter_cv = GridSearchCV(estimator = clf, param_grid=hyper_param_2_ter, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [39]:
nn_grid_search_2_ter_cv.fit(X, y)

Fitting 5 folds for each of 28 candidates, totalling 140 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer_num',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['occ_code_level2',
                                                                          'age',
                                                                          'stock_dividends',
                                                                          'wage_per_hour',
                 

In [40]:
cv_res_2_ter = pd.DataFrame(nn_grid_search_2_ter_cv.cv_results_)
pd.set_option("display.max_rows", None)
cv_res_2_ter.sort_values(by="mean_test_score", ascending = False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,13.922922,2.955729,0.101816,0.019364,relu,0.15,"(8, 7)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.863761,0.859733,0.861643,0.86146,0.862375,0.861794,0.001311,1
0,21.098867,2.534089,0.110552,0.008046,relu,0.15,"(9, 6)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.867973,0.862388,0.857888,0.860727,0.858896,0.861574,0.003552,2
16,15.431111,4.730634,0.093619,0.007955,relu,0.2,"(8, 7)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.865501,0.859641,0.861185,0.859537,0.86146,0.861465,0.002164,3
9,18.53976,3.991506,0.087995,0.004409,relu,0.15,"(8, 2)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.863944,0.864494,0.858346,0.859079,0.86091,0.861355,0.002489,4
17,21.479438,9.34453,0.087648,0.009255,relu,0.2,"(8, 5)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.864127,0.864219,0.860544,0.858438,0.858621,0.86119,0.002545,5
13,15.713836,5.900339,0.086524,0.007641,relu,0.15,"(7, 5)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.863212,0.863395,0.859903,0.859537,0.85972,0.861153,0.00176,6
20,16.615588,2.856402,0.084787,0.004108,relu,0.2,"(7, 4)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.864127,0.863395,0.857705,0.859354,0.860727,0.861062,0.002414,7
11,14.797997,2.967387,0.093971,0.01384,relu,0.15,"(8, 9)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.86367,0.864768,0.860544,0.857888,0.858163,0.861007,0.002802,8
14,20.920825,3.717518,0.093516,0.010958,relu,0.2,"(9, 6)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.864768,0.860282,0.85917,0.860086,0.860452,0.860952,0.001959,9
19,19.339567,4.108686,0.092109,0.008733,relu,0.2,"(9, 2)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.865226,0.860099,0.858896,0.860544,0.859903,0.860934,0.002213,10


In [41]:
mean_scores = nn_grid_search_2_ter_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = nn_grid_search_2_ter_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res_2_ter[cv_res_2_ter["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,18.53976,3.991506,0.087995,0.004409,relu,0.15,"(8, 2)",adaptive,700,adam,"{'classifier__activation': 'relu', 'classifier...",0.863944,0.864494,0.858346,0.859079,0.86091,0.861355,0.002489,4


## For 3 Layers Only: First Try

In [191]:
hyper_param_3 = {
    "classifier__hidden_layer_sizes": list_combinations_layers_3, 
              "classifier__activation": ['relu'],
              "classifier__solver": ['adam'],
              "classifier__alpha": [0.05, 0.07],
              "classifier__learning_rate": ['adaptive'],
              "classifier__max_iter": [500],
}

hyper_param_3

{'classifier__hidden_layer_sizes': [(2, 2, 2),
  (2, 2, 3),
  (2, 2, 4),
  (2, 3, 2),
  (2, 3, 3),
  (2, 3, 4),
  (2, 4, 2),
  (2, 4, 3),
  (2, 4, 4),
  (3, 2, 2),
  (3, 2, 3),
  (3, 2, 4),
  (3, 3, 2),
  (3, 3, 3),
  (3, 3, 4),
  (3, 4, 2),
  (3, 4, 3),
  (3, 4, 4),
  (4, 2, 2),
  (4, 2, 3),
  (4, 2, 4),
  (4, 3, 2),
  (4, 3, 3),
  (4, 3, 4),
  (4, 4, 2),
  (4, 4, 3),
  (4, 4, 4)],
 'classifier__activation': ['relu'],
 'classifier__solver': ['adam'],
 'classifier__alpha': [0.05, 0.07],
 'classifier__learning_rate': ['adaptive'],
 'classifier__max_iter': [500]}

In [192]:
nn_grid_search_3_cv = GridSearchCV(estimator = clf, param_grid=hyper_param_3, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [193]:
nn_grid_search_3_cv.fit(X, y)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer_num',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['occ_code_level2',
                                                                          'age',
                                                                          'stock_dividends',
                                                                          'wage_per_hour',
                 

In [194]:
cv_res_3 = pd.DataFrame(nn_grid_search_3_cv.cv_results_)
cv_res_3

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,10.128447,4.324626,0.082921,0.007652,relu,0.05,"(2, 2, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.860007,0.857169,0.85679,0.859262,0.696456,0.825937,0.064752,45
1,16.825178,2.596659,0.083624,0.003457,relu,0.05,"(2, 2, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.860282,0.857627,0.854409,0.857248,0.857797,0.857472,0.001869,31
2,13.038758,5.210331,0.081968,0.001906,relu,0.05,"(2, 2, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.860557,0.85955,0.852211,0.858529,0.696456,0.825461,0.064568,48
3,16.452147,1.647914,0.08725,0.007125,relu,0.05,"(2, 3, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862846,0.859183,0.855691,0.857431,0.854226,0.857875,0.00299,24
4,11.005495,4.174497,0.082096,0.005611,relu,0.05,"(2, 3, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859092,0.857535,0.856973,0.858804,0.696456,0.825772,0.064663,46
5,9.694295,1.492008,0.090998,0.009761,relu,0.05,"(2, 3, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859916,0.859366,0.857522,0.858346,0.856607,0.858351,0.001201,15
6,9.889145,11.219328,0.086501,0.008817,relu,0.05,"(2, 4, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.689251,0.689526,0.696548,0.690871,0.857156,0.72467,0.066295,51
7,11.690574,5.140968,0.08034,0.006791,relu,0.05,"(2, 4, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.858359,0.857627,0.855691,0.858713,0.855233,0.857125,0.001409,36
8,11.40309,2.586638,0.087042,0.005475,relu,0.05,"(2, 4, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859183,0.856437,0.85624,0.858255,0.855141,0.857051,0.001461,37
9,17.792281,1.578882,0.083475,0.004781,relu,0.05,"(3, 2, 2)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.861564,0.858176,0.854043,0.856881,0.856332,0.857399,0.002475,32


In [195]:
nn_grid_search_3_cv.best_params_

{'classifier__activation': 'relu',
 'classifier__alpha': 0.05,
 'classifier__hidden_layer_sizes': (4, 3, 2),
 'classifier__learning_rate': 'adaptive',
 'classifier__max_iter': 500,
 'classifier__solver': 'adam'}

In [196]:
mean_scores = nn_grid_search_3_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = nn_grid_search_3_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[cv_res["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate_init,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,14.882455,3.683285,0.080744,0.005577,relu,0.05,"(4, 4)",0.001,300,adam,"{'classifier__activation': 'relu', 'classifier...",0.862754,0.861747,0.857156,0.860727,0.857248,0.859926,0.002315,2


## For 3 Layers Only: Further Tuning

In [17]:
list_combinations_layers_3_bis = []
for i in np.arange(3, 7):
    for j in np.arange(3, 7):
        for k in np.arange(3, 7):
            list_combinations_layers_3_bis.append((i, j, k))        

In [18]:
hyper_param_3_bis = {
    "classifier__hidden_layer_sizes": list_combinations_layers_3_bis, 
              "classifier__activation": ['relu'],
              "classifier__solver": ['adam'],
              "classifier__alpha": [0.05],
              "classifier__learning_rate": ['adaptive'],
              "classifier__max_iter": [500],
}

hyper_param_3_bis

{'classifier__hidden_layer_sizes': [(3, 3, 3),
  (3, 3, 4),
  (3, 3, 5),
  (3, 3, 6),
  (3, 4, 3),
  (3, 4, 4),
  (3, 4, 5),
  (3, 4, 6),
  (3, 5, 3),
  (3, 5, 4),
  (3, 5, 5),
  (3, 5, 6),
  (3, 6, 3),
  (3, 6, 4),
  (3, 6, 5),
  (3, 6, 6),
  (4, 3, 3),
  (4, 3, 4),
  (4, 3, 5),
  (4, 3, 6),
  (4, 4, 3),
  (4, 4, 4),
  (4, 4, 5),
  (4, 4, 6),
  (4, 5, 3),
  (4, 5, 4),
  (4, 5, 5),
  (4, 5, 6),
  (4, 6, 3),
  (4, 6, 4),
  (4, 6, 5),
  (4, 6, 6),
  (5, 3, 3),
  (5, 3, 4),
  (5, 3, 5),
  (5, 3, 6),
  (5, 4, 3),
  (5, 4, 4),
  (5, 4, 5),
  (5, 4, 6),
  (5, 5, 3),
  (5, 5, 4),
  (5, 5, 5),
  (5, 5, 6),
  (5, 6, 3),
  (5, 6, 4),
  (5, 6, 5),
  (5, 6, 6),
  (6, 3, 3),
  (6, 3, 4),
  (6, 3, 5),
  (6, 3, 6),
  (6, 4, 3),
  (6, 4, 4),
  (6, 4, 5),
  (6, 4, 6),
  (6, 5, 3),
  (6, 5, 4),
  (6, 5, 5),
  (6, 5, 6),
  (6, 6, 3),
  (6, 6, 4),
  (6, 6, 5),
  (6, 6, 6)],
 'classifier__activation': ['relu'],
 'classifier__solver': ['adam'],
 'classifier__alpha': [0.05],
 'classifier__learning_rate': ['a

In [23]:
nn_grid_search_3_bis_cv = GridSearchCV(estimator = clf, param_grid=hyper_param_3_bis, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [24]:
nn_grid_search_3_bis_cv.fit(X, y)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer_num',
                                                                                          SimpleImputer()),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['occ_code_level2',
                                                                          'age',
                                                                          'stock_dividends',
                                                                          'wage_per_hour',
                 

In [25]:
cv_res_3_bis = pd.DataFrame(nn_grid_search_3_bis_cv.cv_results_)
cv_res_3_bis

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,24.136612,12.802592,0.096552,0.003779,relu,0.05,"(3, 3, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.689251,0.860007,0.855508,0.859903,0.857522,0.824438,0.067614,63
1,25.422111,5.761657,0.122802,0.040248,relu,0.05,"(3, 3, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859092,0.859733,0.855508,0.856423,0.858163,0.857784,0.001593,51
2,22.257417,4.245060,0.117979,0.028292,relu,0.05,"(3, 3, 5)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.861747,0.858268,0.855325,0.855325,0.855691,0.857271,0.002494,56
3,24.826344,8.082648,0.108971,0.026733,relu,0.05,"(3, 3, 6)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859366,0.859916,0.851387,0.857064,0.857431,0.857033,0.003026,59
4,26.033522,3.349086,0.117198,0.035801,relu,0.05,"(3, 4, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.859733,0.861747,0.853768,0.857248,0.854226,0.857344,0.003086,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59,19.768195,5.414982,0.092075,0.013535,relu,0.05,"(6, 5, 6)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.861472,0.860465,0.856881,0.859720,0.858896,0.859487,0.001555,11
60,24.913933,4.022864,0.078287,0.001567,relu,0.05,"(6, 6, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.862937,0.859275,0.857156,0.857064,0.859537,0.859194,0.002137,15
61,26.132830,3.378800,0.105675,0.019195,relu,0.05,"(6, 6, 4)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.860007,0.859366,0.857339,0.853860,0.857156,0.857546,0.002152,53
62,27.450243,4.270590,0.077526,0.004023,relu,0.05,"(6, 6, 5)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.860557,0.860099,0.859720,0.859445,0.858804,0.859725,0.000593,5


In [26]:
mean_scores = nn_grid_search_3_bis_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = nn_grid_search_3_bis_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res_3_bis[cv_res_3_bis["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__activation,param_classifier__alpha,param_classifier__hidden_layer_sizes,param_classifier__learning_rate,param_classifier__max_iter,param_classifier__solver,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
36,27.282106,5.024387,0.133753,0.032768,relu,0.05,"(5, 4, 3)",adaptive,500,adam,"{'classifier__activation': 'relu', 'classifier...",0.863212,0.861655,0.857064,0.859537,0.855691,0.859432,0.002787,13


## Fit the best model & export the predictions

In [10]:
clf.set_params(classifier__activation= 'relu', classifier__alpha = 0.15, classifier__hidden_layer_sizes = (8,2), classifier__solver = 'adam', classifier__max_iter = 700, classifier__learning_rate = 'adaptive')

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['occ_code_level2', 'age',
                                                   'stock_dividends',
                                                   'wage_per_hour',
                                                   'capital_losses',
                                                   'own_or_self',
                                                   'ind_code_level2',
                                                   'capital_gains',
                                                   'weeks_worked', 'num_emp',
            

In [11]:
clf.fit(X,y)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer_num',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['occ_code_level2', 'age',
                                                   'stock_dividends',
                                                   'wage_per_hour',
                                                   'capital_losses',
                                                   'own_or_self',
                                                   'ind_code_level2',
                                                   'capital_gains',
                                                   'weeks_worked', 'num_emp',
            

In [12]:
test_id = test_df[["id"]]
test_df.drop("id",axis = 1, inplace = True)

y_test_pred = clf.predict(test_df)

In [13]:
y_test_pred[0:5]

array([0, 0, 0, 1, 0])

In [14]:
nn_8_2 = test_id
nn_8_2

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5
...,...
6063,6064
6064,6065
6065,6066
6066,6067


In [15]:
nn_8_2["high_income"] = y_test_pred
nn_8_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nn_8_2["high_income"] = y_test_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [16]:
nn_8_2.to_csv("Predictions/nn_8_2.csv",index = False, header=True)