In [3]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import RandomizedSearchCV
from mypipes import *
import warnings
warnings.filterwarnings('ignore')

In [4]:
file=r'../data/census_income.csv'

ci_train=pd.read_csv(file)

In [5]:
ci_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
ci_train['Y'].value_counts()

 <=50K    24720
 >50K      7841
Name: Y, dtype: int64

In [7]:
# pd.crosstab(ci_train['education'],ci_train['education.num'])

In [8]:
cat_vars=list(ci_train.select_dtypes(include=['object']).columns)

In [9]:
cat_vars=[_ for _ in cat_vars if _ not in ['Y','education']]

In [10]:
num_vars=list(ci_train.select_dtypes(exclude=['object']).columns)

In [11]:
p1=pdPipeline([
    ('cat_select',VarSelector(cat_vars)),
    ('missing_trt',DataFrameImputer()),
    ('create_dummies',get_dummies_Pipe(300))
])

p2=pdPipeline([
    ('num_select',VarSelector(num_vars)),
    ('missing_trt',DataFrameImputer()),
    ('standardise',pdStdScaler())
])

data_pipe=FeatureUnion([
    ('cat_pipe',p1),
    ('num_pipe',p2)
])

In [12]:
x_train=pd.DataFrame(data=data_pipe.fit_transform(ci_train),
                     columns=data_pipe.get_feature_names())

In [13]:
x_train

Unnamed: 0,cat_pipe__workclass_ Private,cat_pipe__workclass_ Self-emp-not-inc,cat_pipe__workclass_ Local-gov,cat_pipe__workclass_ ?,cat_pipe__workclass_ State-gov,cat_pipe__workclass_ Self-emp-inc,cat_pipe__workclass_ Federal-gov,cat_pipe__marital.status_ Married-civ-spouse,cat_pipe__marital.status_ Never-married,cat_pipe__marital.status_ Divorced,...,cat_pipe__sex_ Male,cat_pipe__native.country_ United-States,cat_pipe__native.country_ Mexico,cat_pipe__native.country_ ?,num_pipe__age,num_pipe__fnlwgt,num_pipe__education.num,num_pipe__capital.gain,num_pipe__capital.loss,num_pipe__hours.per.week
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


In [14]:
ci_train['Y'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [15]:
y_train=(ci_train['Y']==' >50K').astype(int)

In [16]:
y_train

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    1
32558    0
32559    0
32560    1
Name: Y, Length: 32561, dtype: int32

In [17]:
parameters={
'learning_rate': ["constant", "invscaling", "adaptive"],
'hidden_layer_sizes': [(5,10,5),(20,10),(10,20)], # Eg1: for 1st iteration, there will be 3 layers with 5,10,5 neurons each.
'alpha': [0.3,.1,.01],
'activation': ["relu", "logistic", "tanh"]
}

In [19]:
3*3*3*3

81

In [18]:
clf=MLPClassifier()

In [20]:
random_search=RandomizedSearchCV(clf,n_iter=5,cv=10,
                                 param_distributions=parameters,
                                 scoring='roc_auc',random_state=2,
                                 n_jobs=-1,verbose=20)

In [21]:
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


RandomizedSearchCV(cv=10, estimator=MLPClassifier(), n_iter=5, n_jobs=-1,
                   param_distributions={'activation': ['relu', 'logistic',
                                                       'tanh'],
                                        'alpha': [0.3, 0.1, 0.01],
                                        'hidden_layer_sizes': [(5, 10, 5),
                                                               (20, 10),
                                                               (10, 20)],
                                        'learning_rate': ['constant',
                                                          'invscaling',
                                                          'adaptive']},
                   random_state=2, scoring='roc_auc', verbose=20)

In [22]:
#New
random_search.best_estimator_

MLPClassifier(activation='tanh', alpha=0.01, hidden_layer_sizes=(5, 10, 5),
              learning_rate='adaptive')

In [17]:
#old
random_search.best_estimator_

MLPClassifier(activation='tanh', alpha=0.01, hidden_layer_sizes=(5, 10, 5),
              learning_rate='adaptive')

In [18]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [19]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.915 (std: 0.00490)
Parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (5, 10, 5), 'alpha': 0.01, 'activation': 'tanh'}

Model with rank: 2
Mean validation score: 0.915 (std: 0.00357)
Parameters: {'learning_rate': 'constant', 'hidden_layer_sizes': (10, 20), 'alpha': 0.01, 'activation': 'tanh'}

Model with rank: 3
Mean validation score: 0.913 (std: 0.00448)
Parameters: {'learning_rate': 'invscaling', 'hidden_layer_sizes': (10, 20), 'alpha': 0.01, 'activation': 'relu'}

Model with rank: 4
Mean validation score: 0.907 (std: 0.00553)
Parameters: {'learning_rate': 'adaptive', 'hidden_layer_sizes': (5, 10, 5), 'alpha': 0.3, 'activation': 'logistic'}

Model with rank: 5
Mean validation score: 0.906 (std: 0.00522)
Parameters: {'learning_rate': 'constant', 'hidden_layer_sizes': (20, 10), 'alpha': 0.3, 'activation': 'logistic'}



In [20]:
mlp=random_search.best_estimator_

In [21]:
mlp.fit(x_train,y_train)

MLPClassifier(activation='tanh', alpha=0.01, hidden_layer_sizes=(5, 10, 5),
              learning_rate='adaptive')

In [22]:
mlp.intercepts_

[array([-0.01761839, -0.50810631,  0.35866112, -0.07217437,  0.23825087]),
 array([ 0.31933245,  0.57363651, -0.18586545,  0.18358137,  0.42466708,
        -0.71006331, -0.65688855,  0.21392093, -0.73016228,  0.50963995]),
 array([ 1.06546745,  0.28243266,  0.04738347, -0.04171225, -0.46478601]),
 array([-0.13160139])]

In [23]:
mlp.predict_proba(x_train)

array([[0.9553598 , 0.0446402 ],
       [0.58090644, 0.41909356],
       [0.97836725, 0.02163275],
       ...,
       [0.97570115, 0.02429885],
       [0.99887656, 0.00112344],
       [0.00611386, 0.99388614]])

In [24]:
mlp.classes_

array([0, 1])

In [25]:
mlp.predict_proba(x_train)[:,1]

array([0.0446402 , 0.41909356, 0.02163275, ..., 0.02429885, 0.00112344,
       0.99388614])