In [437]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, \
GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import time
import warnings

warnings.simplefilter(action='ignore')
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [519]:
data = pd.read_csv('../data/train_eda.csv')
test_data = pd.read_csv('../data/test_eda.csv')

#feature =[column for column in data._get_numeric_data().columns]
feature = [data.corr()['wage_c'].index[i] for i in range(len(data.corr()['wage_c'])) if abs(data.corr()['wage_c'][i])>0.10]

In [520]:
feature

['age',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'wage_c',
 'workclass_ Self-emp-inc',
 'education_ Bachelors',
 'education_ Doctorate',
 'education_ HS-grad',
 'education_ Masters',
 'education_ Prof-school',
 'marital-status_ Married-civ-spouse',
 'marital-status_ Never-married',
 'occupation_ Exec-managerial',
 'occupation_ Other-service',
 'occupation_ Prof-specialty',
 'relationship_ Not-in-family',
 'relationship_ Own-child',
 'relationship_ Unmarried',
 'relationship_ Wife',
 'sex_ Male',
 'wage_ >50K']

In [521]:
len(feature)

23

In [441]:
X = data[feature].drop(columns = ['wage_ >50K','wage_c'])
y = data['wage_c']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

## baseline

In [442]:
data['wage_c'].value_counts()

0    24720
1     7841
Name: wage_c, dtype: int64

In [443]:
data['wage_c'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage_c, dtype: float64

## Adaboost

In [444]:
ada = AdaBoostClassifier()
param = {
    'n_estimators' : [100,150,200,250]
}

grid = GridSearchCV(ada,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=4,
             param_grid={'n_estimators': [100, 150, 200, 250]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [445]:
ada_train = grid.score(X_train, y_train)

In [446]:
ada_test = grid.score(X_test, y_test)

In [447]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
ada_roc = roc_auc_score(y_test, y_score_proba)

## Gradient Boost

In [448]:
grad = GradientBoostingClassifier()
param = {
    'n_estimators' : [100, 150, 200, 50]
}

grid = GridSearchCV(grad,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

In [449]:
grad_train = grid.score(X_train, y_train)

In [450]:
grad_test = grid.score(X_test, y_test)

In [451]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
grad_roc = roc_auc_score(y_test, y_score_proba)

## bernoulliNB

In [452]:
ss = StandardScaler()
bern = BernoulliNB()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

param = {
    'alpha' : [0.1, 1, 10, 100]
}

grid = GridSearchCV(bern,param, cv=5, n_jobs = 4)
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='warn', n_jobs=4, param_grid={'alpha': [0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [453]:
bern_train = grid.score(X_train_sc, y_train)

In [454]:
bern_test = grid.score(X_test_sc, y_test)

In [455]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
bern_roc = roc_auc_score(y_test, y_score_proba)

## Gaussian

In [456]:
gau = GaussianNB()
param = {}

grid = GridSearchCV(gau,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid='warn',
             n_jobs=4, param_grid={}, pre_dispatch='2*n_jobs', refit=True,
             return_train_score=False, scoring=None, verbose=0)

In [457]:
gau_train = grid.score(X_train, y_train)

In [458]:
gau_test = grid.score(X_test, y_test)

In [459]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
gau_roc = roc_auc_score(y_test, y_score_proba)

## knn

In [460]:
knn = KNeighborsClassifier()
param = {
    'n_neighbors' : [3,5,10]
}

grid = GridSearchCV(knn,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=4, param_grid={'n_neighbors': [3, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [461]:
knn_train = grid.score(X_train, y_train)

In [462]:
knn_test = grid.score(X_test,y_test)

In [463]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
knn_roc = roc_auc_score(y_test, y_score_proba)

## logistic regression

In [464]:
log = LogisticRegression()
param = {
    'penalty': ['l1','l2'],
    'C' : [0.1,1,10]
}
grid = GridSearchCV(log,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=4,
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [465]:
log_train = grid.score(X_train,y_train)

In [466]:
log_test = grid.score(X_test,y_test)

In [467]:
pred = grid.predict(X_test)
log_roc = roc_auc_score(y_test,pred)

In [468]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
log_roc = roc_auc_score(y_test, y_score_proba)

## Random Forest

In [469]:
rf = RandomForestClassifier()
param = {}
grid = GridSearchCV(rf,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [470]:
rf_train = grid.score(X_train,y_train)

In [471]:
rf_test = grid.score(X_test,y_test)

In [472]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
rf_roc = roc_auc_score(y_test, y_score_proba)

## SVC

In [473]:
svc = SVC()
param = {}
grid = GridSearchCV(svc,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

In [474]:
svc_train = grid.score(X_train,y_train)

In [475]:
svc_test = grid.score(X_test,y_test)

In [476]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
svc_roc = roc_auc_score(y_test, y_score_proba)

# voting classifier

In [522]:
bern_pipe = Pipeline([
   ('ss', StandardScaler()),
    ('bern' , BernoulliNB())
])

vc = VotingClassifier([
    ('ada' , AdaBoostClassifier()),
    ('grad' , GradientBoostingClassifier()),
#    ('gau' , GaussianNB()),
#    ('bern1',bern_pipe),
#    ('knn' , KNeighborsClassifier()),
    ('log' , LogisticRegression()),
#    ('rf' , RandomForestClassifier()),
#    ('ec' , ExtraTreesClassifier()),
#    ('svc' , SVC(gamma='scale')),
],n_jobs = 4, voting = 'soft')


params = {
    'ada__n_estimators' : [250,500],
    'grad__n_estimators' : [500],
}


In [523]:
gs = GridSearchCV(vc, params, cv = 3)
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=VotingClassifier(estimators=[('ada',
                                                     AdaBoostClassifier(algorithm='SAMME.R',
                                                                        base_estimator=None,
                                                                        learning_rate=1.0,
                                                                        n_estimators=50,
                                                                        random_state=None)),
                                                    ('grad',
                                                     GradientBoostingClassifier(criterion='friedman_mse',
                                                                                init=None,
                                                                                learning_rate=0.1,
                                                                        

In [524]:
vc_train = gs.score(X_train, y_train)

In [525]:
vc_test = gs.score(X_test, y_test)

In [526]:
y_score_proba = [i[1] for i in gs.predict_proba(X_test)]
vc_roc = roc_auc_score(y_test, y_score_proba)

In [527]:
vc_roc

0.9198805836846954

# Comparison

In [517]:
print (f'ada accuracy: {ada_train},{ada_test}')
print (f'gradient accuracy: {grad_train},{grad_test}')
print (f'bernoulli accuracy: {bern_train},{bern_test}')
print (f'gaussian accuracy: {gau_train},{gau_test}')
print (f'knn accuracy: {knn_train},{knn_test}')
print (f'log accuracy: {log_train},{log_test}')
print (f'Random Forest accuracy: {rf_train},{rf_test}')
print (f'SVC accuracy: {svc_train},{svc_test}')
print (f'voting accuracy: {vc_train},{vc_test}')

ada: 0.8652743652743653,0.8635302788355239
gradient: 0.8707616707616708,0.8647586291610367
bernoulli: 0.821990171990172,0.8196781722147156
gaussian: 0.8384520884520884,0.8408057978135364
knn: 0.8721539721539722,0.851001105515293
log: 0.8466011466011466,0.8489129099619211
Random Forest: 0.9381244881244881,0.8388404372927158
SVC: 0.8774774774774775,0.8498955902223314
voting: 0.8609746109746109,0.8652499692912419


In [518]:
print (f'ada roc:{ada_roc}')
print (f'gradient roc:{grad_roc}')
print (f'bernoulli roc:{bern_roc}')
print (f'gaussian roc:{gau_roc}')
print (f'knn roc:{knn_roc}')
print (f'log roc:{log_roc}')
print (f'Random Forest roc:{rf_roc}')
print (f'SVC roc:{svc_roc}')
print (f'voting classifier roc:{vc_roc}')

ada:0.9231514697773624
gradient:0.9258148324853319
bernoulli:0.8711308767156758
gaussian:0.894890488957272
knn:0.894636171083868
log:0.8995495990015485
Random Forest:0.8716029454978886
SVC:0.7405280005546953
voting classifier:0.9148112302678716
