In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, \
GradientBoostingClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
import time
import warnings

warnings.simplefilter(action='ignore')
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [2]:
data = pd.read_csv('../data/large_train_sample.csv')
test_data = pd.read_csv('../data/test_eda.csv')

In [3]:
test_data.shape

(16281, 13)

In [4]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
wage              0
dtype: int64

In [5]:
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'wage'],
      dtype='object')

In [6]:
data['wage_c'] = data['wage'].str.strip().map({
    '<=50K' : 0,
    '>50K' : 1
})

### workclass to numeric

In [7]:
data.groupby('workclass')['wage_c'].mean().sort_values(ascending = False)

workclass
 Self-emp-inc        0.557348
 Federal-gov         0.386458
 Local-gov           0.294792
 Self-emp-not-inc    0.284927
 State-gov           0.271957
 Private             0.218673
 ?                   0.104031
 Without-pay         0.000000
 Never-worked        0.000000
Name: wage_c, dtype: float64

In [8]:
data['workclass'] = data['workclass'].str.strip().map({
    'Never-worked' : 0,
    'Without-pay' : 0,
    '?' : 1,
    'Private' : 2,
    'State-gov' : 3,
    'Self-emp-not-inc' : 4,
    'Local-gov' : 5,
    'Federal-gov' : 6,
    'Self-emp-inc' : 7
})

### education to numeric

In [9]:
data.groupby('education')['wage_c'].mean().sort_values(ascending = False)

education
 Doctorate       0.740920
 Prof-school     0.734375
 Masters         0.556587
 Bachelors       0.414753
 Assoc-voc       0.261216
 Assoc-acdm      0.248360
 Some-college    0.190235
 HS-grad         0.159509
 12th            0.076212
 10th            0.066452
 7th-8th         0.061920
 9th             0.052529
 11th            0.051064
 5th-6th         0.048048
 1st-4th         0.035714
 Preschool       0.000000
Name: wage_c, dtype: float64

In [10]:
data['education'] = data['education'].str.strip().map({
    'Preschool' : 0,
    '1st-4th' : 1,
    '5th-6th' : 2,
    '11th' : 3,
    '9th' : 4,
    '7th-8th' : 5,
    '10th' : 6,
    '12th' : 7,
    'HS-grad': 8,
    'Some-college' : 9,
    'Assoc-acdm' : 10,
    'Assoc-voc' : 11,
    'Bachelors' : 12,
    'Masters' : 13,
    'Prof-school' : 14,
    'Doctorate' : 15
})

## marital-status

In [11]:
data.groupby('marital-status')['wage_c'].mean().sort_values(ascending = False)

marital-status
 Married-civ-spouse       0.446848
 Married-AF-spouse        0.434783
 Divorced                 0.104209
 Widowed                  0.085599
 Married-spouse-absent    0.081340
 Separated                0.064390
 Never-married            0.045961
Name: wage_c, dtype: float64

In [12]:
data['marital-status'] = data['marital-status'].str.strip().map({
    'Never-married' : 0,
    'Separated' : 1,
    'Married-spouse-absent' : 2,
    'Widowed': 3,
    'Divorced' : 4,
    'Married-AF-spouse' : 5,
    'Married-civ-spouse' : 6,
})

## occupation

In [13]:
data.groupby('occupation')['wage_c'].mean().sort_values(ascending = False)

occupation
 Exec-managerial      0.484014
 Prof-specialty       0.449034
 Protective-serv      0.325116
 Tech-support         0.304957
 Sales                0.269315
 Craft-repair         0.226641
 Transport-moving     0.200376
 Adm-clerical         0.134483
 Machine-op-inspct    0.124875
 Farming-fishing      0.115694
 Armed-Forces         0.111111
 ?                    0.103635
 Handlers-cleaners    0.062774
 Other-service        0.041578
 Priv-house-serv      0.006711
Name: wage_c, dtype: float64

In [14]:
data['occupation'] = data['occupation'].str.strip().map({
    'Priv-house-serv' : 0,
    'Other-service' : 1,
    'Handlers-cleaners' : 2,
    '?': 3,
    'Armed-Forces' : 4,
    'Farming-fishing' : 5,
    'Machine-op-inspct' : 6,
    'Adm-clerical' : 7,
    'Transport-moving' : 8,
    'Craft-repair' : 9,
    'Sales' : 10,
    'Tech-support' : 11,
    'Protective-serv' : 12,
    'Prof-specialty' : 13,
    'Exec-managerial' :14
})

## relationship

In [15]:
data.groupby('relationship')['wage_c'].mean().sort_values(ascending = False)

relationship
 Wife              0.475128
 Husband           0.448571
 Not-in-family     0.103070
 Unmarried         0.063262
 Other-relative    0.037717
 Own-child         0.013220
Name: wage_c, dtype: float64

In [16]:
data['relationship'] = data['relationship'].str.strip().map({
    'Own-child' : 0,
    'Other-relative' : 1,
    'Unmarried' : 2,
    'Not-in-family' : 3,
    'Husband' : 4,
    'Wife' :5
})

## native-country

In [17]:
data.groupby('native-country')['wage_c'].mean().sort_values(ascending = False)

native-country
 Iran                          0.418605
 France                        0.413793
 India                         0.400000
 Taiwan                        0.392157
 Japan                         0.387097
 Yugoslavia                    0.375000
 Cambodia                      0.368421
 Italy                         0.342466
 England                       0.333333
 Canada                        0.322314
 Germany                       0.321168
 Philippines                   0.308081
 Hong                          0.300000
 Greece                        0.275862
 China                         0.266667
 Cuba                          0.263158
 ?                             0.250429
 Scotland                      0.250000
 United-States                 0.245835
 Hungary                       0.230769
 Ireland                       0.208333
 South                         0.200000
 Poland                        0.200000
 Thailand                      0.166667
 Ecuador                 

In [18]:
data['native-country'] = data['native-country'].str.strip().map({
    'Holand-Netherlands' : 0,
    'Outlying-US(Guam-USVI-etc)' : 1,
    'Dominican-Republic' : 2,
    'Columbia' : 3,
    'Guatemala' : 4,
    'Mexico' : 5,
    'Nicaragua' : 6,
    'Peru' : 7,
    'Vietnam' : 8,
    'Honduras' : 9,
    'El-Salvador' : 10,
    'Haiti' : 11,
    'Puerto-Rico' :12,
    'Trinadad&Tobago' : 13,
    'Portugal' : 14,
    'Laos' : 15,
    'Jamaica' : 16,
    'Ecuador' : 17,
    'Thailand' : 18,
    'Poland' : 19,
    'South' : 20,
    'Ireland' : 21,
    'Hungary' : 22,
    'United-States' : 23,
    'Scotland' : 24,
    '?' : 25,
    'Cuba' : 26,
    'China' : 27,
    'Greece' : 28,
    'Hong' : 29,
    'Philippines' : 30,
    'Germany' : 31,
    'Canada' : 32,
    'England' : 33,
    'Italy' : 34,
    'Cambodia' : 35,
    'Yugoslavia' : 36,
    'Japan' :37,
    'Taiwan' :38,
    'India' : 39,
    'France' :40,
    'Iran' : 41
})                
                               

In [19]:
data['education_com'] = data['education'] * data['education-num']

In [20]:
data.corr()['wage_c']

age               0.234037
workclass         0.164816
fnlwgt           -0.009463
education         0.333231
education-num     0.335154
marital-status    0.411532
occupation        0.336541
relationship      0.384144
capital-gain      0.223329
capital-loss      0.150526
hours-per-week    0.229689
native-country    0.097889
wage_c            1.000000
education_com     0.358760
Name: wage_c, dtype: float64

In [21]:
feature = [data.corr()['wage_c'].index[i] for i in range(len(data.corr()['wage_c'])) if abs(data.corr()['wage_c'][i])>0.15]
feature

['age',
 'workclass',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'wage_c',
 'education_com']

In [22]:
X = data[feature].drop(columns = 'wage_c')
y = data['wage_c']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 42)

## baseline

In [23]:
data['wage_c'].value_counts()

0    24720
1     7841
Name: wage_c, dtype: int64

In [24]:
data['wage_c'].value_counts(normalize=True)

0    0.75919
1    0.24081
Name: wage_c, dtype: float64

# Model

## Adaboost

In [25]:
ada = AdaBoostClassifier()
param = {
    'n_estimators' : [250,500]
}

grid = GridSearchCV(ada,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=AdaBoostClassifier(algorithm='SAMME.R',
                                          base_estimator=None,
                                          learning_rate=1.0, n_estimators=50,
                                          random_state=None),
             iid='warn', n_jobs=4, param_grid={'n_estimators': [250, 500]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
grid.best_params_

{'n_estimators': 500}

In [27]:
ada_train = grid.score(X_train, y_train)

In [28]:
ada_test = grid.score(X_test, y_test)

In [29]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
ada_roc = roc_auc_score(y_test, y_score_proba)

In [30]:
ada_roc

0.9270267425850779

## Gradient Boost

In [31]:
grad = GradientBoostingClassifier()
param = {
    'n_estimators' : [200, 500]
}

grid = GridSearchCV(grad,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GradientBoostingClassifier(criterion='friedman_mse',
                                                  init=None, learning_rate=0.1,
                                                  loss='deviance', max_depth=3,
                                                  max_features=None,
                                                  max_leaf_nodes=None,
                                                  min_impurity_decrease=0.0,
                                                  min_impurity_split=None,
                                                  min_samples_leaf=1,
                                                  min_samples_split=2,
                                                  min_weight_fraction_leaf=0.0,
                                                  n_estimators=100,
                                                  n_iter_no_change=None,
                                                  presort=

In [32]:
grid.best_params_

{'n_estimators': 200}

In [33]:
grad_train = grid.score(X_train, y_train)

In [34]:
grad_test = grid.score(X_test, y_test)

In [35]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
grad_roc = roc_auc_score(y_test, y_score_proba)

In [36]:
grad_roc

0.9293874579438637

## bernoulliNB

In [37]:
ss = StandardScaler()
bern = BernoulliNB()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

param = {
    'alpha' : [0.1, 1, 10, 100]
}

grid = GridSearchCV(bern,param, cv=5, n_jobs = 4)
grid.fit(X_train_sc, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                                   fit_prior=True),
             iid='warn', n_jobs=4, param_grid={'alpha': [0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [38]:
grid.best_params_

{'alpha': 10}

In [39]:
bern_train = grid.score(X_train_sc, y_train)

In [40]:
bern_test = grid.score(X_test_sc, y_test)

In [41]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
bern_roc = roc_auc_score(y_test, y_score_proba)

## Gaussian

In [42]:
gau = GaussianNB()
param = {}

grid = GridSearchCV(gau,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=GaussianNB(priors=None, var_smoothing=1e-09), iid='warn',
             n_jobs=4, param_grid={}, pre_dispatch='2*n_jobs', refit=True,
             return_train_score=False, scoring=None, verbose=0)

In [43]:
gau_train = grid.score(X_train, y_train)

In [44]:
gau_test = grid.score(X_test, y_test)

In [45]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
gau_roc = roc_auc_score(y_test, y_score_proba)

## knn

In [46]:
knn = KNeighborsClassifier()
param = {
    'n_neighbors' : [3,5,10]
}

grid = GridSearchCV(knn,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=4, param_grid={'n_neighbors': [3, 5, 10]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [47]:
grid.best_params_

{'n_neighbors': 10}

In [48]:
knn_train = grid.score(X_train, y_train)

In [49]:
knn_test = grid.score(X_test,y_test)

In [50]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
knn_roc = roc_auc_score(y_test, y_score_proba)

## logistic regression

In [51]:
log = LogisticRegression()
param = {
    'penalty': ['l1','l2'],
    'C' : [0.1,1,10]
}
grid = GridSearchCV(log,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=4,
             param_grid={'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [52]:
grid.best_params_

{'C': 1, 'penalty': 'l1'}

In [53]:
log_train = grid.score(X_train,y_train)

In [54]:
log_test = grid.score(X_test,y_test)

In [55]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
log_roc = roc_auc_score(y_test, y_score_proba)

## Random Forest

In [56]:
rf = RandomForestClassifier()
param = {}
grid = GridSearchCV(rf,param, cv=5, n_jobs = 4)
grid.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [57]:
rf_train = grid.score(X_train,y_train)

In [58]:
rf_test = grid.score(X_test,y_test)

In [59]:
y_score_proba = [i[1] for i in grid.predict_proba(X_test)]
rf_roc = roc_auc_score(y_test, y_score_proba)

# voting classifier

In [60]:
bern_pipe = Pipeline([
   ('ss', StandardScaler()),
    ('bern' , BernoulliNB())
])

vc = VotingClassifier([
    ('ada' , AdaBoostClassifier()),
    ('grad' , GradientBoostingClassifier()),
#    ('gau' , GaussianNB()),
#    ('bern1',bern_pipe),
#    ('knn' , KNeighborsClassifier()),
#    ('log' , LogisticRegression()),
#    ('rf' , RandomForestClassifier()),
#    ('ec' , ExtraTreesClassifier()),
],n_jobs = 4, voting = 'soft')


params = {
    'ada__n_estimators' : [250,500],
    'grad__n_estimators' : [500],
}


In [61]:
gs = GridSearchCV(vc, params, cv = 3)
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=VotingClassifier(estimators=[('ada',
                                                     AdaBoostClassifier(algorithm='SAMME.R',
                                                                        base_estimator=None,
                                                                        learning_rate=1.0,
                                                                        n_estimators=50,
                                                                        random_state=None)),
                                                    ('grad',
                                                     GradientBoostingClassifier(criterion='friedman_mse',
                                                                                init=None,
                                                                                learning_rate=0.1,
                                                                        

In [62]:
vc_train = gs.score(X_train, y_train)

In [63]:
vc_test = gs.score(X_test, y_test)

In [64]:
y_score_proba = [i[1] for i in gs.predict_proba(X_test)]
vc_roc = roc_auc_score(y_test, y_score_proba)

In [65]:
submit_chicago = y_score_proba = [i[1] for i in gs.predict_proba(X_test)]

In [66]:
gs.best_params_

{'ada__n_estimators': 250, 'grad__n_estimators': 500}

In [67]:
vc_roc

0.9309068442131747

# Comparison

In [86]:
print (f'ada accuracy: {ada_train},{ada_test}')
print (f'gradient accuracy: {grad_train},{grad_test}')
print (f'bernoulli accuracy: {bern_train},{bern_test}')
print (f'gaussian accuracy: {gau_train},{gau_test}')
print (f'knn accuracy: {knn_train},{knn_test}')
print (f'log accuracy: {log_train},{log_test}')
print (f'Random Forest accuracy: {rf_train},{rf_test}')
print (f'voting accuracy: {vc_train},{vc_test}')

ada accuracy: 0.8701064701064701,0.869672030463088
gradient accuracy: 0.8764127764127764,0.8716373909839086
bernoulli accuracy: 0.7932022932022932,0.799901731973959
gaussian accuracy: 0.8378787878787879,0.8377349219997543
knn accuracy: 0.8759623259623259,0.8555460017196904
log accuracy: 0.847051597051597,0.8507554354501904
Random Forest accuracy: 0.963963963963964,0.8439995086598698
voting accuracy: 0.8828419328419328,0.8763051222208574


In [87]:
print (f'ada roc:{ada_roc}')
print (f'gradient roc:{grad_roc}')
print (f'bernoulli roc:{bern_roc}')
print (f'gaussian roc:{gau_roc}')
print (f'knn orc:{knn_roc}')
print (f'log roc:{log_roc}')
print (f'Random Forest roc:{rf_roc}')
print (f'voting classifier roc:{vc_roc}')

ada roc:0.9270267425850779
gradient roc:0.9293874579438637
bernoulli roc:0.7580768005309226
gaussian roc:0.8915853471302775
knn orc:0.9003003773908853
log roc:0.9034609022382614
Random Forest roc:0.8800201159577242
voting classifier roc:0.9309068442131747


# Test

In [71]:
bern_pipe = Pipeline([
   ('ss', StandardScaler()),
    ('bern' , BernoulliNB())
])

vc = VotingClassifier([
    ('ada' , AdaBoostClassifier()),
    ('grad' , GradientBoostingClassifier()),
#    ('gau' , GaussianNB()),
#    ('bern1',bern_pipe),
#    ('knn' , KNeighborsClassifier()),
#    ('log' , LogisticRegression()),
#    ('rf' , RandomForestClassifier()),
#    ('ec' , ExtraTreesClassifier()),
],n_jobs = 4, voting = 'soft')


params = {
    'ada__n_estimators' : [250,500],
    'grad__n_estimators' : [500],
}

gs = GridSearchCV(vc, params, cv = 3)
gs.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=VotingClassifier(estimators=[('ada',
                                                     AdaBoostClassifier(algorithm='SAMME.R',
                                                                        base_estimator=None,
                                                                        learning_rate=1.0,
                                                                        n_estimators=50,
                                                                        random_state=None)),
                                                    ('grad',
                                                     GradientBoostingClassifier(criterion='friedman_mse',
                                                                                init=None,
                                                                                learning_rate=0.1,
                                                                        

In [72]:
test_data['workclass'] = test_data['workclass'].str.strip().map({
    'Never-worked' : 0,
    'Without-pay' : 0,
    '?' : 1,
    'Private' : 2,
    'State-gov' : 3,
    'Self-emp-not-inc' : 4,
    'Local-gov' : 5,
    'Federal-gov' : 6,
    'Self-emp-inc' : 7
})

In [73]:
test_data['education'] = test_data['education'].str.strip().map({
    'Preschool' : 0,
    '1st-4th' : 1,
    '5th-6th' : 2,
    '11th' : 3,
    '9th' : 4,
    '7th-8th' : 5,
    '10th' : 6,
    '12th' : 7,
    'HS-grad': 8,
    'Some-college' : 9,
    'Assoc-acdm' : 10,
    'Assoc-voc' : 11,
    'Bachelors' : 12,
    'Masters' : 13,
    'Prof-school' : 14,
    'Doctorate' : 15
})

In [74]:
test_data['marital-status'] = test_data['marital-status'].str.strip().map({
    'Never-married' : 0,
    'Separated' : 1,
    'Married-spouse-absent' : 2,
    'Widowed': 3,
    'Divorced' : 4,
    'Married-AF-spouse' : 5,
    'Married-civ-spouse' : 6,
})

In [75]:
test_data['occupation'] = test_data['occupation'].str.strip().map({
    'Priv-house-serv' : 0,
    'Other-service' : 1,
    'Handlers-cleaners' : 2,
    '?': 3,
    'Armed-Forces' : 4,
    'Farming-fishing' : 5,
    'Machine-op-inspct' : 6,
    'Adm-clerical' : 7,
    'Transport-moving' : 8,
    'Craft-repair' : 9,
    'Sales' : 10,
    'Tech-support' : 11,
    'Protective-serv' : 12,
    'Prof-specialty' : 13,
    'Exec-managerial' :14
})

In [76]:
test_data['relationship'] = test_data['relationship'].str.strip().map({
    'Own-child' : 0,
    'Other-relative' : 1,
    'Unmarried' : 2,
    'Not-in-family' : 3,
    'Husband' : 4,
    'Wife' :5
})

In [77]:
test_data['native-country'] = test_data['native-country'].str.strip().map({
    'Holand-Netherlands' : 0,
    'Outlying-US(Guam-USVI-etc)' : 1,
    'Dominican-Republic' : 2,
    'Columbia' : 3,
    'Guatemala' : 4,
    'Mexico' : 5,
    'Nicaragua' : 6,
    'Peru' : 7,
    'Vietnam' : 8,
    'Honduras' : 9,
    'El-Salvador' : 10,
    'Haiti' : 11,
    'Puerto-Rico' :12,
    'Trinadad&Tobago' : 13,
    'Portugal' : 14,
    'Laos' : 15,
    'Jamaica' : 16,
    'Ecuador' : 17,
    'Thailand' : 18,
    'Poland' : 19,
    'South' : 20,
    'Ireland' : 21,
    'Hungary' : 22,
    'United-States' : 23,
    'Scotland' : 24,
    '?' : 25,
    'Cuba' : 26,
    'China' : 27,
    'Greece' : 28,
    'Hong' : 29,
    'Philippines' : 30,
    'Germany' : 31,
    'Canada' : 32,
    'England' : 33,
    'Italy' : 34,
    'Cambodia' : 35,
    'Yugoslavia' : 36,
    'Japan' :37,
    'Taiwan' :38,
    'India' : 39,
    'France' :40,
    'Iran' : 41
})                

In [78]:
test_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In [79]:
test_data['education_com'] = test_data['education'] * test_data['education-num']

In [80]:
new_feature = feature

In [81]:
new_feature.remove('wage_c')

In [82]:
new_feature

['age',
 'workclass',
 'education',
 'education-num',
 'marital-status',
 'occupation',
 'relationship',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'education_com']

In [83]:
test_data['wage'] = [i[1] for i in gs.predict_proba(test_data[new_feature])]

In [84]:
submit = pd.DataFrame(test_data['wage'])

In [85]:
submit.to_csv('../data/submit.csv', index = False)