In [1]:
# Execute the preprocessing notebook inside this kernel. The cells below use
# `census_train` and `census_test`, which are never assigned in this notebook;
# presumably they are created by this run (TODO confirm).
%run initial_data_transformation_process.ipynb

In [2]:
from sklearn.linear_model import LogisticRegression

# Separate the `income` label from the predictor columns of the training frame.
census_target = census_train['income'].copy()
census_features = census_train.drop(columns='income')

# Baseline classifier with scikit-learn's default settings, fitted on every
# predictor column. The fit call stays last so the estimator repr is displayed.
log_reg = LogisticRegression()
log_reg.fit(census_features, census_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [3]:
# In-sample predictions: `census_features` is the same frame `log_reg` was
# fitted on, so anything computed from `predictions` describes training fit.
predictions = log_reg.predict(census_features)

In [4]:
# Peek at the first five predicted labels.
predictions[:5]

array([0, 0, 0, 0, 0])

In [5]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the in-sample predictions; in scikit-learn's layout the
# rows are the true classes and the columns are the predicted classes.
confusion_matrix(y_true=census_target, y_pred=predictions)

array([[18445,  1555],
       [ 1734,  8171]])

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the baseline model's predictions against the training labels.
# Each scorer is called as scorer(y_true, y_pred), in the order printed below.
accuracy, precision, recall, f1 = (
    scorer(census_target, predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
print(accuracy, precision, recall, f1)

0.8900183915733155 0.8401192679415999 0.8249369005552751 0.8324588660791605


In [7]:
# Dump the fitted coefficient array of the baseline model (printed as a 2-D
# array; the values are unlabeled, so match them to columns by position).
print(log_reg.coef_)

[[ 0.56778288 -0.57653198  1.55999127  0.26968274  1.10280626  0.052425
   0.33611448  0.90759873 -0.08124466 -0.42988399  0.57370742 -0.28822369
  -0.04286369  0.99977713  0.42691747 -0.59761315 -0.88379945  0.74733354
  -0.64777562 -0.13309701 -0.56197922 -1.31189336  1.58418509 -0.66556081
   0.45481653 -0.11248232 -0.10319967  0.05144038 -0.27146731 -0.60308544
   0.42665336 -0.14679451 -0.48571812 -0.30492801  0.57370742 -0.37843967
  -0.21617961  0.48833138 -0.69833034  0.27524909  0.1129072  -0.01305254
   0.21826195 -0.00986282  0.52965481 -1.06763321  0.00635268 -0.48254269
  -1.16617359  0.56527253  0.74172802  0.13092235 -0.11243321 -0.12826637
  -0.34865263  0.15369241 -0.41768246 -0.28967227  0.38412814 -0.15922238
  -0.63786424  0.47386003  0.0113099  -0.33453649 -0.1342018  -0.185663
  -0.0033618  -0.04395953  0.51446376 -0.06875991  0.56807117  0.23531578
  -0.92270406 -0.31455035  0.19869989  0.01745117 -0.24319964  0.41695556
   0.21774117 -0.47297749  0.85336161  0.2

In [8]:
import eli5
from eli5.sklearn import PermutationImportance

# Held-out split: predictors and the `income` label taken from census_test.
X_test = census_test.drop('income',axis=1)
Y_test = census_test['income'].copy()

# Permutation importance of the full-feature model on the held-out rows.
# random_state is fixed so the shuffles (and the report) are repeatable.
perm = PermutationImportance(log_reg, random_state=1).fit(X_test, Y_test)

# top=106 caps how many features the report lists.
# NOTE(review): presumably the total number of feature columns - confirm.
a = eli5.show_weights(perm, feature_names = X_test.columns.tolist(),top=106)

# Persist the rendered HTML report. The encoding is spelled out so the file
# contents do not depend on the platform's default text encoding.
html = a.data
with open('original_permutation_importance.html', 'w', encoding='utf-8') as f:
    f.write(html)


In [9]:
# Column labels removed by drop_further_columns (defined right after this
# list) before the reduced model is fitted.
# NOTE(review): presumably picked from the permutation-importance report
# written earlier in the notebook - confirm the selection rule.
extra_columns_to_drop = [
    'major industry code_Armed Forces',
    'major industry code_Forestry and fisheries',
    'worker class_Without pay',
    'employment status_Part-time/ Unemployed',
    'major industry code_Medical except hospital',
    'unemployment reason_Job loser - on layoff',
    'major industry code_Finance insurance and real estate',
    'marital status_Never married',
    'parents-self birth countries_Self US, Parents Other',
    'tax filer status_Joint - over 65',
    'year_94',
    'major industry code_Entertainment',
    'major industry code_Agriculture',
    'worker class_Federal government',
    'major industry code_Public administration',
    'major occupation code_Precision production craft & repair',
    'major industry code_N/A or children',
    'marital status_Married - not together',
    'major industry code_Personal services except private HH',
    'major industry code_Utilities and sanitary services',
    'veterans benefits_Yes',
    'major industry code_Wholesale trade',
    'veterans benefits_No',
    'major occupation code_Armed Forces',
    'unemployment reason_New entrant',
    'major industry code_Private household services',
    'worker class_Never worked',
    'major occupation code_Technicians and related support',
    'major occupation code_Farming forestry and fishing',
    'major industry code_Other professional services',
    'lived here 1y ago_No',
    'unemployment reason_Re-entrant',
    'enrolled/ in education_High school',
    'unemployment reason_Job leaver',
    'major industry code_Construction',
    'unemployment reason_Not In Universe',
    'tax filer status_Single',
    'enrolled/ in education_Not In Universe',
    'major industry code_Manufacturing-durable goods',
    'unemployment reason_Other job loser',
    'parents-self birth countries_Self Other',
    'worker class_Private',
    'household status_Other',
    'parents-self birth countries_All US',
    'major industry code_Manufacturing-nondurable goods',
    'major occupation code_Transportation and material moving',
    'major industry code_Transportation',
    'own business or self-employed_Yes',
    'major industry code_Business and repair services',
    'worker class_State government',
    'major industry code_Communications',
    'household status_Spouse of Householder',
    'labor union member_Yes',
    'tax filer status_Head of household',
    'major occupation code_Protective services',
    'worker class_Local government',
]

def drop_further_columns(data, columns=None):
    """Return a new frame without the listed columns; ``data`` is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to prune.
    columns : list of str, optional
        Column labels to remove. When omitted, the notebook-level
        ``extra_columns_to_drop`` list is used.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``data`` except the removed ones.

    Raises
    ------
    KeyError
        If a requested label is not a column of ``data`` (pandas' default
        behaviour for ``drop``).
    """
    if columns is None:
        columns = extra_columns_to_drop
    # Deliberately not in-place: mutating the argument would silently shrink
    # the caller's frame and make a second call on it fail with KeyError.
    return data.drop(columns, axis=1)

In [10]:
# Prune a copy so `census_features` keeps every column: `log_reg` was fitted
# on the full frame, and handing the original to a helper that drops columns
# would both shrink it and make this cell fail with KeyError when re-run.
new_census_features = drop_further_columns(census_features.copy())
new_census_target = census_target

# Refit the same default classifier on the reduced column set. The fit call
# stays last so the estimator repr is displayed.
new_log_reg = LogisticRegression()
new_log_reg.fit(new_census_features, new_census_target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# In-sample predictions of the reduced model: `new_census_features` is the
# same frame `new_log_reg` was fitted on.
new_predictions = new_log_reg.predict(new_census_features)

In [12]:
# Score the reduced model's predictions against the training labels.
# Each scorer is called as scorer(y_true, y_pred), in the order printed below.
new_accuracy, new_precision, new_recall, new_f1 = (
    scorer(new_census_target, new_predictions)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
print(new_accuracy, new_precision, new_recall, new_f1)

0.8877445243270357 0.8366234834464322 0.8215042907622413 0.8289949569558351


In [16]:
import eli5
from eli5.sklearn import PermutationImportance

# Held-out split, with the predictors pruned by the same helper that produced
# the frame `new_log_reg` was fitted on.
X_test = census_test.drop('income',axis=1)
X_test = drop_further_columns(X_test)
Y_test = census_test['income'].copy()

# Permutation importance of the reduced model on the held-out rows.
# random_state is fixed so the shuffles (and the report) are repeatable.
new_perm = PermutationImportance(new_log_reg, random_state=1).fit(X_test, Y_test)

# top=54 caps how many features the report lists.
# NOTE(review): the output file name says 50 features - confirm which count
# is intended.
a = eli5.show_weights(new_perm, feature_names = X_test.columns.tolist(),top=54)

# Persist the rendered HTML report. The encoding is spelled out so the file
# contents do not depend on the platform's default text encoding.
html = a.data
with open('new_50_features_permutation_importance.html', 'w', encoding='utf-8') as f:
    f.write(html)

In [None]:
# NOTE(review): `new_data` is not assigned in any cell of this notebook;
# presumably it comes from the notebook executed via %run - confirm,
# otherwise this cell raises NameError on a fresh kernel.
new_data.info()

In [None]:
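# For reference: the train/test frames used throughout this notebook are
# presumably built in the preprocessing notebook run at the top, as:
# census_train = full_data_scaled.loc[train_index]
# census_test = full_data_scaled.loc[test_index]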