# Logistic regression  for Census dataset

In [217]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas_profiling

import seaborn as sns
import plotly
%matplotlib inline
import plotly.plotly as py
import matplotlib.pyplot as plt
from matplotlib import style

In [218]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [219]:
adult_train = pd.read_csv('adult_train.csv', header = None)
adult_test  = pd.read_csv('adult_test.csv', header = None)
adult_train.columns = cols
adult_test.columns = cols
adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Strategy 1

In [220]:
# remove whitespaces from the beginning of categorical values
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [221]:
adult_train.head(4)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K


In [222]:
adult_train["workclass"].value_counts()


Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [223]:
print(len(adult_train))
adult_train = adult_train[adult_train["workclass"]  != "?"]
adult_test = adult_test[adult_test["workclass"]  != "?"]
len(adult_train)

32561


30725

In [224]:
adult_train["occupation"].value_counts()


Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
?                       7
Name: occupation, dtype: int64

In [225]:
adult_train = adult_train[adult_train["occupation"] != "Other-service"]
adult_test = adult_test[adult_test["occupation"] != "Other-service"]
len(adult_train)

27430

In [226]:
adult_train["native_country"].value_counts()


United-States                 24734
Mexico                          487
?                               473
Philippines                     152
Germany                         121
India                            96
Canada                           95
Puerto-Rico                      93
England                          79
Cuba                             79
South                            60
Jamaica                          60
El-Salvador                      60
Italy                            58
Guatemala                        56
Vietnam                          55
Dominican-Republic               55
China                            52
Japan                            50
Poland                           50
Columbia                         48
Taiwan                           41
Iran                             38
Portugal                         32
Nicaragua                        27
Haiti                            26
France                           25
Greece                      

In [227]:
print(len(adult_train))
adult_train = adult_train[adult_train["native_country"]  != "?"]
adult_test = adult_test[adult_test["native_country"]  != "?"]
len(adult_train)

27430


26957

In [228]:
def logisticRegressionHelper(adult_train, adult_test):
    train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
    test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
    
    # transform y
    train.loc[train['y'] == '>50K', 'y'] = 1
    train.loc[train['y'] == '<=50K', 'y'] = 0

    test.loc[test['y'] == '>50K.', 'y'] = 1
    test.loc[test['y'] == '<=50K.', 'y'] = 0
    
    test['native_country_Holand-Netherlands'] = 0
    
    if len(set(train.columns).difference(set(test.columns))) != 0:
        print("We miss suc columns {}".format(set(train.columns).difference(set(test.columns))))
        return
    
    y_train = train['y']
    x_train = train.loc[:, train.columns != 'y']
    
    y_test = test['y']
    x_test = test.loc[:, test.columns != 'y']
    
    model = LogisticRegression(solver='lbfgs', max_iter = 1000)
    model.fit(x_train, y_train)
    
    print("Training accuracy: {}".format(model.score(x_train, y_train)))

    predicted = model.predict(x_test)
    first_accuracy = [model.score(x_train, y_train), metrics.accuracy_score(y_test, predicted)]
    
    print("Testing accuracy:  {}".format(first_accuracy[-1]))
    
    return first_accuracy

In [229]:
first_accuracy = logisticRegressionHelper(adult_train, adult_test)
first_accuracy

Training accuracy: 0.77171050191045
Testing accuracy:  0.7742630132917502


[0.77171050191045, 0.7742630132917502]

The most drawback is that we remove part of data, where we have some values, from which our model can learn something. Using this dataset it wasn't so painful as if we used a smaller one.


## Strategy 2

In [233]:
adult_train = pd.read_csv('adult_train.csv', header = None)
adult_test  = pd.read_csv('adult_test.csv', header = None)
adult_train.columns = cols
adult_test.columns = cols

In [234]:
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [235]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'mean')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

In [158]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

In [236]:
second_accuracy = logisticRegressionHelper(adult_train, adult_test)
second_accuracy

Training accuracy: 0.7561807069807438
Testing accuracy:  0.7583072292856704


[0.7561807069807438, 0.7583072292856704]

We can see that this strategy work worse than previous one, but for real life it is more suitable. As our previous data was too beautiful and we have smaller segment of all people dataset provide.

We create very similar data, for model it is bad. As it may learn only that features that often appear.
This may be useful when small amount of data missed ( < 10%), we missed near 17%.  

## Missing value imputation

In [156]:
# remove whitespaces from the beginning of categorical values
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [157]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

In [158]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

## Transformation of categorical values to numerical ones

In [159]:
train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])

In [160]:
train.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,y,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,39,77516,13,2174.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,7298.0,1887.0,13,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,7298.0,1887.0,40,<=50K,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
# transform y

train.loc[train['y'] == '>50K', 'y'] = 1
train.loc[train['y'] == '<=50K', 'y'] = 0

test.loc[test['y'] == '>50K.', 'y'] = 1
test.loc[test['y'] == '<=50K.', 'y'] = 0

In [162]:
# add missing column to test dataset
test['native_country_Holand-Netherlands'] = 0

In [163]:
# now no difference
set(train.columns).difference(set(test.columns))

set()

## Prepare datasets for training

In [164]:
y_train = train['y']
x_train = train.loc[:, train.columns != 'y']

In [165]:
y_test = test['y']
x_test = test.loc[:, test.columns != 'y']

## Train model

In [166]:
model = LogisticRegression(solver='lbfgs', max_iter = 1000)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Model evaluation

In [167]:
print("Training accuracy: {}".format(model.score(x_train, y_train)))

predicted = model.predict(x_test)

print("Testing accuracy:  {}".format(metrics.accuracy_score(y_test, predicted)))

Training accuracy: 0.7455705432008749
Testing accuracy:  0.7476260043827612


In [168]:
probabilities = model.predict_proba(x_test)

probabilities

array([[0.87681999, 0.12318001],
       [0.69274159, 0.30725841],
       [0.8106091 , 0.1893909 ],
       ...,
       [0.62361742, 0.37638258],
       [0.80275199, 0.19724801],
       [0.5051571 , 0.4948429 ]])

## Model evaluation using cross-validation

In [169]:
scores = cross_val_score(LogisticRegression(solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=10)

In [170]:
print(scores)
# mean accuracy using cross-validation
print(scores.mean())

[0.74009239 0.73620229 0.73589494 0.75243191 0.78793774 0.74659533
 0.74464981 0.74002918 0.74020919 0.74945269]
0.7473495468678826


## Train model


In [171]:
reg = LinearRegression().fit(x_train, y_train)

In [172]:
reg.score(x_train, y_train)

0.35135172169017925

In [173]:
reg.score(x_test, y_test)

0.29721267776866656