# Logistic regression  for Census dataset

In [240]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score

import pandas_profiling

import seaborn as sns
import plotly
%matplotlib inline
import plotly.plotly as py
import matplotlib.pyplot as plt
from matplotlib import style

from random import shuffle

In [241]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 'occupation',
          'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'y']

In [242]:
adult_train = pd.read_csv('adult_train.csv', header = None)
adult_test  = pd.read_csv('adult_test.csv', header = None)
adult_train.columns = cols
adult_test.columns = cols
adult_train.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Strategy 1

In [243]:
# remove whitespaces from the beginning of categorical values
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [244]:
adult_train.head(4)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K


In [245]:
adult_train["workclass"].value_counts()


Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [246]:
print(len(adult_train))
adult_train = adult_train[adult_train["workclass"]  != "?"]
adult_test = adult_test[adult_test["workclass"]  != "?"]
len(adult_train)

32561


30725

In [247]:
adult_train["occupation"].value_counts()


Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
?                       7
Name: occupation, dtype: int64

In [248]:
adult_train = adult_train[adult_train["occupation"] != "Other-service"]
adult_test = adult_test[adult_test["occupation"] != "Other-service"]
len(adult_train)

27430

In [249]:
adult_train["native_country"].value_counts()


United-States                 24734
Mexico                          487
?                               473
Philippines                     152
Germany                         121
India                            96
Canada                           95
Puerto-Rico                      93
Cuba                             79
England                          79
South                            60
Jamaica                          60
El-Salvador                      60
Italy                            58
Guatemala                        56
Dominican-Republic               55
Vietnam                          55
China                            52
Japan                            50
Poland                           50
Columbia                         48
Taiwan                           41
Iran                             38
Portugal                         32
Nicaragua                        27
Haiti                            26
Greece                           25
France                      

In [250]:
print(len(adult_train))
adult_train = adult_train[adult_train["native_country"]  != "?"]
adult_test = adult_test[adult_test["native_country"]  != "?"]
len(adult_train)

27430


26957

In [251]:
def logisticRegressionHelper(adult_train, adult_test):
    train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
    test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
    
    # transform y
    train.loc[train['y'] == '>50K', 'y'] = 1
    train.loc[train['y'] == '<=50K', 'y'] = 0

    test.loc[test['y'] == '>50K.', 'y'] = 1
    test.loc[test['y'] == '<=50K.', 'y'] = 0
    
    test['native_country_Holand-Netherlands'] = 0
    
    if len(set(train.columns).difference(set(test.columns))) != 0:
        print("We miss suc columns {}".format(set(train.columns).difference(set(test.columns))))
        return
    
    y_train = train['y']
    x_train = train.loc[:, train.columns != 'y']
    
    y_test = test['y']
    x_test = test.loc[:, test.columns != 'y']
    
    model = LogisticRegression(solver='lbfgs', max_iter = 1000)
    model.fit(x_train, y_train)
    
    print("Training accuracy: {}".format(model.score(x_train, y_train)))

    predicted = model.predict(x_test)
    first_accuracy = [model.score(x_train, y_train), metrics.accuracy_score(y_test, predicted)]
    
    print("Testing accuracy:  {}".format(first_accuracy[-1]))
    
    return first_accuracy

In [252]:
first_accuracy = logisticRegressionHelper(adult_train, adult_test)
first_accuracy

Training accuracy: 0.77171050191045
Testing accuracy:  0.7742630132917502


[0.77171050191045, 0.7742630132917502]

The most drawback is that we remove part of data, where we have some values, from which our model can learn something. Using this dataset it wasn't so painful as if we used a smaller one.


## Strategy 2

In [253]:
adult_train = pd.read_csv('adult_train.csv', header = None)
adult_test  = pd.read_csv('adult_test.csv', header = None)
adult_train.columns = cols
adult_test.columns = cols

In [254]:
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [255]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'mean')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

In [256]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

In [257]:
second_accuracy = logisticRegressionHelper(adult_train, adult_test)
second_accuracy

Training accuracy: 0.7562114185682258
Testing accuracy:  0.7583072292856704


[0.7562114185682258, 0.7583072292856704]

We can see that this strategy work worse than previous one, but for real life it is more suitable. As our previous data was too beautiful and we have smaller segment of all people dataset provide.

We create very similar data, for model it is bad. As it may learn only that features that often appear.
This may be useful when small amount of data missed ( < 10%), we missed near 17%.  

## Strategy 3

In [258]:
adult_train = pd.read_csv('adult_train.csv', header = None)
adult_test  = pd.read_csv('adult_test.csv', header = None)
adult_train.columns = cols
adult_test.columns = cols

In [259]:
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [260]:
adult_train["native_country"].value_counts()

United-States                 29170
Mexico                          643
?                               583
Philippines                     198
Germany                         137
Canada                          121
Puerto-Rico                     114
El-Salvador                     106
India                           100
Cuba                             95
England                          90
Jamaica                          81
South                            80
China                            75
Italy                            73
Dominican-Republic               70
Vietnam                          67
Guatemala                        64
Japan                            62
Poland                           60
Columbia                         59
Taiwan                           51
Haiti                            44
Iran                             43
Portugal                         37
Nicaragua                        34
Peru                             31
France                      

In [261]:
countries2change ={
    "United-States": "North America",
    "Mexico" : "North America",
    "Philippines" : "Asia",
    "Puerto-Rico" :  "North America",
    "Germany" : "Europe",
    "Canada" : "North America",
    "India" : "Asia",
    "El-Salvador" : "North America",
    "China" : "Asia",
    "Cuba" : "North America",
    "England" : "Europe",
    #don't know which part of World is South =(
    "South" : "?",
    "Dominican-Republic" : "North America",
    "Italy" : "Europe",
    "Haiti" : "North America",
    "Japan" : "Asia",
    "Portugal" : "Europe",
    "Poland" : "Europe",
    "Columbia" : "South America",
    "Jamaica" : "North America",
    "Guatemala" : "North America",
    "Greece" : "Europe",
    "Vietnam" : "Asia",
    "Ecuador" : "South America",
    "Iran" : "Asia",
    "Peru" : "South America",
    "Nicaragua" : "North America",
    "Taiwan" : "Asia",
    "Ireland" : "Europe",
    "Thailand" : "Asia",
    "Hong" : "Asia",
    "France" : "Europe",
    "Outlying-US(Guam-USVI-etc)" :  "North America",
    "Scotland" : "Europe",
    "Cambodia" : "Asia",
    "Trinadad&Tobago" : "North America",
    "Honduras" : "North America",
    "Yugoslavia" : "Europe",
    "Hungary" : "Europe",
    "Laos" : "Asia",
    "Holand-Netherlands" : "Europe"
}

In [262]:
adult_train = adult_train.replace(countries2change)
adult_test = adult_test.replace(countries2change)


In [263]:
general_train = adult_train.copy()
general_test = adult_test.copy()

In [264]:
adult_train["native_country"].value_counts()


North America    30588
Asia               671
?                  663
Europe             521
South America      118
Name: native_country, dtype: int64

In [265]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'mean')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', ]] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', ]])
adult_test[['workclass', 'occupation', ]] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', ]])

In [266]:
train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', ])
test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', ])

# transform y
train.loc[train['y'] == '>50K', 'y'] = 1
train.loc[train['y'] == '<=50K', 'y'] = 0

test.loc[test['y'] == '>50K.', 'y'] = 1
test.loc[test['y'] == '<=50K.', 'y'] = 0

# test['native_country_Holand-Netherlands'] = 0

In [267]:
def missing_value_imputation(train, to_predict, missed_value):
    if len(set(train.columns).difference(set(to_predict.columns))) != 0:
        print("We miss suc columns {}".format(set(train.columns).difference(set(to_predict.columns))))
        return
    y_train = train.loc[:, train.columns == missed_value]
    x_train = train.loc[:, train.columns != missed_value]
    x_test = to_predict.loc[:, train.columns != missed_value]
    
    model = LogisticRegression(solver='lbfgs', max_iter = 1000, class_weight='balanced')
    model.fit(x_train, y_train)
    
    predicted = model.predict(x_test)
    x_test[missed_value] = predicted
    print("Training accuracy: {}".format(model.score(x_train, y_train)))

    return x_test

In [268]:
missed = [x for x in list(train) if x.startswith("native_country")]
missed

['native_country']

In [269]:
train_country = train.loc[train["native_country"] != "?", :]
test_country = train.loc[train["native_country"] == "?", :]
train_country["native_country"].value_counts()

North America    30588
Asia               671
Europe             521
South America      118
Name: native_country, dtype: int64

In [270]:
shuffle(missed)
country_res = missing_value_imputation(train_country, test_country, missed[0])
country_res["native_country"].value_counts()


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().





Training accuracy: 0.14809705937676343


Asia             284
Europe           151
South America    151
North America     77
Name: native_country, dtype: int64

In [283]:
general_train["native_country"] = general_train["native_country"].apply(lambda x: x if x != "?" else np.nan)


In [276]:
def change_missed_val(data, pred, val):
    j = 0 
    for i in range(data.shape[0]):
        if pd.isna(data.loc[i,val]):
            data.loc[i,val] = pred.loc[pred.index[j], val]
            j += 1
    return data


In [277]:
general_train = change_missed_val(general_train, country_res, "native_country")
general_train["native_country"].value_counts()

North America    30665
Asia               955
Europe             672
South America      269
Name: native_country, dtype: int64

In [279]:
test

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,native_country,y,workclass_Federal-gov,workclass_Local-gov,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
0,25,226802,7,13314.056689,1875.606815,40,North America,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1,38,89814,9,13314.056689,1875.606815,50,North America,0,0,0,...,0,0,0,0,0,0,0,1,0,1
2,28,336951,12,13314.056689,1875.606815,40,North America,1,0,1,...,0,0,0,0,0,0,0,1,0,1
3,44,160323,10,7688.000000,1875.606815,40,North America,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4,18,103497,10,13314.056689,1875.606815,30,North America,0,0,0,...,1,0,0,0,0,0,0,1,1,0
5,34,198693,6,13314.056689,1875.606815,30,North America,0,0,0,...,0,0,0,0,0,0,0,1,0,1
6,29,227026,9,13314.056689,1875.606815,40,North America,0,0,0,...,0,1,0,0,0,1,0,0,0,1
7,63,104626,15,3103.000000,1875.606815,32,North America,1,0,0,...,0,0,0,0,0,0,0,1,0,1
8,24,369667,10,13314.056689,1875.606815,40,North America,0,0,0,...,0,1,0,0,0,0,0,1,1,0
9,55,104996,4,13314.056689,1875.606815,10,North America,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [280]:
train_country = test.loc[ test["native_country"] != "?", :]
test_country = test.loc[ test["native_country"] == "?", :]
train_country["native_country"].value_counts()

North America    15345
Asia               310
Europe             259
South America       58
Name: native_country, dtype: int64

In [281]:
shuffle(missed)
country_res = missing_value_imputation(train_country, test_country, missed[0])
country_res["native_country"].value_counts()


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().





Training accuracy: 0.08696468820435763


Asia             130
Europe            82
South America     82
North America     15
Name: native_country, dtype: int64

In [284]:
general_test["native_country"] = general_test["native_country"].apply(lambda x: x if x != "?" else np.nan)


In [285]:
general_test = change_missed_val(general_test, country_res, "native_country")
general_test["native_country"].value_counts()

North America    15360
Asia               440
Europe             341
South America      140
Name: native_country, dtype: int64

## Missing value imputation

In [228]:
# remove whitespaces from the beginning of categorical values
for col in adult_train.columns:
    if col not in ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']:
        adult_test[col] = adult_test[col].str.strip()
        adult_train[col] = adult_train[col].str.strip()

In [229]:
# numeric imputation
numeric_imputer = SimpleImputer(missing_values = 0, strategy = 'median')
adult_test[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_test[['capital_gain', 'capital_loss']])
adult_train[['capital_gain', 'capital_loss']] = numeric_imputer.fit_transform(adult_train[['capital_gain', 'capital_loss']])

In [230]:
# categorical imputation
categoric_imputer = SimpleImputer(missing_values= '?',strategy='most_frequent')
adult_train[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_train[['workclass', 'occupation', 'native_country']])
adult_test[['workclass', 'occupation', 'native_country']] = categoric_imputer.fit_transform(adult_test[['workclass', 'occupation', 'native_country']])

## Transformation of categorical values to numerical ones

In [231]:
train = pd.get_dummies(adult_train, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])
test  = pd.get_dummies(adult_test, columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country'])

In [232]:
train.head(5)

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,y,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native_country_Asia,native_country_Europe,native_country_North America,native_country_South America
0,39,77516,13,2174.0,1871.428571,40,<=50K,0,0,0,...,0,0,0,1,0,1,0,0,1,0
1,50,83311,13,12938.541298,1871.428571,13,<=50K,0,0,0,...,0,0,0,1,0,1,0,0,1,0
2,38,215646,9,12938.541298,1871.428571,40,<=50K,0,0,0,...,0,0,0,1,0,1,0,0,1,0
3,53,234721,7,12938.541298,1871.428571,40,<=50K,0,0,0,...,0,1,0,0,0,1,0,0,1,0
4,28,338409,13,12938.541298,1871.428571,40,<=50K,0,0,0,...,0,1,0,0,1,0,0,0,1,0


In [233]:
# transform y

train.loc[train['y'] == '>50K', 'y'] = 1
train.loc[train['y'] == '<=50K', 'y'] = 0

test.loc[test['y'] == '>50K.', 'y'] = 1
test.loc[test['y'] == '<=50K.', 'y'] = 0

In [234]:
# add missing column to test dataset
test['native_country_Holand-Netherlands'] = 0

In [235]:
# now no difference
set(train.columns).difference(set(test.columns))

set()

## Prepare datasets for training

In [236]:
y_train = train['y']
x_train = train.loc[:, train.columns != 'y']

In [237]:
y_test = test['y']
x_test = test.loc[:, test.columns != 'y']

## Train model

In [238]:
model = LogisticRegression(solver='lbfgs', max_iter = 1000)
model.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

## Model evaluation

In [239]:
print("Training accuracy: {}".format(model.score(x_train, y_train)))

predicted = model.predict(x_test)

print("Testing accuracy:  {}".format(metrics.accuracy_score(y_test, predicted)))

Training accuracy: 0.7562114185682258


ValueError: X has 69 features per sample; expecting 68

In [None]:
probabilities = model.predict_proba(x_test)

probabilities

## Model evaluation using cross-validation

In [None]:
scores = cross_val_score(LogisticRegression(solver='lbfgs', max_iter = 1000), x_train.append(x_test, sort=False), y_train.append(y_test), scoring='accuracy', cv=10)

In [None]:
print(scores)
# mean accuracy using cross-validation
print(scores.mean())

## Train model


In [None]:
reg = LinearRegression().fit(x_train, y_train)

In [None]:
reg.score(x_train, y_train)

In [None]:
reg.score(x_test, y_test)