In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from patsy import dmatrices


from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import make_pipeline
# Load the `fair` dataset bundled with statsmodels as a pandas DataFrame.
dta = sm.datasets.fair.load_pandas().data

  from pandas.core import datetools


In [2]:
# Preview the first five rows of the raw data.
dta.head(n=5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666


In [3]:
# Binary label: 1 when a row reports any affairs (affairs > 0), otherwise 0.
dta['affair'] = dta['affairs'].gt(0).astype(int)

In [4]:
# Confirm that the binary `affair` column is now part of the frame.
dta.head(n=5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0,0.111111,1
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0,3.230769,1
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0,1.4,1
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0,0.727273,1
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0,4.666666,1


In [5]:
# Label vector as a NumPy array (one 0/1 entry per row of `dta`).
target = dta.loc[:, 'affair'].values

## Data Exploration

In [6]:
# Count the rows in each class. The data is imbalanced, so 'roc_auc' is a
# better choice of scoring method.
np.bincount(target)

array([4313, 2053], dtype=int64)

In [7]:
# Column means within each class of `affair` (0 = no affairs, 1 = at least one).
dta.groupby('affair').mean()

Unnamed: 0_level_0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs
affair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,4.329701,28.390679,7.989335,1.238813,2.504521,14.322977,3.405286,3.833758,0.0
1,3.647345,30.537019,11.15246,1.728933,2.261568,13.972236,3.463712,3.884559,2.187243


We can see that women who have affairs rate their marriages lower. Now let's group the data by 'rate_marriage' and look for a pattern.

In [8]:
# Column means for each marriage rating, to see how the other variables
# (including the share of rows with `affair` == 1) vary with `rate_marriage`.
dta.groupby('rate_marriage').mean()

Unnamed: 0_level_0,age,yrs_married,children,religious,educ,occupation,occupation_husb,affairs,affair
rate_marriage,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,33.823232,13.914141,2.308081,2.343434,13.848485,3.232323,3.838384,1.201671,0.747475
2.0,30.471264,10.727011,1.735632,2.33046,13.864943,3.327586,3.764368,1.615745,0.635057
3.0,30.008056,10.239174,1.638469,2.308157,14.001007,3.40282,3.79859,1.371281,0.550856
4.0,28.856601,8.816905,1.369536,2.400981,14.144514,3.420161,3.835861,0.674837,0.322926
5.0,28.574702,8.311662,1.252794,2.506334,14.399776,3.454918,3.892697,0.348174,0.181446


In [9]:
# Keep only the eight predictor columns; `affairs` and the derived `affair`
# label are left out so the target is not part of the feature set.
data = dta.loc[:, [
    'rate_marriage', 'age', 'yrs_married', 'children',
    'religious', 'educ', 'occupation', 'occupation_husb',
]]
data.head(n=5)

Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb
0,3.0,32.0,9.0,3.0,3.0,17.0,2.0,5.0
1,3.0,27.0,13.0,3.0,1.0,14.0,3.0,4.0
2,4.0,22.0,2.5,0.0,1.0,16.0,3.0,5.0
3,4.0,37.0,16.5,4.0,3.0,16.0,5.0,5.0
4,5.0,27.0,9.0,1.0,1.0,14.0,3.0,4.0


The data are not yet in a format that machine learning algorithms can understand: we need to encode the categorical variables.
We can do that using the pandas ``get_dummies`` function:

There are many possible categories, so we cannot one-hot encode ("dummify") every column; we have to select some features.<br>

* We'll choose the features with a feature-selection method.

In [10]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Rank the features by random-forest importance and keep those whose
# importance is at or above the median importance.
select = SelectFromModel(
    estimator=RandomForestClassifier(n_estimators=100, random_state=42),
    threshold="median",
)

In [11]:
# Fit the selector on all candidate features, then reduce `data` to the
# columns it keeps (`fit` returns the selector itself, so the calls chain).
data_rf = select.fit(data, target).transform(data)

# Shape before selection, shape after selection, and the boolean mask that
# marks which of the original columns were kept.
print(data.shape)
print(data_rf.shape)
print(select.get_support())

(6366, 8)
(6366, 4)
[ True False  True  True False False False  True]


The selector shows that `'rate_marriage', 'yrs_married', 'children', 'occupation_husb'` are the important features for training a model.<br>
So we'll apply the `get_dummies` function only to the `'rate_marriage', 'children', 'occupation_husb'` features (`'yrs_married'` is left as a numeric column).

In [12]:
# One-hot encode the three chosen categorical columns; every other column of
# `data` is carried over unchanged.
data_dummies = pd.get_dummies(
    data,
    columns=['rate_marriage', 'children', 'occupation_husb'],
)
data_dummies.head(n=5)

Unnamed: 0,age,yrs_married,religious,educ,occupation,rate_marriage_1.0,rate_marriage_2.0,rate_marriage_3.0,rate_marriage_4.0,rate_marriage_5.0,...,children_2.0,children_3.0,children_4.0,children_5.5,occupation_husb_1.0,occupation_husb_2.0,occupation_husb_3.0,occupation_husb_4.0,occupation_husb_5.0,occupation_husb_6.0
0,32.0,9.0,3.0,17.0,2.0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,0
1,27.0,13.0,1.0,14.0,3.0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
2,22.0,2.5,1.0,16.0,3.0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
3,37.0,16.5,3.0,16.0,5.0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,1,0
4,27.0,9.0,1.0,14.0,3.0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [13]:
# Feature matrix as a plain NumPy array (the column labels are dropped here).
dataa = data_dummies.values

In [14]:
# Hold out 25% of the rows for the final evaluation, stratified on the label
# so that both splits keep the same class ratio.
x_train, x_test, y_train, y_test = train_test_split(dataa, target,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=target)

# Preprocessing and model live in one pipeline, so imputation and scaling are
# re-fitted on the training part of every cross-validation fold.
# NOTE: `Imputer` was superseded by `sklearn.impute.SimpleImputer` in
# scikit-learn 0.20 and removed in 0.22; switch the import when upgrading.
pipe = make_pipeline(Imputer(),                           # preprocessing: fill missing values
                     StandardScaler(),                    # preprocessing: standard scaling
                     LogisticRegression(random_state=0))  # estimator: logistic regression

# 20-fold stratified cross-validation. `random_state` is intentionally not
# passed: it is only used when shuffle=True, and newer scikit-learn releases
# raise a ValueError when it is combined with the default shuffle=False.
# The folds are the same as before because no shuffling happens either way.
cv = StratifiedKFold(n_splits=20)

# Values of the regularisation parameter C to try; the key follows the
# "<pipeline step name>__<parameter name>" convention.
param_grid = {'logisticregression__C': [0.01, 0.1, 1, 10]}

# Pick C by cross-validated ROC AUC.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=cv, scoring="roc_auc")

In [15]:
# Run the grid search (cross-validated training) on the training split only.
grid.fit(x_train, y_train)

GridSearchCV(cv=StratifiedKFold(n_splits=20, random_state=42, shuffle=False),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'logisticregression__C': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [16]:
# ROC AUC of the best model on the held-out test split. The grid was built
# with scoring="roc_auc", so this is not classification accuracy.
grid.score(x_test, y_test)

0.7386405721852773

In [17]:
# Value of the logistic-regression parameter C that gave the best
# cross-validated score in the grid search.
grid.best_params_

{'logisticregression__C': 0.1}

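So the ROC AUC of the model on the held-out test set is approximately 0.74. Note that this is the `roc_auc` score used by the grid search, not classification accuracy.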