In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Import data

In [131]:
# Processed CSVs live under ../data/processed, relative to the working directory
processed_data_path = os.path.join(os.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [132]:
# Load both processed splits, using the PassengerId column as the row index
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [133]:
# Column dtypes and non-null counts of the training frame
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [134]:
# Column dtypes and non-null counts of the test frame
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

## Data preparation

In [135]:
# Feature matrix: every column from 'Age' onward, cast to float.
# `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases no longer provide.
X = train_df.loc[:, 'Age':].values.astype('float')
# Target vector: the 'Survived' column as a 1-D array
y = train_df['Survived'].values

In [136]:
# Sanity-check the shapes of the feature matrix and the target vector
print(X.shape,y.shape)

(891, 32) (891,)


In [137]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(712, 32) (712,)
(179, 32) (179,)


In [138]:
# Compare the positive-class rate of both splits; the second line reports the
# held-out split, so it is labelled 'test' rather than 'train'
print('mean survival in train :{0:0.3f}'.format(y_train.mean()))
print('mean survival in test :{0:0.3f}'.format(y_test.mean()))

mean survival in train :0.383
mean survival in train :0.385


###### Check Scikit-Learn version

In [139]:
import sklearn

In [140]:
# Installed scikit-learn version
sklearn.__version__

'0.19.1'

### Create Baseline model

In [141]:
from sklearn.dummy import DummyClassifier

In [142]:
# Baseline classifier configured with the 'most_frequent' strategy; later models are compared against it
model_dummy = DummyClassifier(strategy='most_frequent',random_state=0)

In [143]:
# Fit the baseline on the training split
model_dummy.fit(X_train,y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [144]:
# NOTE(review): '{0:2f}' means minimum width 2 with the default 6 decimals, not two decimals — confirm intent
baseline_score = model_dummy.score(X_test, y_test)
print('Score baseline model : {0:2f}'.format(baseline_score))

Score baseline model : 0.614525


In [145]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report, precision_score,recall_score

In [146]:
# Accuracy of the baseline predictions on the held-out split
baseline_predictions = model_dummy.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print('accuracy for baseline model {0:.2f}'.format(baseline_accuracy))

accuracy for baseline model 0.61


In [147]:
# Confusion matrix of the baseline predictions against the held-out labels
baseline_predictions = model_dummy.predict(X_test)
baseline_confusion = confusion_matrix(y_test, baseline_predictions)
print('confusion matrix \n {0}'.format(baseline_confusion))

confusion matrix 
 [[110   0]
 [ 69   0]]


In [148]:
# Precision of the baseline on the held-out split
baseline_predictions = model_dummy.predict(X_test)
baseline_precision = precision_score(y_test, baseline_predictions)
print('precision score : {0:2f}'.format(baseline_precision))

precision score : 0.000000


  'precision', 'predicted', average, warn_for)


In [149]:
# Recall of the baseline on the held-out split
baseline_predictions = model_dummy.predict(X_test)
baseline_recall = recall_score(y_test, baseline_predictions)
print('recall score : {0:2f}'.format(baseline_recall))

recall score : 0.000000


### First Kaggle submission

In [150]:
# Convert the test frame to a float matrix.
# `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases no longer provide.
test_X = test_df.values.astype('float')

In [151]:
# Baseline predictions for every row of the test matrix
predictions = model_dummy.predict(test_X)

In [152]:
# Two-column frame: passenger ids from the test index paired with the predicted labels
df_submission = pd.DataFrame({
    'PassengerId': test_df.index,
    'Survived': predictions,
})

In [153]:
# Preview the first rows of the submission frame
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [154]:
# Submission files are written under ../data/external
submissions_data_path = os.path.join(os.pardir, 'data', 'external')
submssion_file_path = os.path.join(
    submissions_data_path,
    '01_dummy.csv',
)

In [155]:
# Write the submission CSV; index=False leaves the frame's row index out of the file
df_submission.to_csv(submssion_file_path,index=False)

In [156]:

def get_submission_file(model,filename):
    """Predict on the test set with `model` and write a submission CSV.

    Reads the notebook-level `test_df`, predicts one label per row and writes
    a two-column CSV (PassengerId, Survived) to ../data/external/<filename>.

    Parameters
    ----------
    model : fitted estimator exposing `predict`
        Called with the test frame converted to a float matrix.
    filename : str
        Name of the CSV file to create inside the submissions folder.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which newer pandas releases no longer provide
    test_X = test_df.values.astype('float')
    predictions = model.predict(test_X)
    df_submission = pd.DataFrame(
        {'PassengerId': test_df.index, 'Survived': predictions})
    submissions_data_path = os.path.join(os.path.pardir, 'data', 'external')
    # Create the output folder on first use so to_csv does not fail on a fresh checkout
    os.makedirs(submissions_data_path, exist_ok=True)
    submission_file_path = os.path.join(submissions_data_path, filename)
    df_submission.to_csv(submission_file_path, index=False)

In [157]:

# Re-create the baseline submission through the helper
get_submission_file(model_dummy,'01_dummy.csv')

# Logistic Regression model

In [158]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Same split as in the data preparation section (identical arguments and seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Pin the solver: 'liblinear' is what this notebook's recorded output shows, and naming it
# explicitly keeps the results stable on scikit-learn releases with a different default.
model_lr_1 = LogisticRegression(random_state=0, solver='liblinear')
# train model
model_lr_1.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [159]:
# Score of the first logistic regression model on the held-out split
lr_1_score = model_lr_1.score(X_test, y_test)
print('score for logistic regression - version 1 : {0:.2f}'.format(lr_1_score))

score for logistic regression - version 1 : 0.83


In [160]:
# Confusion matrix of the logistic regression predictions against the held-out labels
lr_1_predictions = model_lr_1.predict(X_test)
lr_1_confusion = confusion_matrix(y_test, lr_1_predictions)
print('confusion matrix \n {0}'.format(lr_1_confusion))

confusion matrix 
 [[95 15]
 [15 54]]


In [162]:
# Precision and recall of the logistic regression model, from one shared set of predictions
lr_1_predictions = model_lr_1.predict(X_test)
print('precision score : {0:2f}'.format(precision_score(y_test, lr_1_predictions)))
print('recall score : {0:2f}'.format(recall_score(y_test, lr_1_predictions)))

precision score : 0.782609
recall score : 0.782609


In [163]:
# Learned coefficients of the first logistic regression model
model_lr_1.coef_    

array([[-0.03295336,  0.00428141, -0.49115374,  0.63439222,  0.        ,
         0.09704792, -0.18014511, -0.41899241,  0.50720856,  1.0868588 ,
         0.37425869, -0.16725872, -0.35294457,  0.89950787,  0.43045443,
        -0.38392912,  0.36412404,  1.00347571,  0.97698315, -1.84487321,
         1.4291411 , -0.33473216, -0.64808544,  0.09229949,  0.20283841,
         0.25743714,  0.39345813,  0.46431938,  0.38996677,  0.09174702,
         0.46463139,  0.48140179]])

### Second Kaggle Submission

In [164]:
# Write the logistic regression predictions as the second submission file
get_submission_file(model_lr_1,'02_lr.csv')

# Underfitting vs. Overfitting

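#### Regularization parameters
Parameter `C` is the inverse of the regularization strength: lower values regularize more strongly (pushing towards underfitting), higher values regularize less (allowing overfitting).
The penalty type (`l1` or `l2`) also affects the fit.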



In [165]:
# Name the solver explicitly: the grid searched below includes the 'l1' penalty, which
# 'liblinear' supports, and 'liblinear' is the solver shown in this notebook's recorded output.
model_lr = LogisticRegression(random_state=0, solver='liblinear')

In [166]:
from sklearn.model_selection import GridSearchCV

In [167]:
# Candidate regularisation strengths and penalty types, searched with cv=3
parameters = {
    'C': [1.0, 10.0, 50.0, 100.0, 1000.0],
    'penalty': ['l1', 'l2'],
}
clr = GridSearchCV(model_lr, param_grid=parameters, cv=3)

In [168]:
# Run the grid search on the (unscaled) training split
clr.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [169]:
# Parameter combination selected by the grid search
clr.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [170]:
# Score of the best parameter combination found by the grid search
best_cv_score = clr.best_score_
print('best score {0:.2f}'.format(best_cv_score))

best score 0.83


In [171]:
# Third submission, predicted by the fitted grid-search object.
# The file name follows the '<nn>_<model>.csv' pattern of the earlier submissions.
get_submission_file(clr,'03_lr.csv')

## Feature Normalization
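Rescale the features to a common range, for example 0 to 1 with `MinMaxScaler`, or standardize them to zero mean and unit variance with `StandardScaler`, depending on your data.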

In [172]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

In [173]:
# Min-max scaling of the training features.
# NOTE(review): the next cell reassigns both `scalar` and `X_train_scaled` using a StandardScaler.
scalar = MinMaxScaler()
X_train_scaled = scalar.fit_transform(X_train)

In [174]:
scalar = StandardScaler()
# Learn the scaling statistics from the training split only ...
X_train_scaled = scalar.fit_transform(X_train)
# ... and reuse them for the held-out split. Calling fit_transform here would refit the
# scaler on the test rows: the test data would be scaled with its own statistics, and the
# scaler object persisted later would no longer match what the model was trained on.
X_test_scaled  = scalar.transform(X_test)


In [175]:
# Repeat the grid search on the scaled training features.
# The solver is named explicitly because the grid includes the 'l1' penalty, which
# 'liblinear' supports; it is also the solver shown in this notebook's recorded output.
model_lr = LogisticRegression(random_state=0, solver='liblinear')
parameters = {'C':[1.0,10.0,50.0,100.0,1000.0],'penalty': ['l1','l2']}
clr = GridSearchCV(model_lr,param_grid=parameters,cv=3)
clr.fit(X_train_scaled,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [176]:
# Best score found by the grid search on the scaled training data
clr.best_score_

0.8160112359550562

In [177]:
# NOTE(review): the label says 'best score', but the value is the score on the scaled held-out split
held_out_score = clr.score(X_test_scaled, y_test)
print('best score {0:.2f}'.format(held_out_score))

best score 0.83


# Model persistence

Save the trained model to disk so that it can be loaded from the file and reused whenever needed.

In [178]:
import pickle

In [179]:
 
# Pickle targets for the fitted model and its scaler, both under ../models
models_dir = os.path.join(os.pardir, 'models')
model_file_path = os.path.join(models_dir, 'lr_model.pkl')
scaler_file_path = os.path.join(models_dir, 'lr_scaler.pkl')

In [180]:
# persist the model and scaler
# `with` closes each file even if pickling raises part-way through, so the
# separate open / dump / close steps are combined into one cell
with open(model_file_path, 'wb') as model_file_pickle:
    pickle.dump(clr, model_file_pickle)
with open(scaler_file_path, 'wb') as scaler_file_pickle:
    pickle.dump(scalar, scaler_file_pickle)

In [186]:
# Reload the persisted estimator and scaler; `with` guarantees the files are closed
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote itself.
with open(model_file_path, 'rb') as model_file_pickle:
    clf_loaded = pickle.load(model_file_pickle)
with open(scaler_file_path, 'rb') as scaler_file_pickle:
    scaler_loaded = pickle.load(scaler_file_pickle)

In [187]:
# Inspect the reloaded grid-search object
clf_loaded

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [188]:
# Inspect the reloaded scaler
scaler_loaded

StandardScaler(copy=True, with_mean=True, with_std=True)

In [189]:
 # Scale the held-out features with the scaler restored from disk
 X_test_scaled = scaler_loaded.transform(X_test)

In [192]:
# Score of the reloaded model on the scaled held-out split
persisted_score = clf_loaded.score(X_test_scaled, y_test)
print('score for persisted logistic regression : {0:.2f}'.format(persisted_score))

score for persisted logistic regression : 0.83
