In [1]:
import pandas as pd
import numpy as np
import os

In [4]:
processed_data_path = os.path.join(os.path.pardir, "data", "processed")
train_file_path = os.path.join(processed_data_path, "train.csv")
test_file_path = os.path.join(processed_data_path, "test.csv")
print(train_file_path, test_file_path)

../data/processed/train.csv ../data/processed/test.csv


In [5]:
train_df = pd.read_csv(train_file_path, index_col='PassengerId')
test_df = pd.read_csv(test_file_path, index_col='PassengerId')

In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

# Data Preparation

In [10]:
X = train_df.loc[:, 'Age':].as_matrix().astype('float')

  """Entry point for launching an IPython kernel.


In [11]:
X

array([[22.    ,  7.25  ,  2.    , ...,  1.    ,  1.    ,  0.    ],
       [38.    , 71.2833,  2.    , ...,  0.    ,  1.    ,  0.    ],
       [26.    ,  7.925 ,  1.    , ...,  1.    ,  1.    ,  0.    ],
       ...,
       [22.    , 23.45  ,  4.    , ...,  1.    ,  1.    ,  0.    ],
       [26.    , 30.    ,  1.    , ...,  0.    ,  1.    ,  0.    ],
       [32.    ,  7.75  ,  1.    , ...,  0.    ,  1.    ,  0.    ]])

In [15]:
X.shape

(891, 32)

In [18]:
y = train_df['Survived'].ravel()

In [20]:
y[:10]

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1])

In [21]:
train_df.head()

Unnamed: 0_level_0,Survived,Age,Fare,FamilySize,IsMother,IsMale,Deck_A,Deck_B,Deck_C,Deck_D,...,Title_Sir,Fare_Bin_very_low,Fare_Bin_low,Fare_Bin_high,Fare_Bin_very_high,Embarked_C,Embarked_Q,Embarked_S,AgeState_Adult,AgeState_Child
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,22.0,7.25,2,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,1,0
2,1,38.0,71.2833,2,0,0,0,0,1,0,...,0,0,0,0,1,1,0,0,1,0
3,1,26.0,7.925,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0
4,1,35.0,53.1,2,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,1,0
5,0,35.0,8.05,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,1,0


In [22]:
X.shape, y.shape

((891, 32), (891,))

In [29]:
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [30]:
print("Mean survival in train is {0:.3f}".format(np.mean(y_train)))
print("Mean survival in  test is {0:.3f}".format(np.mean(y_test)))

Mean survival in train is 0.383
Mean survival in  test is 0.385


# Check sklean version

In [31]:
import sklearn

In [33]:
sklearn.__version__

'0.19.1'

# Baseline Model

In [35]:
from sklearn.dummy import DummyClassifier

In [37]:
# create the model
model_dummy = DummyClassifier(strategy='most_frequent', random_state = 0)

In [38]:
# train the model
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

# Baseline accuracy

In [39]:
print("Score for baseline model is {:.2f}".format(model_dummy.score(X_test, y_test)))

Score for baseline model is 0.61


In [40]:
# Performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [41]:
# accuracy score
print("Accuracy for baseline model : {:.2f}".format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accuracy for baseline model : 0.61


In [42]:
# confusion matrix
print("Confusion matrix for baseline model:\n{0}".format(confusion_matrix(y_test, model_dummy.predict(X_test))))

Confusion matrix for baseline model:
[[110   0]
 [ 69   0]]


In [44]:
# precision and recall are always 0 since we never predict positive
print("Precision for baseline model : {:.2f}".format(precision_score(y_test, model_dummy.predict(X_test))))
print("Recall for baseline model : {:.2f}".format(recall_score(y_test, model_dummy.predict(X_test))))

Precision for baseline model : 0.00
Recall for baseline model : 0.00


  'precision', 'predicted', average, warn_for)


# First Kaggle Submission

In [45]:
# convertion to a matrix
test_X = test_df.as_matrix().astype('float')

  


In [49]:
# get predictions
predictions = model_dummy.predict(test_X)

In [50]:
test_df.index

Int64Index([ 892,  893,  894,  895,  896,  897,  898,  899,  900,  901,
            ...
            1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309],
           dtype='int64', name='PassengerId', length=418)

In [52]:
df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})

In [55]:
df_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 2 columns):
PassengerId    418 non-null int64
Survived       418 non-null int64
dtypes: int64(2)
memory usage: 6.6 KB


In [56]:
df_submission.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [57]:
submission_data_path = os.path.join(os.path.pardir, "data", "external")
submission_file_path = os.path.join(submission_data_path, "01_dummy.csv")

In [60]:
df_submission.to_csv(submission_file_path, index=False)

In [61]:
def get_submission_file(model, filename):
    test_X = test_df.as_matrix().astype('float')
    predictions = model.predict(test_X)
    df_submission = pd.DataFrame({'PassengerId': test_df.index, 'Survived': predictions})
    submission_data_path = os.path.join(os.path.pardir, "data", "external")
    submission_file_path = os.path.join(submission_data_path, filename)
    df_submission.to_csv(submission_file_path, index=False)

In [62]:
get_submission_file(model_dummy, "01_dummy.csv")

  


# Logistical Regression Model

In [63]:
from sklearn.linear_model import LogisticRegression

In [64]:
model_lr_1 = LogisticRegression(random_state = 0)

In [66]:
model_lr_1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [67]:
print("Score for model_lr_1 is {:.2f}".format(model_lr_1.score(X_test, y_test)))

Score for model_lr_1 is 0.83


In [68]:
def score_model(model):
    # accuracy score
    print("Accuracy for model : {:.2f}".format(accuracy_score(y_test, model.predict(X_test))))
    print("Confusion matrix for model:\n{0}".format(confusion_matrix(y_test, model.predict(X_test))))
    print("Precision for model : {:.2f}".format(precision_score(y_test, model.predict(X_test))))
    print("Recall for model : {:.2f}".format(recall_score(y_test, model.predict(X_test))))


In [69]:
score_model(model_lr_1)

Accuracy for model : 0.83
Confusion matrix for model:
[[95 15]
 [15 54]]
Precision for model : 0.78
Recall for model : 0.78


In [73]:
model_lr_1.coef_

array([[-0.02842268,  0.00455451, -0.50009089,  0.6178132 , -0.81392331,
         0.12845079, -0.17281789, -0.39317834,  0.52159979,  1.09941224,
         0.40341217, -0.18345052, -0.30036043,  0.96533486,  0.48256744,
        -0.34483448,  0.28089598,  1.21761328,  0.56363966, -1.44586305,
         1.07245548, -0.11273708, -0.47293646,  0.16255648,  0.24716933,
         0.28009428,  0.41324773,  0.49183528,  0.46198829,  0.14924424,
         0.37283516,  0.73023265]])

In [74]:
get_submission_file(model_lr_1, "02_lr.csv")

  


# Hyperparameter Optimization
This is the practice of adjusting the parameters of the alorithm itself (as opposed to the features in the training data)
To evaluate the various algorithm parameters split the training data again to create a "cross-validation" set.  This is used to evaluate the hyperparameters.

# Grid Search & K-Fold Crossvalidation

In [75]:
model_lr = LogisticRegression(random_state=0)

In [76]:
from sklearn.model_selection import GridSearchCV

In [77]:
parameters = { 'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1','l2']}

In [78]:
parameters

{'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']}

In [79]:
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)

In [80]:
clf.fit(X_train, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [81]:
clf.best_params_

{'C': 1.0, 'penalty': 'l1'}

In [82]:
clf.best_score_

0.8328651685393258

In [83]:
score_model(clf)

Accuracy for model : 0.83
Confusion matrix for model:
[[94 16]
 [15 54]]
Precision for model : 0.77
Recall for model : 0.78


# Make the third submission

In [84]:
get_submission_file(clf, "03_lr.csv")

  


# Feature Normalization and Standardization
Try to provide features on the same scale (0 to 1 or -1 to 1)

In [85]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

## Feature Normalization
Note that the second call is to transform -- not fit transform.  Once the scaler knows how to scale a feature, it has to be consistent on the train and test datasets.

In [87]:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [88]:
X_test_scaled = scaler.transform(X_test)

## Different scaler

In [90]:
scaler = StandardScaler()

In [91]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Create the model after standardization

In [94]:
model_lr = LogisticRegression(random_state=0)
parameters = { 'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1','l2']}
clf = GridSearchCV(model_lr, param_grid=parameters, cv=3)
clf.fit(X_train_scaled, y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 50.0, 100.0, 1000.0], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [95]:
clf.best_score_

0.8132022471910112

In [96]:
score_model(clf)

Accuracy for model : 0.71
Confusion matrix for model:
[[106   4]
 [ 48  21]]
Precision for model : 0.84
Recall for model : 0.30


In [97]:
print("Best score for logistic regression - version 2 is {:.2f}".format(clf.score(X_test_scaled, y_test)))

Best score for logistic regression - version 2 is 0.84


In [98]:
score_model(clf)

Accuracy for model : 0.71
Confusion matrix for model:
[[106   4]
 [ 48  21]]
Precision for model : 0.84
Recall for model : 0.30


## The aforementioned scaling is less important with logistic regression than with other techniques

# Model Persistence

In [100]:
import pickle

In [101]:
model_file_path = os.path.join(os.path.pardir, "models", "lr_model.pkl")
scaler_file_path = os.path.join(os.path.pardir, "models", "lr_scaler.pkl")

In [102]:
model_file_pickle = open(model_file_path, 'wb')
scaler_file_pickle = open(scaler_file_path, 'wb')

In [103]:
pickle.dump(clf, model_file_pickle)
pickle.dump(scaler, scaler_file_pickle)

In [104]:
model_file_pickle.close()
scaler_file_pickle.close()

In [107]:
model_file_pickle = open(model_file_path, 'rb')
scaler_file_pickle = open(scaler_file_path, 'rb')

In [108]:
clf_loaded = pickle.load(model_file_pickle)
scaler_loaded = pickle.load(scaler_file_pickle)

In [109]:
model_file_pickle.close()
scaler_file_pickle.close()

In [110]:
X_test_scaled = scaler_loaded.transform(X_test)

In [111]:
print("Score for logistic regression is {:.2f}".format(clf_loaded.score(X_test_scaled, y_test)))

Score for logistic regression is 0.84
