In [1]:
import numpy as np 
import pandas as pd
import os


In [2]:
# Resolve the processed-data directory (one level above the working directory)
# and the train/test CSV paths inside it.
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [9]:
 # Load both processed splits, using the PassengerId column as the index.
 train_df, test_df = (
     pd.read_csv(csv_path, index_col='PassengerId')
     for csv_path in (train_file_path, test_file_path)
 )

In [10]:
# Print column names, non-null counts and dtypes for both frames.
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Survived            891 non-null    int64  
 1   Age                 891 non-null    float64
 2   Fare                891 non-null    float64
 3   FamilySize          891 non-null    int64  
 4   IsMother            891 non-null    int64  
 5   IsMale              891 non-null    int64  
 6   Deck_A              891 non-null    int64  
 7   Deck_B              891 non-null    int64  
 8   Deck_C              891 non-null    int64  
 9   Deck_D              891 non-null    int64  
 10  Deck_E              891 non-null    int64  
 11  Deck_F              891 non-null    int64  
 12  Deck_G              891 non-null    int64  
 13  Deck_Z              891 non-null    int64  
 14  Pclass_1            891 non-null    int64  
 15  Pclass_2            891 non-null    int64  
 16  Pclass_3

In [15]:
# Features: every column from 'Age' onward, cast to float. 'Survived' is the
# column before 'Age' in train_df, so it is excluded from X.
X = train_df.loc[:, 'Age':].astype('float')
# Target as a 1-D NumPy array. Series.ravel() is deprecated (pandas >= 2.2);
# to_numpy() yields the same array without the deprecation warning.
y = train_df['Survived'].to_numpy()

In [17]:
# Sanity check: X and y should have the same number of rows.
print(X.shape,y.shape)

(891, 32) (891,)


In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)
# Show the shapes of the training and held-out portions.
print(X_train.shape,y_train.shape)
print(X_test.shape,y_test.shape)

(712, 32) (712,)
(179, 32) (179,)


In [29]:
# The labels are 0/1, so the mean is the share of survivors; compare the two
# splits to check that the random split kept the class balance similar.
print('Mean survival in Train: {0:.4f}'.format(np.mean(y_train)))
print('Mean survival in Test: {0:.4f}'.format(np.mean(y_test)))

Mean survival in Train: 0.3834
Mean survival in Test: 0.3855


In [31]:
####Base line Model###

import sklearn
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score 

In [32]:
# Baseline: strategy='most_frequent' always predicts the majority class seen in
# y_train, giving a floor that real models must beat.
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)
model_dummy.fit(X_train, y_train)

DummyClassifier(random_state=0, strategy='most_frequent')

In [33]:
# Baseline score on the held-out split.
# NOTE(review): '{0:2f}' sets a minimum field width of 2, not a precision, so
# six decimals are printed; '{0:.2f}' may have been intended.
print('Score for baseline model : {0:2f}'.format(model_dummy.score(X_test, y_test)))

Score for baseline model : 0.614525


In [34]:
# Same figure computed through accuracy_score, as a cross-check of score().
print('Accuracy for baseline model : {0:2f}'.format(accuracy_score(y_test, model_dummy.predict(X_test))))

Accuracy for baseline model : 0.614525


In [36]:
# Rows are the true classes, columns the predicted classes (scikit-learn's
# confusion_matrix convention).
print('Confusion Matrix for baseline model : {0}'.format(confusion_matrix(y_test, model_dummy.predict(X_test))))

Confusion Matrix for baseline model : [[110   0]
 [ 69   0]]


In [37]:
# Predict once and reuse the result for both metrics.
baseline_test_pred = model_dummy.predict(X_test)
print('Recall for baseline model : {0:2f}'.format(recall_score(y_test, baseline_test_pred)))
# The majority-class baseline predicts no positives, so precision is undefined
# (0/0). zero_division=0 makes the reported 0.0 explicit instead of relying on
# the warning-emitting default.
print('Precision for baseline model : {0:2f}'.format(precision_score(y_test, baseline_test_pred, zero_division=0)))

Recall for baseline model : 0.000000
Precision for baseline model : 0.000000


In [41]:
####Kaggle Submission####

# Predict on the Kaggle test frame with the baseline model.
test_X = test_df.astype('float')
predictions = model_dummy.predict(test_X)

# One row per passenger: the index of test_df supplies the PassengerId column.
df_submission = pd.DataFrame({'PassengerId' : test_df.index, 'Survived' : predictions})

# Preview the first rows of the submission frame.
df_submission.head()


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [51]:
def get_submission_file(model, filename, test_data=None, output_dir=None):
    """Predict with ``model`` and write the result as a Kaggle submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside ``output_dir``.
    test_data : pandas.DataFrame, optional
        Feature frame whose index holds the passenger ids. Defaults to the
        notebook-level ``test_df``.
    output_dir : str, optional
        Directory to write into. Defaults to ``<pardir>/data/external``. The
        directory is created if it does not exist.

    Returns
    -------
    None
        The function's effect is the file written to disk. That file has two
        columns, ``PassengerId`` and ``Survived``, and no index column.
    """
    if test_data is None:
        test_data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')

    test_X = test_data.astype('float')
    # Use the model that was passed in. The global model_dummy used to be
    # hard-coded here, so every submission held the baseline predictions.
    predictions = model.predict(test_X)

    df_submission = pd.DataFrame({'PassengerId': test_data.index, 'Survived': predictions})

    os.makedirs(output_dir, exist_ok=True)
    # Honour the requested file name instead of a hard-coded '01_dummy.csv'.
    submission_file_path = os.path.join(output_dir, filename)
    df_submission.to_csv(submission_file_path, index=False)

In [52]:
# Export the baseline model's predictions as '01_dummy.csv'.
get_submission_file(model_dummy,'01_dummy.csv')

In [54]:
#####LOGISTIC REGRESSION####


from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyper-parameters; random_state is fixed for
# reproducibility.
model_lr_1 = LogisticRegression(random_state=0)

model_lr_1.fit(X_train, y_train)

# Label corrected: this is the logistic-regression score, not the baseline's.
# The number format is unchanged ('{0:2f}' is a width, so six decimals print).
print('Score for Log Reg model : {0:2f}'.format(model_lr_1.score(X_test, y_test)))

Score for baseline model : 0.832402


In [56]:
# Predict once on the held-out split and reuse the result for every metric
# instead of calling predict() four times.
lr_test_pred = model_lr_1.predict(X_test)
print('Accuracy for Log Reg model : {0:2f}'.format(accuracy_score(y_test, lr_test_pred)))
# Rows are the true classes, columns the predicted classes.
print('Confusion Matrix for Log Reg model : {0}'.format(confusion_matrix(y_test, lr_test_pred)))
print('Recall for Log Reg model : {0:2f}'.format(recall_score(y_test, lr_test_pred)))
print('Precision for Log Reg model : {0:2f}'.format(precision_score(y_test, lr_test_pred)))

Accuracy for Log Reg model : 0.832402
Confusion Matrix for Log Reg model : [[95 15]
 [15 54]]
Recall for Log Reg model : 0.782609
Precision for Log Reg model : 0.782609


In [57]:
# Learned weights: a single row with one coefficient per feature column of X,
# in the same column order.
model_lr_1.coef_

array([[-0.02957805,  0.00414578, -0.48810285,  0.55878437, -0.76107084,
         0.12133852, -0.07416643, -0.35964014,  0.47360842,  1.01792804,
         0.26480128, -0.04875499, -0.36231604,  0.90661927,  0.4916551 ,
        -0.3654757 ,  0.10805048,  1.03871313,  0.52132587, -1.49058009,
         1.16449316, -0.10040023, -0.20880365,  0.11932276,  0.22344492,
         0.26960465,  0.42042634,  0.44317566,  0.47305624,  0.11656676,
         0.33832147,  0.6944772 ]])

In [None]:
# Intended to export model_lr_1's predictions as '02_LR.csv'.
# NOTE(review): this only holds if get_submission_file uses both of its
# arguments; verify the written file name and contents after running.
get_submission_file(model_lr_1,'02_LR.csv')