# Building Predictive Models

In [1]:
# imports
import pandas as pd
import numpy as np
import os

## Import Data

In [2]:
# locate the processed train/test CSV files, relative to the notebook's parent directory
processed_data_path = os.path.join(os.path.pardir, 'data', 'processed')
train_file_path, test_file_path = (
    os.path.join(processed_data_path, csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [3]:
# read both processed files, using PassengerId as the row index
train_df, test_df = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (train_file_path, test_file_path)
)

In [4]:
# summarize the training frame: column names, non-null counts and dtypes
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 33 columns):
Survived              891 non-null int64
Age                   891 non-null float64
Fare                  891 non-null float64
FamilySize            891 non-null int64
IsMother              891 non-null int64
IsMale                891 non-null int64
Deck_A                891 non-null int64
Deck_B                891 non-null int64
Deck_C                891 non-null int64
Deck_D                891 non-null int64
Deck_E                891 non-null int64
Deck_F                891 non-null int64
Deck_G                891 non-null int64
Deck_Z                891 non-null int64
Pclass_1              891 non-null int64
Pclass_2              891 non-null int64
Pclass_3              891 non-null int64
Title_Lady            891 non-null int64
Title_Master          891 non-null int64
Title_Miss            891 non-null int64
Title_Mr              891 non-null int64
Title_Mrs             891 non-

In [5]:
# summarize the test frame: column names, non-null counts and dtypes
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 32 columns):
Age                   418 non-null float64
Fare                  418 non-null float64
FamilySize            418 non-null int64
IsMother              418 non-null int64
IsMale                418 non-null int64
Deck_A                418 non-null int64
Deck_B                418 non-null int64
Deck_C                418 non-null int64
Deck_D                418 non-null int64
Deck_E                418 non-null int64
Deck_F                418 non-null int64
Deck_G                418 non-null int64
Deck_Z                418 non-null int64
Pclass_1              418 non-null int64
Pclass_2              418 non-null int64
Pclass_3              418 non-null int64
Title_Lady            418 non-null int64
Title_Master          418 non-null int64
Title_Miss            418 non-null int64
Title_Mr              418 non-null int64
Title_Mrs             418 non-null int64
Title_Officer         418 n

## Data Preparation

In [6]:
# Feature matrix: every column from 'Age' onward, as floats.
# DataFrame.as_matrix() is deprecated (removed in pandas 1.0); to_numpy() is
# the supported way to obtain the underlying ndarray.
X = train_df.loc[:, 'Age':].to_numpy().astype('float')
# Target vector as a 1-D array (Series.ravel() is deprecated in recent pandas).
y = train_df['Survived'].to_numpy()

  """Entry point for launching an IPython kernel.


In [7]:
# show the dimensions of the feature matrix and the label vector
print(X.shape, y.shape)

(891, 32) (891,)


In [8]:
# hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# report the shapes of the training split first, then the held-out split
for split_features, split_labels in ((X_train, y_train), (X_test, y_test)):
    print(split_features.shape, split_labels.shape)

(712, 32) (712,)
(179, 32) (179,)


In [9]:
# share of survivors (mean of the 0/1 labels) in each split, rounded to 3 decimals
train_survival_rate = round(np.mean(y_train), 3)
test_survival_rate = round(np.mean(y_test), 3)
print(f'mean survival in train: {train_survival_rate}')
print(f'mean survival in test: {test_survival_rate}')

mean survival in train: 0.383
mean survival in test: 0.385


## Check Scikit-Learn Version

In [10]:
import sklearn

In [11]:
sklearn.__version__

'0.21.3'

## Baseline Model

In [12]:
# import function
from sklearn.dummy import DummyClassifier

In [13]:
# create model: a baseline configured with the 'most_frequent' strategy,
# to serve as the reference point for the metrics reported below
model_dummy = DummyClassifier(strategy='most_frequent', random_state=0)

In [14]:
# train model on the training split only; the held-out split is kept for scoring
model_dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=0, strategy='most_frequent')

In [15]:
# score the baseline on the held-out split and report it to 2 decimals
baseline_score = model_dummy.score(X_test, y_test)
print(f'score for baseline model: {round(baseline_score, 2)}')

score for baseline model: 0.61


In [17]:
# performance metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [18]:
# accuracy of the baseline predictions against the held-out labels
baseline_predictions = model_dummy.predict(X_test)
baseline_accuracy = accuracy_score(y_test, baseline_predictions)
print(f'accuracy for baseline model: {round(baseline_accuracy, 2)}')

accuracy for baseline model: 0.61


In [19]:
# confusion matrix of the baseline: rows are true labels, columns are predicted labels
baseline_confusion = confusion_matrix(y_test, model_dummy.predict(X_test))
print(f'confusion matrix for baseline model: \n {baseline_confusion}')

confusion matrix for baseline model: 
 [[110   0]
 [ 69   0]]


In [22]:
# precision and recall of the baseline, both computed from one set of predictions
baseline_predictions = model_dummy.predict(X_test)
baseline_precision = precision_score(y_test, baseline_predictions)
print(f'precision for baseline model: {round(baseline_precision, 2)}')
baseline_recall = recall_score(y_test, baseline_predictions)
print(f'recall for baseline model: {round(baseline_recall, 2)}')

precision for baseline model: 0.0
recall for baseline model: 0.0


## First Kaggle Submission

In [23]:
# convert the test features to a float matrix
# (DataFrame.as_matrix() is deprecated and removed in pandas 1.0; to_numpy() replaces it)
test_X = test_df.to_numpy().astype('float')

  


In [27]:
# get predictions from the baseline model for every row of the test matrix
predictions = model_dummy.predict(test_X)

In [28]:
# assemble the submission frame: passenger id (the test index) next to the predicted label
submission_columns = {'PassengerId': test_df.index, 'Survived': predictions}
df_submission = pd.DataFrame(submission_columns)

In [29]:
# preview the first rows of the submission frame before writing it out
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [30]:
# destination directory and file for the baseline submission
submission_dir_parts = (os.path.pardir, 'data', 'external')
submission_data_path = os.path.join(*submission_dir_parts)
submission_file_path = os.path.join(submission_data_path, '01_dummy.csv')

In [31]:
# write the submission CSV; index=False keeps the DataFrame row index out of the file
df_submission.to_csv(submission_file_path, index=False)

In [36]:
# create a function to automate this process
def get_submission_file(model, filename, test_data=None, output_dir=None):
    """Predict on the test features and write a two-column submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    filename : str
        Name of the CSV file to create inside the output directory.
    test_data : pandas.DataFrame, optional
        Feature frame whose index holds the passenger ids. When omitted, the
        notebook-level ``test_df`` is used (the original behaviour).
    output_dir : str, optional
        Directory that receives the file. When omitted, ``../data/external``
        is used (the original behaviour). The directory is created if missing.

    Returns
    -------
    None
        The result is the CSV written to disk, with columns ``PassengerId``
        and ``Survived``.
    """
    # fall back to the notebook globals/defaults so existing calls keep working
    if test_data is None:
        test_data = test_df
    if output_dir is None:
        output_dir = os.path.join(os.path.pardir, 'data', 'external')
    # convert to a float matrix; DataFrame.as_matrix() was removed in pandas 1.0
    test_X = test_data.to_numpy().astype('float')
    # make predictions
    predictions = model.predict(test_X)
    # submission dataframe
    df_submission = pd.DataFrame({'PassengerId': test_data.index, 'Survived': predictions})
    # submission file (make sure the target directory exists before writing)
    os.makedirs(output_dir, exist_ok=True)
    submission_file_path = os.path.join(output_dir, filename)
    # write to file
    df_submission.to_csv(submission_file_path, index=False)

In [37]:
# get submission file: write 01_dummy.csv from the baseline model via the helper
get_submission_file(model_dummy, '01_dummy.csv')

  after removing the cwd from sys.path.


In [None]:
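# kaggle submission via the Kaggle command-line API (run from a shell, not as Python):
# kaggle competitions submit -c titanic -f submission.csv -m "Message"
# e.g. pass -f ../data/external/01_dummy.csv to submit the baseline file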

## Next Step