In [68]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import random

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [69]:
# # Turn off warning messages
import warnings
warnings.filterwarnings("ignore")

In [70]:
# Read in resources
train = pd.read_csv(Path("./Resources/train.csv"))
test = pd.read_csv(Path("./Resources/test.csv"))
test_survived = pd.read_csv(Path("./Resources/gender_submission.csv"))

In [71]:
# add back survived column to test dataset
joined = pd.merge(test, test_survived, how='outer', on="PassengerId")
joined.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1


In [72]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [73]:
# Observe missing values for Age, Cabin, Embarked
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [74]:
# few missing values for Embarked; no significant loss to drop
train = train[train['Embarked'].notna()]
test = joined[joined['Embarked'].notna()]

# str astypes, just in case
for df in [train, joined]:
  df['Ticket'] = df['Ticket'].astype(str)
  df['Embarked'] = df['Embarked'].astype(str)
  df['Name'] = df['Name'].astype(str)
  df['Sex'] = df['Sex'].astype(str)

In [75]:
# fill missing age values
for df in [train, joined]:
    ages = df[df['Age'].notna()]['Age'].to_list()
    # print(np.mean(ages), np.std(ages))
    missing_age_idxs = df[df['Age'].isnull()].index.to_list()
    for idx in missing_age_idxs:
        df['Age'][idx] = random.choice(ages)

    filled = df['Age'].loc[missing_age_idxs].to_list()
    # print(np.mean(filled), np.std(filled))

# print(' ')
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 889 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.6+ KB


In [76]:
# convert SibSp and Parch to two new columns: num_related and has_related
for df in [train, joined]:
    df['num_related'] = df['SibSp'] + df['Parch']
    df['has_related'] = df['num_related'] > 0

# check results
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,num_related,has_related
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,True


In [77]:
# convert Ticket to has_special_ticket; check if the Ticket value has any non-number characters: true if yes, false if no
for df in [train, joined]:
    df['has_special_ticket'] = df['Ticket'].str.extract(r"([A-Za-z])")
    df['has_special_ticket'].fillna(False, inplace=True)
    df['has_special_ticket'] = df['has_special_ticket'].astype(bool)
    df.drop('Ticket', inplace=True, axis=1)

# check results
train['has_special_ticket'].value_counts()

has_special_ticket
False    659
True     230
Name: count, dtype: int64

In [78]:
# just in case, create a separate table with only the rows where cabin is truthy
cabin_train = train[train['Cabin'].notna()]
cabin_test = joined[joined['Cabin'].notna()]

# note that the reduced cabin dataset still has >100 rows,
# meeting the threshold for the assignment
print(len(cabin_train), len(train))

202 889


In [79]:
# break cabin up into letter and number component
for df in [cabin_train, cabin_test]:
    df['cabin_deck'] = df['Cabin'].str.extract(r"([A-Za-z])")
    df['cabin_num'] = df['Cabin'].str.extract(r"([0-9]+)")
    df.drop('Cabin', inplace=True, axis=1)

for df in [train, joined]:
    df.drop('Cabin', inplace=True, axis=1)

In [80]:
# check results
# cabin_train['cabin_num'].value_counts()
cabin_train.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,num_related,has_related,has_special_ticket,cabin_deck,cabin_num
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,1,True,True,C,85
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,1,True,False,C,123
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,51.8625,S,0,False,False,E,46


In [81]:
# drop name (?)
for df in [cabin_train, cabin_test, train, joined]:
    df.drop('Name', inplace=True, axis=1)
    df.drop('PassengerId', inplace=True, axis=1)
train.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,num_related,has_related,has_special_ticket
0,0,3,male,22.0,1,0,7.25,S,1,True,True
1,1,1,female,38.0,1,0,71.2833,C,1,True,True


In [82]:
# create X and y variables for each set using target col
train_X = train.drop('Survived', axis=1)
train_y = train['Survived']

test_X = joined.drop('Survived', axis=1)
test_y = joined['Survived']

cabin_train_X = cabin_train.drop('Survived', axis=1)
cabin_train_y = cabin_train['Survived']

cabin_test_X = cabin_test.drop('Survived', axis=1)
cabin_test_y = cabin_test['Survived']

In [83]:
# use dummies for categorical columns; have to call out some explicitly
# ex: Pclass is a number, but is in fact categorical
dummy_cols = ['Pclass', 'Embarked', 'Sex']
dummy_train = pd.get_dummies(train_X, columns=dummy_cols)
dummy_test = pd.get_dummies(test_X, columns=dummy_cols)

# attempt to treat cabin_deck as categorical
dummy_cols.append('cabin_deck')
dummy_cabin_train = pd.get_dummies(cabin_train_X, columns=dummy_cols)
dummy_cabin_test = pd.get_dummies(cabin_test_X, columns=dummy_cols)

dummy_test.head(3)

Unnamed: 0,Age,SibSp,Parch,Fare,num_related,has_related,has_special_ticket,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,34.5,0,0,7.8292,0,False,False,False,False,True,False,True,False,False,True
1,47.0,1,0,7.0,1,True,False,False,False,True,False,False,True,True,False
2,62.0,0,0,9.6875,0,False,False,False,True,False,False,True,False,False,True


In [84]:
# use scaler to scale numerical data
scaler = StandardScaler()

train_scaler = scaler.fit(dummy_train)
train_scaled = train_scaler.transform(dummy_train)

test_scaler = scaler.fit(dummy_test)
test_scaled = test_scaler.transform(dummy_test)

cabin_train_scaler = scaler.fit(dummy_cabin_train)
cabin_train_scaled = cabin_train_scaler.transform(dummy_cabin_train)

cabin_test_scaler = scaler.fit(dummy_cabin_test)
cabin_test_scaled = cabin_test_scaler.transform(dummy_cabin_test)

In [85]:
# save X and y sets to CSV
names = ['train_scaled', 'test_scaled', 'cabin_train_scaled', 'cabin_test_scaled']

i = 0
for scaled in [train_scaled, test_scaled, cabin_train_scaled, cabin_test_scaled]:
  pd.DataFrame(scaled).to_csv(f"./Resources/{names[i]}_X.csv")
  i += 1

i = 0
for y in [train_y, test_y, cabin_train_y, cabin_test_y]:
  pd.DataFrame(y).to_csv(f"./Resources/{names[i]}_y.csv")
  i += 1

In [86]:
pd.DataFrame(test_scaled).head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,0.332257,-0.49947,-0.400248,-0.497811,-0.553443,-0.807573,-0.641999,-0.586559,-0.534933,0.957826,-0.568142,2.843757,-1.350676,-0.755929,0.755929
1,1.226552,0.616992,-0.400248,-0.51266,0.105643,1.238278,-0.641999,-0.586559,-0.534933,0.957826,-0.568142,-0.351647,0.74037,1.322876,-1.322876
2,2.299706,-0.49947,-0.400248,-0.464532,-0.553443,-0.807573,-0.641999,-0.586559,1.869391,-1.044031,-0.568142,2.843757,-1.350676,-0.755929,0.755929


In [87]:
# merge premade train/test sets to see if we
# can get distinct results with our own split
full_set = pd.concat([train, joined])
full_set = full_set[full_set['Fare'].notna()]
full_set = full_set.sample(frac=1).reset_index(drop=True)
full_set.to_csv("./Resources/Titanic_full_dataset.csv")
full_set.head(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,num_related,has_related,has_special_ticket
0,0,3,male,47.0,0,0,9.0,S,0,False,False
1,0,3,male,26.0,0,0,7.8958,S,0,False,False
2,1,1,male,45.0,0,0,26.55,S,0,False,False


In [88]:
# replicate process from above for this newly split set
X = full_set.drop('Survived', axis=1)
y = full_set['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y)

dummy_cols = ['Pclass', 'Embarked', 'Sex']
dummy_train_full = pd.get_dummies(X_train, columns=dummy_cols)
dummy_test_full = pd.get_dummies(X_test, columns=dummy_cols)

full_train_scaler = scaler.fit(dummy_train_full)
full_train_scaled = full_train_scaler.transform(dummy_train_full)

full_test_scaler = scaler.fit(dummy_test_full)
full_test_scaled = full_test_scaler.transform(dummy_test_full)

# names = ['reset_X_train', 'reset_X_test', 'reset_y_train', 'reset_y_test']
# i = 0
# for data in [full_train_scaled, full_test_scaled, y_train, y_test]:
#   pd.DataFrame(data).to_csv(f"./Resources/{names[i]}.csv")
#   i += 1

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
# create a Logistic Regression Model
lr_classifier = LogisticRegression(random_state = 1)

# fit the Logistic Regression Model
lr_model = lr_classifier.fit(full_train_scaled,y_train)

In [90]:
# Make predictons using the scaled data
lr_training_prediction = lr_model.predict(full_train_scaled)
lr_testing_prediction = lr_classifier.predict(full_test_scaled)

In [91]:
print("Confusion Matrix for training data")
print(confusion_matrix(y_train,lr_training_prediction))

print("Confusion Matrix for testing data")
print(confusion_matrix(y_test,lr_testing_prediction))

Confusion Matrix for training data
[[549  50]
 [ 83 297]]
Confusion Matrix for testing data
[[184  31]
 [ 27  85]]


In [92]:
print(classification_report(y_train,lr_training_prediction))

              precision    recall  f1-score   support

           0       0.87      0.92      0.89       599
           1       0.86      0.78      0.82       380

    accuracy                           0.86       979
   macro avg       0.86      0.85      0.85       979
weighted avg       0.86      0.86      0.86       979



In [93]:
print(classification_report(y_test,lr_testing_prediction))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86       215
           1       0.73      0.76      0.75       112

    accuracy                           0.82       327
   macro avg       0.80      0.81      0.80       327
weighted avg       0.82      0.82      0.82       327



**K Nearest Neighbors Classifier**

In [94]:
from sklearn.neighbors import KNeighborsClassifier
# create a K Nearest Neighbors Classifier Model
knn = KNeighborsClassifier(n_neighbors=5)
knn_model = knn.fit(full_train_scaled,y_train)

In [95]:
# Make predictions
knn_training_prediction = knn_model.predict(full_train_scaled)
knn_testing_prediction = knn.predict(full_test_scaled)

In [96]:
print("Training Classification Report:")
print(classification_report(y_train,knn_training_prediction))

              precision    recall  f1-score   support

           0       0.89      0.92      0.91       599
           1       0.87      0.82      0.85       380

    accuracy                           0.88       979
   macro avg       0.88      0.87      0.88       979
weighted avg       0.88      0.88      0.88       979



In [97]:
print("Testing Classification Report:")
print(classification_report(y_test,knn_testing_prediction))

              precision    recall  f1-score   support

           0       0.87      0.88      0.87       215
           1       0.76      0.74      0.75       112

    accuracy                           0.83       327
   macro avg       0.81      0.81      0.81       327
weighted avg       0.83      0.83      0.83       327



In [97]:
# def ModelEvaluation(model, full_train_scaled, y_train, full_test_scaled, y_test):
#     # Create model
#     if model == "KNeighborsClassifier)":
#       classifier = model(n_neighbors=5)
#     else:
#       classifier = LogisticRegression(random_state=1)


#     # Fit the model
#      model_fit= classifier.fit(full_train_scaled, y_train)

#     # Make predictions
#     training_prediction = model_fit.predict(full_train_scaled)
#     testing_prediction = model_fit.predict(full_test_scaled)

#     # Print classification report
#     print("Training Classification Report:")
#     print(classification_report(y_train, training_prediction))
#     print("Testing Classification Report:")
#     print(classification_report(y_test, testing_prediction))

