#### Set up Kaggle and Load Dataset

In [108]:
# Set up code for Kaggle API and authentication

! pip install kaggle
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [22]:
# Download titanic dataset
! kaggle competitions download titanic

titanic.zip: Skipping, found more recently modified local copy (use --force to force download)


In [23]:
# Unzip titanic.zip file into three csv files
!unzip titanic.zip

Archive:  titanic.zip
replace gender_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

#### Preview Data

In [109]:
# Import neeed libraries
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [163]:
# Read csv files as pd.dataframe. 

from sklearn.model_selection import train_test_split

total_data = pd.read_csv("train.csv")
final_test_data = pd.read_csv("test.csv")
gender_data = pd.read_csv("gender_submission.csv")

In [111]:
# Data cleaning to remove NaN values

for column in total_data.columns:
    total_data[column].fillna(total_data[column].mode()[0], inplace=True)

In [112]:
# Affirmm clearance of NaN values

print("Number of NaN values in each column: \n")

for column in total_data.columns: 
  print(column, total_data[column].isnull().sum())

Number of NaN values in each column: 

PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 0
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 0
Embarked 0


In [113]:
# View train_data set
# Based on the below, we will ignore "Cabin" due to many missing values. We will also ignore "Ticket" due to the massive numbers of categorical values

total_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,B96 B98,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,B96 B98,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,B96 B98,S


In [114]:
# This is missing the "Survived" column. The model we produce must predict this column.

final_test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [None]:
# Predictions should be formatted in a dataframe like below.

gender_data.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [None]:
# Data Types

print(total_data.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [115]:
# Cast float64 data types to float32 in order to be compatible with sklearn estimators

total_data[["Age"]] = total_data[["Age"]].astype('float32')
total_data[["Fare"]] = total_data[["Fare"]].astype('float32')

In [116]:
# Convert Categorical Number columns "Pclass", "Sibsp", and "Parch" to strings, so that the get_dummies function will one-hot encode them. 

total_data[["Pclass"]] = total_data[["Pclass"]].astype(str)
total_data[["SibSp"]] = total_data[["SibSp"]].astype(str)
total_data[["Parch"]] = total_data[["Parch"]].astype(str)

In [48]:
# Adjusted Data Types
print(total_data.dtypes)

PassengerId      int64
Survived         int64
Pclass          object
Name            object
Sex             object
Age            float32
SibSp           object
Parch           object
Ticket          object
Fare           float32
Cabin           object
Embarked        object
dtype: object


In [118]:
# Implement One-Hot Encoding on dataset

features = ["Pclass", "Survived", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

total_data = total_data[features]

total_data_dummies = pd.get_dummies(total_data)

In [119]:
total_data_dummies.head()

Unnamed: 0,Survived,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,SibSp_0,SibSp_1,...,Parch_0,Parch_1,Parch_2,Parch_3,Parch_4,Parch_5,Parch_6,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,7.25,0,0,1,0,1,0,1,...,1,0,0,0,0,0,0,0,0,1
1,1,38.0,71.283302,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,1,0,0
2,1,26.0,7.925,0,0,1,1,0,1,0,...,1,0,0,0,0,0,0,0,0,1
3,1,35.0,53.099998,1,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,1
4,0,35.0,8.05,0,0,1,0,1,1,0,...,1,0,0,0,0,0,0,0,0,1


In [121]:
# Check the unique counts of entries in each column

print("Pclass: \n{} \n".format(total_data["Pclass"].value_counts()))
print("SibSp: \n{} \n".format(total_data["SibSp"].value_counts()))
print("Parch: \n{} \n".format(total_data["Parch"].value_counts()))
print("Embarked: \n{} \n".format(total_data["Embarked"].value_counts()))

Pclass: 
3    491
1    216
2    184
Name: Pclass, dtype: int64 

SibSp: 
0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64 

Parch: 
0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64 

Embarked: 
S    646
C    168
Q     77
Name: Embarked, dtype: int64 



In [122]:
print(total_data_dummies.columns)

Index(['Survived', 'Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3',
       'SibSp_4', 'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2',
       'Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [123]:
# Split dataset into training and test sets for one-hot encoded data. 

X_features = ['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3','Sex_female', 'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3','SibSp_4', 'SibSp_5', 
              'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2','Parch_3', 'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q','Embarked_S']
y_label = ["Survived"]

X_dummies = total_data_dummies[X_features]
y_dummies = total_data_dummies[y_label]

train_data, test_data = train_test_split(total_data_dummies, test_size = 0.2, random_state=0)
X_train_dummies = train_data[X_features]
y_train_dummies = train_data[y_label]

X_test_dummies = test_data[X_features]
y_test_dummies = test_data[y_label]

In [124]:
# Shape of each dataframe

print("train_data.shape: {}".format(train_data.shape))
print("test_data.shape: {}".format(test_data.shape))
print("final_test_data.shape: {}".format(final_test_data.shape))
print("gender_data.shape: {}".format(gender_data.shape))

train_data.shape: (712, 25)
test_data.shape: (179, 25)
final_test_data.shape: (418, 11)
gender_data.shape: (418, 2)


In [None]:
# Assess correlation between gender and survival rate. 

women = train_data.loc[train_data.Sex == 'female']["Survived"]
rate_women = sum(women)/len(women)

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [None]:
men = train_data.loc[train_data.Sex == 'male']["Survived"]
rate_men = sum(men)/len(men)

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


#### Random Forest

In [162]:
# Implement simple random forest classifier.

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train_dummies, y_train)
print("Model Accuracy on Training Set: {}".format(forest.score(X_train_dummies, y_train_dummies)))
print("Model Accuracy on Test Set: {}".format(forest.score(X_test_dummies, y_test_dummies)))

  


Model Accuracy on Training Set: 0.9803370786516854
Model Accuracy on Test Set: 0.8379888268156425


In [76]:
# Based on the accuracy results, the RandomForestClassifier seems to be overfitting.
# We will conduct a GridSearchCV to tune the hyperparameters. 

from sklearn.model_selection import GridSearchCV

param_grid = {"n_estimators" : [1000], "max_depth" : [5, 10, 20], "max_features" : [2, 5, 10]}

forest1 = RandomForestClassifier(random_state=0) 
grid_search = GridSearchCV(forest1, param_grid)

grid_search.fit(X_train_dummies, y_train_dummies)
print("Test set score: {:.2f}".format(grid_search.score(X_test_dummies, y_test_dummies)))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

Test set score: 0.83


#### Method 2: Attempt an Support Vector Classifier with preprocessing: MinMaxScaler and PolynomialFeatures

In [91]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

pipe = make_pipeline(MinMaxScaler(), PolynomialFeatures(), SVC())

param_grid = {"polynomialfeatures__degree" : [1,2,3], "svc__C" : [.1, 1, 10, 100, 1000], "svc__gamma" : [0.1, 1, 10, 100]}

grid = GridSearchCV(pipe, param_grid)

grid.fit(X_train_dummies, y_train_dummies)

print("Test set score: {:.2f}".format(grid.score(X_test_dummies, y_test_dummies)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

Test set score: 0.80


  y = column_or_1d(y, warn=True)


In [164]:
id = final_test_data.PassengerId

In [165]:
# The Random Forest Classifier worked slightly better so we will use that model as our final model. 

# Process the final_test_data the same way we processed the train data. 


final_test_data[["Age"]] = final_test_data[["Age"]].astype('float32')
final_test_data[["Fare"]] = final_test_data[["Fare"]].astype('float32')

final_test_data[["Pclass"]] = final_test_data[["Pclass"]].astype(str)
final_test_data[["SibSp"]] = final_test_data[["SibSp"]].astype(str)
final_test_data[["Parch"]] = final_test_data[["Parch"]].astype(str)

features = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare", "Embarked"]

final_test_data = final_test_data[features]

final_test_data_dummies = pd.get_dummies(final_test_data)

print(final_test_data_dummies.dtypes)

print("\nNumber of inf values: {}".format(np.isinf(final_test_data_dummies).values.sum()))

Age           float32
Fare          float32
Pclass_1        uint8
Pclass_2        uint8
Pclass_3        uint8
Sex_female      uint8
Sex_male        uint8
SibSp_0         uint8
SibSp_1         uint8
SibSp_2         uint8
SibSp_3         uint8
SibSp_4         uint8
SibSp_5         uint8
SibSp_8         uint8
Parch_0         uint8
Parch_1         uint8
Parch_2         uint8
Parch_3         uint8
Parch_4         uint8
Parch_5         uint8
Parch_6         uint8
Parch_9         uint8
Embarked_C      uint8
Embarked_Q      uint8
Embarked_S      uint8
dtype: object

Number of inf values: 0


In [156]:
print(X_train_dummies.columns)

Index(['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [157]:
print(final_test_data_dummies.columns)

Index(['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Parch_9', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')


In [166]:
# Remove Parch_9 from final_test_data_dummies

final_test_data_dummies = final_test_data_dummies[['Age', 'Fare', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female',
       'Sex_male', 'SibSp_0', 'SibSp_1', 'SibSp_2', 'SibSp_3', 'SibSp_4',
       'SibSp_5', 'SibSp_8', 'Parch_0', 'Parch_1', 'Parch_2', 'Parch_3',
       'Parch_4', 'Parch_5', 'Parch_6', 'Embarked_C', 'Embarked_Q',
       'Embarked_S']]

In [167]:
# Data cleaning to remove NaN values

for column in final_test_data_dummies.columns:
    final_test_data_dummies[column].fillna(final_test_data_dummies[column].mode()[0], inplace=True)

In [168]:
print(final_test_data_dummies.dtypes)

print("\nNumber of inf values: {}\n".format(np.isinf(final_test_data_dummies).values.sum()))

print("Number of NaN values in each column: \n")

for column in final_test_data_dummies.columns: 
  print(column, final_test_data_dummies[column].isnull().sum())

Age           float32
Fare          float32
Pclass_1        uint8
Pclass_2        uint8
Pclass_3        uint8
Sex_female      uint8
Sex_male        uint8
SibSp_0         uint8
SibSp_1         uint8
SibSp_2         uint8
SibSp_3         uint8
SibSp_4         uint8
SibSp_5         uint8
SibSp_8         uint8
Parch_0         uint8
Parch_1         uint8
Parch_2         uint8
Parch_3         uint8
Parch_4         uint8
Parch_5         uint8
Parch_6         uint8
Embarked_C      uint8
Embarked_Q      uint8
Embarked_S      uint8
dtype: object

Number of inf values: 0

Number of NaN values in each column: 

Age 0
Fare 0
Pclass_1 0
Pclass_2 0
Pclass_3 0
Sex_female 0
Sex_male 0
SibSp_0 0
SibSp_1 0
SibSp_2 0
SibSp_3 0
SibSp_4 0
SibSp_5 0
SibSp_8 0
Parch_0 0
Parch_1 0
Parch_2 0
Parch_3 0
Parch_4 0
Parch_5 0
Parch_6 0
Embarked_C 0
Embarked_Q 0
Embarked_S 0


In [169]:
# Compute predictions

predictions = forest.predict(final_test_data_dummies)

# Output a completed file of predictions.

output = pd.DataFrame({'PassengerId': id, 'Survived': predictions})

output.to_csv('submission.csv', index=False)