In [103]:
import pandas as pd
import os
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.naive_bayes import GaussianNB

Loading in training dataset

In [104]:
# Read the raw training set from the working directory and preview it.
train_data = pd.read_csv(filepath_or_buffer='train.csv')
# Bare last expression so the notebook renders the first five rows as a table.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Dropping the following columns: PassengerId, Name, Ticket, Cabin, Embarked, and SibSp. PassengerId is only a row identifier, and the name and ticket number should not have any bearing on whether or not a person survived; any correlation would be circumstantial at best. As for Cabin and Embarked, any correlation to survival rate would be covered by other columns (for example, nicer cabins would have the same correlation as the fare or passenger class). SibSp is also dropped in the code below.

Also, we are filling missing values in the Age column with the mean age (no rows are dropped, as the unchanged shape below shows), and then rounding ages to whole numbers.

In [105]:
# Remove the columns that are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell re-runnable: a second execution no longer raises KeyError for columns
# that are already gone.
train_data = train_data.drop(
    columns=['PassengerId','Name','Ticket','Cabin','Embarked','SibSp'],
    errors='ignore',
)
print(train_data.shape)
# Impute Age only, using its own mean. A frame-wide fillna(mean_age) would also
# write the mean age into any other column that happens to contain NaN.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
print(train_data.shape)
# Round to whole years and store as integers.
train_data['Age'] = train_data['Age'].round().astype(int)
print(train_data.head())

(891, 6)
(891, 6)
   Survived  Pclass     Sex  Age  Parch     Fare
0         0       3    male   22      0   7.2500
1         1       1  female   38      0  71.2833
2         1       3  female   26      0   7.9250
3         1       1  female   35      0  53.1000
4         0       3    male   35      0   8.0500


Now that we have removed excess columns and done some reformatting, it is time to start feature engineering. Let's turn the Sex column into an integer column, with 1 for Male and 2 for Female.

In [106]:
# Encode Sex numerically (male -> 1, female -> 2) so the classifiers can use it.
sex_encoded = train_data['Sex'].replace(to_replace={'male': 1, 'female': 2})
train_data['Sex'] = sex_encoded.astype(int)
print(train_data.head())

   Survived  Pclass  Sex  Age  Parch     Fare
0         0       3    1   22      0   7.2500
1         1       1    2   38      0  71.2833
2         1       3    2   26      0   7.9250
3         1       1    2   35      0  53.1000
4         0       3    1   35      0   8.0500


In [107]:
# Input columns for the models; the same list is reused later to select the
# matching columns from the test set.
train_features = ['Pclass','Sex','Age','Parch','Fare']
X = train_data.loc[:, train_features]
# Target label: 1 = survived, 0 = did not survive.
Y = train_data.loc[:, 'Survived']

With our features and label split, we can start building the model.

In [108]:
# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# The fixed seed makes the split repeatable across runs.
x_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

In [109]:
# Create the Decision Tree classifier.
# random_state is pinned (same seed value as the train/test split) because
# scikit-learn shuffles the candidate features at every split; without a seed,
# ties between equally good splits can be broken differently on each run, so
# the fitted tree and its accuracy would not be reproducible.
clf = DecisionTreeClassifier(random_state=1)

# Train the classifier on the 70% training split.
clf = clf.fit(x_train, y_train)

Let's see how well it predicts survival on the held-out 30% of the training data.

In [110]:
# Score the held-out split with the fitted decision tree.
y_pred = clf.predict(X_test)
# Sanity check: one prediction per held-out row.
print(y_pred.shape[0])

268


In [111]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.7425373134328358


The model reached roughly 74% accuracy on the held-out validation split in the run shown above, so let's use it to predict on the actual test set (test.csv), for which no labels are available.

In [112]:
# Load the unlabeled test set and apply the same preprocessing steps as the
# training data.
test_data = pd.read_csv('test.csv')
test_data = test_data.drop(columns=['PassengerId','Name','Ticket','Cabin','Embarked'])
# Impute each column with its OWN statistic. The previous frame-wide
# fillna(mean_age) would have written the mean age into every remaining column
# with a missing value, including the Fare feature.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].median())
print(test_data.shape)
# Round ages to whole years and store as integers.
test_data['Age'] = test_data['Age'].round().astype(int)
# Same numeric encoding as the training data: male -> 1, female -> 2.
test_data['Sex'] = test_data['Sex'].replace({'male': 1, 'female': 2}).astype(int)
# Keep exactly the columns (and column order) the models were trained on.
x_test = test_data[train_features]
print(x_test.head())

(418, 6)
   Pclass  Sex  Age  Parch     Fare
0       3    1   34      0   7.8292
1       3    2   47      0   7.0000
2       2    1   62      0   9.6875
3       3    1   27      0   8.6625
4       3    2   22      1  12.2875


In [113]:
# Run the fitted decision tree on the preprocessed test.csv features.
y_pred = clf.predict(x_test)
# Sanity check: one prediction per test row.
print(y_pred.size)

418


In [114]:
# Reload the raw test file: PassengerId was dropped from the preprocessed
# frame, but the submission needs it next to each prediction.
test_data = pd.read_csv('test.csv')
# Pair every passenger id with the decision tree's predicted label.
final = test_data[['PassengerId']].assign(Survived=y_pred)
print(final.head())
# Write the submission without the DataFrame index column.
final.to_csv('DecisionTreeModelPredictions.csv', index=False)

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1


Let's try a Naive Bayes Model this time

In [115]:
# Build a Gaussian Naive Bayes classifier and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = GaussianNB().fit(x_train, y_train)

# Predict labels for the held-out split and show the raw 0/1 output.
y_pred = model.predict(X_test)
print(y_pred)

[1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 0
 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 0 1 0
 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 1 0 1 1 0 1 0 0
 1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 1 0 0 1
 0 0 1 0 1 0 0 1 1 1 1 0 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0
 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 0
 1 1 0 1 1 0 0 1 0]


In [116]:
# Fraction of held-out rows the Naive Bayes model labelled correctly.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.753731343283582


In [119]:
# Run the fitted Naive Bayes model on the preprocessed test.csv features
# and show the predicted 0/1 labels.
y_pred = model.predict(x_test)
print(y_pred)

[0 1 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 1
 1 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 1 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 1 1 0 1 1 0 1
 0 1 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 1 0 0 0 0 1 1 0 1 1 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 0 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 0 1 1 1 0 0 1 0 0 0 1 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0
 1 0 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 1 1 1 1 1 0 1 0 0 0]


In [120]:
# Pair every passenger id with the Naive Bayes prediction and write the
# submission file without the DataFrame index column.
finalNaive = pd.DataFrame(
    data={
        'PassengerId': test_data['PassengerId'],
        'Survived': y_pred,
    }
)
finalNaive.to_csv('NaiveModelPredictions.csv', index=False)