In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.metrics import classification_report

In [2]:
test_data = pd.read_csv('../data/titanic/test.csv')
train_data = pd.read_csv('../data/titanic/train.csv')
gender_sub = pd.read_csv('../data/titanic/gender_submission.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
train_data.Sex.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [6]:
from sklearn.ensemble import RandomForestClassifier

y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])

model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [7]:
# get evaluation metrics of the model
print(classification_report(y, model.predict(X)))


              precision    recall  f1-score   support

           0       0.82      0.90      0.86       549
           1       0.80      0.69      0.74       342

    accuracy                           0.82       891
   macro avg       0.81      0.79      0.80       891
weighted avg       0.82      0.82      0.81       891



## DecisionTreeClassifier

In [8]:
train_data.columns 

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
X = train_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].values

In [10]:
X[:5]

array([[3, 'male', 22.0, 1, 0, 7.25],
       [1, 'female', 38.0, 1, 0, 71.2833],
       [3, 'female', 26.0, 0, 0, 7.925],
       [1, 'female', 35.0, 1, 0, 53.1],
       [3, 'male', 35.0, 0, 0, 8.05]], dtype=object)

In [11]:
X_test = test_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].values
X_test[:5]

array([[3, 'male', 34.5, 0, 0, 7.8292],
       [3, 'female', 47.0, 1, 0, 7.0],
       [2, 'male', 62.0, 0, 0, 9.6875],
       [3, 'male', 27.0, 0, 0, 8.6625],
       [3, 'female', 22.0, 1, 1, 12.2875]], dtype=object)

In [12]:
from sklearn import preprocessing
le_sex = preprocessing.LabelEncoder()
le_sex.fit(['female','male'])
X[:,1] = le_sex.transform(X[:,1]) 

le_sex_test = preprocessing.LabelEncoder()
le_sex_test.fit(['female','male'])
X_test[:,1] = le_sex_test.transform(X_test[:,1]) 


In [13]:
y = train_data["Survived"]
y[0:5]

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [14]:
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth = 4)
model.fit(X, y)


In [15]:
predTree = model.predict(X_test)

In [16]:
print (predTree [0:10])

[0 1 0 0 1 0 0 0 1 0]


In [17]:
from sklearn import metrics
y_test = gender_sub["Survived"]
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_test, predTree))

DecisionTrees's Accuracy:  0.9521531100478469


In [18]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predTree})
output.to_csv('./submissions/submissionTT.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


## Try out other algorithm

In [22]:
test_data = pd.read_csv('../data/titanic/test.csv')
train_data = pd.read_csv('../data/titanic/train.csv')

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
train_data = train_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)
test_data = test_data.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

In [32]:
train_data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [31]:
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)

train_data['Embarked'].fillna(train_data['Embarked'].mode()[0], inplace=True)
test_data['Embarked'].fillna(test_data['Embarked'].mode()[0], inplace=True)

In [39]:
X_train = pd.get_dummies(train_data.drop(['Survived'], axis=1)).values
y_train = train_data['Survived']

X_test = pd.get_dummies(test_data).values

In [40]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('tpot_titanic_pipeline.py')

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.8327914129684265

Generation 2 - Current best internal CV score: 0.8339087314041805

Generation 3 - Current best internal CV score: 0.8361559224154165

Generation 4 - Current best internal CV score: 0.8361810306948717

Generation 5 - Current best internal CV score: 0.8406565815077522

Best pipeline: RandomForestClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.6000000000000001, min_samples_leaf=5, min_samples_split=14, n_estimators=100)


ValueError: Error: Input data is not in a valid format. Please confirm that the input data is scikit-learn compatible. For example, the features must be a 2-D array and target labels must be a 1-D array.