In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the raw CSV files: the test split first, then the training split.
test_data = pd.read_csv(r'./data/raw/test.csv')
train_data = pd.read_csv(r'./data/raw/train.csv')

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Print the dtype of every training column as plain text.
print(str(train_data.dtypes))

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [5]:
# Impute missing ages with the mean age of the training set. The same value is
# used for the test set so that both frames are preprocessed identically and
# no statistic is derived from the test data.
train_age_mean = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(train_age_mean)
test_data['Age'] = test_data['Age'].fillna(train_age_mean)

In [6]:
def ticket_cleaner(column, sentinel=99999):
    """Extract the trailing number of each ticket value as an ``int``.

    A string value is split on spaces and its last piece is converted with
    ``int``; a string without spaces is converted as a whole. Non-string
    values (for example a missing ``NaN`` or ``None``) are passed to ``int``
    directly instead of raising on the space check. Every value that cannot
    be converted is reported on stdout and replaced by ``sentinel``.

    Parameters
    ----------
    column : iterable
        Ticket values, e.g. a pandas Series of ticket strings.
    sentinel : int, optional
        Placeholder stored for values with no usable number (default 99999).

    Returns
    -------
    list
        One entry per input value, in input order: the parsed ``int`` or
        ``sentinel``.
    """
    cleaned = []
    for raw in column:
        # Keep only the text after the last space; non-strings are left as-is
        # so that the conversion below decides whether they are usable.
        new_val = raw.rsplit(' ', 1)[-1] if isinstance(raw, str) else raw
        try:
            cleaned.append(int(new_val))
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are handled here; anything else
            # (e.g. KeyboardInterrupt) propagates to the caller.
            print(f'{new_val} dirived from {raw} cannot be converted to int. Appending {sentinel}')
            cleaned.append(sentinel)
    print('\n')
    return cleaned

In [7]:
# Add the parsed ticket number as a new column, training frame first and
# then the test frame.
for frame in (train_data, test_data):
    frame['ticket_cleaned'] = ticket_cleaner(frame['Ticket'])

LINE dirived from LINE cannot be converted to int. Appending 99999
LINE dirived from LINE cannot be converted to int. Appending 99999
LINE dirived from LINE cannot be converted to int. Appending 99999
LINE dirived from LINE cannot be converted to int. Appending 99999






In [8]:
print(f'Staring test_data shape{test_data.shape}, Starting train_data shape {train_data.shape}')
# Drop training rows whose ticket could not be parsed (marked with the 99999
# sentinel written by ticket_cleaner).
train_data = train_data.loc[train_data['ticket_cleaned'] != 99999]
# test_data is intentionally NOT filtered: the submission is built from
# test_data['PassengerId'], so removing rows here would silently remove those
# passengers from the submission file.
print(f'End test_data shape{test_data.shape}, End train_data shape {train_data.shape}')

Staring test_data shape(418, 12), Starting train_data shape (891, 13)
End test_data shape(418, 12), End train_data shape (887, 13)


In [9]:
# Target vector: the survival label of every training row.
y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch", "Embarked"]
X = pd.get_dummies(train_data[features])
# The test frame is one-hot encoded on its own, so a category that appears in
# only one of the two frames would give different column sets. Reindexing to
# the training columns adds any missing dummy column (filled with 0), drops
# any extra one, and keeps the column order the model is fitted on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Numeric features are appended unchanged, in the same order for both frames.
X['Age'], X_test['Age'] = train_data.Age, test_data.Age
X['Ticket'], X_test['Ticket'] = train_data.ticket_cleaned, test_data.ticket_cleaned

In [10]:
# Fit a random forest on the training features and predict the test set.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=100,
    random_state=1,
).fit(X, y)
predictions = model.predict(X_test)

# NOTE: this scores the model on the same X, y it was fitted on, so the
# number is training accuracy, not an estimate of held-out performance.
model.score(X, y)

0.9988726042841037

In [11]:
# Pair every test PassengerId with its predicted label, write the result to
# CSV without the index, and show the first rows.
submission = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
submission.to_csv(r'titanic_submission_random_forest.csv', index=False)
print(submission.head(n=5))

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         1
4          896         1
