# Titanic Survival Predictions

The goal is to use the training data to predict who would survive the Titanic.

In [38]:
import pandas as pd
import numpy as np
import warnings
# NOTE(review): this filter silences every warning for the rest of the
# notebook, including any raised later while fitting the model. Consider
# narrowing it to the specific warning categories that are known noise.
warnings.filterwarnings('ignore')

# Load the pre-cleaned training and test sets.
train = pd.read_csv('data/train_clean.csv')
test = pd.read_csv('data/test_clean.csv')

# A cell only renders its last bare expression, so a `train.head()` placed
# before `test.head()` is silently discarded. Display both previews explicitly.
display(train.head(), test.head())

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


# Working with categorical data

Sex and Embarked are both categorical, so they need to be converted into numeric indicator columns.  Pandas `get_dummies` should work fine for this data.

In [39]:
# One-hot encode the categorical columns.
#
# Calling get_dummies on train and test independently makes the resulting
# columns (and the baseline level removed by drop_first) depend on which
# levels happen to occur in each frame. To keep both frames aligned, the
# level list for every categorical column is taken from the training data
# and applied to both frames before encoding. A value that appears only in
# the test set becomes NaN and is therefore encoded as all zeros, i.e. it is
# treated like the dropped baseline level.
categorical_cols = train.select_dtypes(include=['object', 'category']).columns
for col in categorical_cols:
    levels = sorted(train[col].dropna().unique())
    train[col] = pd.Categorical(train[col], categories=levels)
    if col in test.columns:
        test[col] = pd.Categorical(test[col], categories=levels)

train = pd.get_dummies(train, drop_first=True)
test = pd.get_dummies(test, drop_first=True)

train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.25,1,0,1
1,1,1,38.0,1,0,71.2833,0,0,0
2,1,3,26.0,0,0,7.925,0,0,1
3,1,1,35.0,1,0,53.1,0,0,1
4,0,3,35.0,0,0,8.05,1,0,1


# Split into X,y

In [40]:
# Every column of the training frame except the target is used as a feature.
X_cols = train.columns.tolist()
del X_cols[X_cols.index('Survived')]

# Target vector and feature matrix for fitting.
y_train = train['Survived']
X_train = train[X_cols]

# Pull the same feature columns, in the same order, out of the test frame.
X_test = test[X_cols]

X_train.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,1,0,1
1,1,38.0,1,0,71.2833,0,0,0
2,3,26.0,0,0,7.925,0,0,1
3,1,35.0,1,0,53.1,0,0,1
4,3,35.0,0,0,8.05,1,0,1


# Model it

Start with a straightforward logistic regression.

In [41]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with a fixed random_state.
clf = LogisticRegression(random_state=0)
clf.fit(X_train, y_train)

# Per-row class probabilities on the training data.
train_probabilities = clf.predict_proba(X_train)
print(train_probabilities)

# Mean accuracy on the same data the model was fit on (training accuracy).
clf.score(X_train, y_train)

[[0.89998909 0.10001091]
 [0.10861979 0.89138021]
 [0.38117303 0.61882697]
 ...
 [0.56175197 0.43824803]
 [0.5086539  0.4913461 ]
 [0.8629035  0.1370965 ]]


0.8024691358024691

# Predict test data

In [42]:
# Predict a class label for every test row and store it as a new column
# on the test frame.
test_predictions = clf.predict(X_test)
test['Survived'] = test_predictions
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S,Survived
0,892,3,34.5,0,0,7.8292,1,1,0,0
1,893,3,47.0,1,0,7.0,0,0,1,0
2,894,2,62.0,0,0,9.6875,1,1,0,0
3,895,3,27.0,0,0,8.6625,1,0,1,0
4,896,3,22.0,1,1,12.2875,0,0,1,1


# Create submission file

In [45]:
# Write only the passenger id and the predicted label, without the index.
submission = test[['PassengerId', 'Survived']]
submission.to_csv('data/titanic_predictions.csv', index=False)