# Kaggle Titanic

In [1]:
import pandas as pd
import numpy as np

# Pre-processing

In [2]:
# Read the Kaggle training and test sets from the current working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

# clean() drops PassengerId, so keep the test IDs aside for the submission file.
test_id = test['PassengerId']

In [3]:
# Preview the first rows of the raw training data, before any cleaning.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
def clean(df):
    """Prepare a Titanic passenger frame for modelling.

    Steps:
      1. Drop the PassengerId, Name, Ticket and Cabin columns.
      2. Fill missing Age, SibSp, Parch and Fare values with the median of
         that column, computed on ``df`` itself.
      3. Fill missing Embarked values with the placeholder category 'U'.

    Columns listed in step 1 that are already absent are skipped, so calling
    ``clean`` on an already-cleaned frame (for example when the notebook cell
    is re-run) returns an equal frame instead of raising ``KeyError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the Age, SibSp, Parch, Fare and Embarked columns.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    drop = ['PassengerId', 'Name', 'Ticket', 'Cabin']

    # errors='ignore' makes the drop tolerant of columns that were already
    # removed, which keeps the function idempotent.
    df = df.drop(columns=drop, errors='ignore')

    # Median imputation for the numeric features.
    cols = ['Age', 'SibSp', 'Parch', 'Fare']
    for col in cols:
        df[col] = df[col].fillna(df[col].median())

    # Unknown port of embarkation becomes its own category.
    df['Embarked'] = df['Embarked'].fillna('U')

    return df

# Clean both splits and rebind the names to the cleaned frames.
# NOTE(review): clean() imputes each frame with its own medians, so the test
# set is not filled with the training statistics.
train, test = clean(train), clean(test)

In [5]:
# Check the cleaned training frame: PassengerId, Name, Ticket and Cabin are gone.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
from sklearn import preprocessing

In [7]:
# One encoder object is reused: it is re-fitted for every column, so after the
# loop it only holds the mapping of the last column.
le = preprocessing.LabelEncoder()

categorical = ('Sex', 'Embarked')
for feature in categorical:
    # Learn the label -> integer mapping from the training column only, then
    # apply that same mapping to both frames so their codes agree.
    le.fit(train[feature])
    train[feature] = le.transform(train[feature])
    test[feature] = le.transform(test[feature])

train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


# Training

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [9]:
# Separate the target column from the feature columns.
y = train['Survived']
X = train.drop(columns='Survived')

# Hold out part of the labelled data (train_test_split's default split size);
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [10]:
# Base estimator; random_state=0 fixes the forest's randomness between runs.
rfc = RandomForestClassifier(random_state=0)

# Hyper-parameter grid: 4 x 4 x 2 = 32 candidate settings.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'max_features': ['sqrt', 'log2']
}

# Exhaustive search using GridSearchCV's default cross-validation and scoring,
# fitted on the training split only.
cv = GridSearchCV(estimator=rfc, param_grid=param_grid).fit(X_train, y_train)

print(cv.best_params_)
print(cv.best_score_)

# best_score_ is the cross-validated score of the winning candidate and is
# optimistic because it was used to pick that candidate. Also report the score
# on the held-out split, which the search never saw.
print('held-out score:', cv.score(X_test, y_test))

{'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 50}
0.8263606778139378


In [11]:
# Predict Survived for the cleaned and encoded Kaggle test set with the fitted grid search.
res = cv.predict(test)

In [12]:
# Assemble the submission table: the saved test PassengerIds next to the
# predicted Survived labels.
res_df = test_id.to_frame().assign(Survived=res)
res_df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [13]:
# Write the submission file; index=False leaves the row index out so the CSV
# holds only the PassengerId and Survived columns.
res_df.to_csv('submission.csv', index=False)