In [181]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [182]:
# read the data
X_full = pd.read_csv("/kaggle/input/titanic/train.csv")
X_test = pd.read_csv("/kaggle/input/titanic/test.csv")

# remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['Survived'], inplace=True)
y = X_full.Survived
X_full.drop(['Survived'], axis=1, inplace=True)

# break off validation set from training data
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_full, y, train_size=0.8, test_size=0.2)

In [183]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
284,285,1,"Smith, Mr. Richard William",male,,0,0,113056,26.0,A19,S
546,547,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19.0,1,0,2908,26.0,,S
858,859,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24.0,0,3,2666,19.2583,,C
652,653,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,,S
101,102,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S


In [184]:
# shape of training data
print(X_train.shape)

# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())
print(missing_val_count_by_column[missing_val_count_by_column > 0])

(712, 11)
Age         152
Cabin       555
Embarked      2
dtype: int64


In [185]:
# choose features
features = ['Pclass', 'Sex', 'Age', 'SibSp',
       'Parch', 'Fare'] 
X_train = pd.get_dummies(X_train[features])
X_valid = pd.get_dummies(X_valid[features])
X_test2 =pd.get_dummies(X_test[features])

In [186]:
# Imputation: handle missing values

from sklearn.impute import SimpleImputer

my_imputer = SimpleImputer(strategy='median') 
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))
imputed_X_test = pd.DataFrame(my_imputer.transform(X_test2))

imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns
imputed_X_test.columns = X_test2.columns

ValueError: Length mismatch: Expected axis has 7 elements, new values have 11 elements

In [None]:
from sklearn.ensemble import RandomForestClassifier

# define models
model_1 = RandomForestClassifier(n_estimators=100, max_depth = 5, random_state=0)
model_2 = RandomForestClassifier(n_estimators=100, max_depth = 10, random_state=0)
model_3 = RandomForestClassifier(n_estimators=100, max_depth = 15, random_state=0)
model_4 = RandomForestClassifier(n_estimators=100, max_depth = 20, random_state=0)
model_5 = RandomForestClassifier(n_estimators=100, max_depth=25, random_state=0)

models = [model_1, model_2, model_3, model_4, model_5]

In [None]:
# Function for comparing different models
def score_model(model, X_t, X_v, y_t, y_v):
    model.fit(X_t, y_t)
    preds = model.predict(X_v)
    return sum(y_v == preds)/len(y_v)

In [None]:
for i in range(0, len(models)):
    acc = score_model(models[i], imputed_X_train, imputed_X_valid, y_train, y_valid)
    print("Model %d accuracy: %f" % (i+1, acc))

In [None]:
# so let's choose the model1
model_1.fit(imputed_X_train, y_train)
preds = model_1.predict(imputed_X_test)

In [None]:
output = pd.DataFrame({'PassengerId': X_test.PassengerId, 'Survived': preds})
output.to_csv('my_submission.csv', index=False)
print("My submission was successfully saved!")