# Importing necessary libraries (numpy, matplotlib, pandas, sklearn)

In [601]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline
matplotlib.style.use('ggplot')

# Loading the data

In [597]:
# Read the training and test sets from CSV files in the working directory
train_path, test_path = './train.csv', './test.csv'
data_train = pd.read_csv(train_path)
data_test = pd.read_csv(test_path)

## Cleaning the data

In [598]:
# Target vector: the label column the model is trained to predict.
Y_train = data_train["Survived"]
# Training features: every column except the label.
X_train = data_train.drop(labels="Survived", axis=1)
# Test features: an independent copy without the passenger identifier.
X_test = data_test.drop(labels="PassengerId", axis=1).copy()


# Function for splitting ages into 4 categories (child, young, old, very_old)
def split_ages(age):
    """Map a numeric age to an ordinal age-group code.

    Returns 0 (child) for age < 18, 1 (young) for 18 <= age < 32,
    2 (old) for 32 <= age < 56 and 3 (very_old) for age >= 56.
    A value for which every comparison is false (e.g. NaN) yields None.
    """
    if age < 18:
        return 0  # child
    if age < 32:
        return 1  # young
    if age < 56:
        return 2  # old
    if age >= 56:
        return 3  # very_old
    # Nothing matched (e.g. NaN): same result as the implicit fall-through.
    return None


# Removing useless columns and doing some useful transformations
def _clean_features(features, drop_columns, age_fill):
    """Return a cleaned copy of a raw passenger feature frame.

    Parameters
    ----------
    features : pandas.DataFrame
        Raw features; must contain the 'Sex', 'Age' and 'Embarked' columns
        as well as every column named in ``drop_columns``.
    drop_columns : list of str
        Columns removed from the result.
    age_fill : int
        Age substituted for missing ages before they are bucketed.

    Returns
    -------
    pandas.DataFrame
        New frame with 'Sex' encoded as 1 (female) / 0 (male), 'Age' replaced
        by its ``split_ages`` category, and one dummy column per 'Embarked'
        value appended on the right. ``features`` itself is not modified.
    """
    # Dummies are built before 'Embarked' is dropped from the frame.
    embarked_dummies = pd.get_dummies(features["Embarked"])
    # axis is passed by keyword: recent pandas rejects a positional axis.
    cleaned = features.drop(drop_columns, axis=1)
    cleaned['Sex'] = cleaned['Sex'].map({'female': 1, 'male': 0}).astype(int)
    cleaned["Age"] = cleaned["Age"].fillna(age_fill).map(split_ages)
    return pd.concat([cleaned, embarked_dummies], axis=1)


# The fill value comes from the raw training ages and is reused for the test
# set, so a missing age lands in the same category in both sets (filling the
# test ages with 0 would have classified every missing age as a child).
age_fill = int(X_train["Age"].mean())

X_train = _clean_features(
    X_train, ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'], age_fill
)

# PassengerId is not listed here because X_test no longer has it at this point.
X_test = _clean_features(X_test, ['Name', 'Ticket', 'Cabin', 'Embarked'], age_fill)
X_test = X_test.fillna(0)  # any other missing value keeps the original 0 fallback
# Give the test frame exactly the training columns, in the same order; a dummy
# column absent from the test data is added as all zeros.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

## Creating the model and fitting it

In [599]:
# Candidate estimators that were tried; uncomment one (and comment out the
# active line) to swap the model.
#classifier = KNeighborsClassifier(n_neighbors = 7)
#classifier = LogisticRegression()
#classifier = tree.DecisionTreeClassifier()
# Active model: a random forest built from 100 trees.
# NOTE(review): no random_state is passed, so results may vary between runs —
# confirm whether a fixed seed is wanted for reproducibility.
classifier = RandomForestClassifier(n_estimators=100)

In [600]:
# Train the selected classifier on the training features and their labels.
classifier.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

## Predicting survival for the test set

In [594]:
# Predict one label per row of the test features.
# The classifier must already be fitted, so run the fitting cell first.
Y_pred = classifier.predict(X_test)

## Measuring the accuracy of the model

In [595]:
# Estimate accuracy with 30-fold cross-validation on the training data.
# (classifier.score(X_train, Y_train) would instead score the model on the
# very rows it was fitted on.)
score = cross_val_score(classifier, X_train, Y_train, cv=30)
mean_accuracy = score.mean()
spread = score.std() * 2  # two standard deviations across the folds
print("Accuracy: %0.2f (+/- %0.2f)" % (mean_accuracy, spread))

Accuracy: 0.82 (+/- 0.12)


## Saving results into a file

In [584]:
# Write the results file: a header line followed by one
# "PassengerId,Survived" row per prediction.
# The with-block closes the file even if a write fails part-way through.
with open("output.csv", 'w') as output:
    output.write("PassengerId,Survived\n")
    # Ids and predictions are paired by position, so the loop does not depend
    # on data_test having a default 0..n-1 index.
    for passenger_id, survived in zip(data_test["PassengerId"], Y_pred):
        output.write("{},{}\n".format(passenger_id, survived))