# The aim of this project is to analize this data and determine if the person would survive or not to the disaster using machine learning models.
# The competition and dataset is in: https://www.kaggle.com/c/titanic

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import operator
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

In [16]:
train = pd.read_csv("C://Users//johng//Desktop//data//competition//titanic//train.csv", encoding="utf-8")
test = pd.read_csv("C://Users//johng//Desktop//data//competition//titanic//test.csv", encoding="utf-8")

In [12]:
def process_age(df,cut_points,label_names):
    df["Age"] = df["Age"].dropna()
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
label_names = ["Missing", "Infant", "Child", "Teenager", "Young Adult", "Adult", "Senior"]

train = process_age(train, cut_points, label_names)
test = process_age(test,cut_points, label_names)

In [4]:
def create_dummies(df,column_name):
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    df = pd.concat([df,dummies],axis=1)
    return df

train = create_dummies(train,"Pclass")
test = create_dummies(test,"Pclass")

features = ["Sex", "Age_categories"]

for f in features:
    train = create_dummies(train, f)
    test = create_dummies(test, f)

In [5]:
columns = ['Pclass_1', 'Pclass_2', 'Pclass_3', 'Sex_female', 'Sex_male',
       'Age_categories_Missing','Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior']

holdout = test # from now on we will refer to this
               # dataframe as the holdout data

In [6]:
X = train[columns]
y = train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
num_folds = [x for x in range(2, 21)]
for fold in num_folds:
    scores_l = cross_val_score(LogisticRegression(), X_train, y_train, cv=fold)
    scores_svm = cross_val_score(SVC(), X_train, y_train, cv=fold)
    scores_rf = cross_val_score(RandomForestClassifier(n_estimators=fold), X_train, y_train, cv=fold)
    scores_knn = cross_val_score(KNeighborsRegressor(n_neighbors=fold), X_train, y_train, cv=fold)
    
scores_l_mean = np.mean(scores_l)
scores_svm_mean = np.mean(scores_svm)
scores_rf_mean = np.mean(scores_rf)
scores_knn_mean = np.mean(scores_knn)

print("Logistic regression: ", scores_l_mean)
print("SVM: ", scores_svm_mean)
print("Random Forest: ", scores_rf_mean)
print("KNN: ",scores_knn_mean)

Logistic regression:  0.8034920634920635
SVM:  0.801984126984127
Random Forest:  0.8019444444444446
KNN:  0.35497212362268465


In [8]:
lr = LogisticRegression()
lr.fit(X, y)
holdout_predictions = lr.predict(holdout[columns])

# Logistic Regression: accuracy in Kaggle: 0.76315 against my accuracy with K-Fold of 0.80349. This indicates that the model is overfitting (high accuracy with seem data but not high accuracy over new data).

In [9]:
svm = SVC()
svm.fit(X, y)
holdout_predictions = svm.predict(holdout[columns])

# SVM: accuracy in Kaggle - 0.76555 against my accuracy with K-Fold of 0.80198. This indicates that the model is overfitting (high accuracy with seem data but not high accuracy over new data).

In [10]:
rf = RandomForestClassifier()
rf.fit(X, y)
holdout_predictions = rf.predict(holdout[columns])

# Random forest: accuracy in Kaggle - 0.76794 against my accuracy with K-Fold of 0.80619. This indicates that the model is overfitting (high accuracy with seem data but not high accuracy over new data).

# Submission file:

In [11]:
submission_df = {"PassengerId": holdout["PassengerId"],
                 "Survived": holdout_predictions}
submission = pd.DataFrame(submission_df)
submission.to_csv("C://Users//johng//Desktop//data//competition//titanic//submission.csv",index=False)

# ---------------------------------------------------------------------------------------------------------