# Titanic Data Science Project
- https://www.kaggle.com/c/titanic/

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from os import path

# Both CSV files live under res/titanic/; build their paths in one pass.
train_path, test_path = (
    path.join('res', 'titanic', csv_name)
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Load the training split from train_path and preview five randomly sampled rows.
# NOTE(review): `display` is not imported in the visible code; presumably the
# IPython/Jupyter built-in — confirm if this is ever run as a plain script.
train_data = pd.read_csv(train_path)
display(train_data.sample(5))

In [None]:
# Load the test split from test_path and preview five randomly sampled rows.
test_data = pd.read_csv(test_path)
display(test_data.sample(5))

In [None]:
# (rows, columns) of each split, one per line.
print(train_data.shape, test_data.shape, sep='\n')

In [None]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns, not just the numeric ones.
display(train_data.describe(include='all'))

In [None]:
# Column dtypes and non-null counts. DataFrame.info() prints its report itself
# and returns None, so wrapping it in print() only added a stray "None" line.
train_data.info()

In [None]:
# Bar height is the mean of Survived for each Embarked value, split by Sex.
sns.barplot(x='Embarked', y='Survived', hue='Sex', data=train_data)
plt.show()

In [None]:
# Point estimate (mean) of Survived for each Pclass value, one line per Sex.
sns.pointplot(x='Pclass', y='Survived', hue='Sex', data=train_data)
plt.show()

In [None]:
# Sorted list of the distinct Age values. np.sort is used because it places
# NaN (missing ages) at the end, whereas list.sort() has no defined ordering
# once a NaN is among the values being compared.
so = np.sort(train_data.Age.unique()).tolist()
print(so)

In [None]:
# Histogram of the raw (still numeric) Age column.
train_data.hist(column='Age')
plt.show()

In [None]:
def cat_age(data):
    """Replace the numeric ``Age`` column of *data* with labelled age bands.

    The frame is modified in place and nothing is returned. Missing ages are
    first filled with -0.5 so that they fall into the ``(-1, 0]`` bin, which
    is labelled ``'Unknown'``; the remaining bins are right-inclusive:
    (0, 12] Child, (12, 18] Teenager, (18, 30] Young Adult, (30, 60] Adult,
    (60, 120] Elderly.
    """
    age_edges = (-1, 0, 12, 18, 30, 60, 120)
    age_labels = ('Unknown', 'Child', 'Teenager', 'Young Adult', 'Adult', 'Elderly')
    ages_with_sentinel = data['Age'].fillna(-0.5)
    data['Age'] = pd.cut(ages_with_sentinel, age_edges, labels=age_labels)

In [None]:
# Bucket Age into labelled bands in both frames (cat_age mutates its argument).
cat_age(train_data)
cat_age(test_data)

# Spot-check ten random rows of the converted column.
train_data['Age'].sample(10)

In [None]:
# Number of rows per raw Cabin value (before it is reduced to a single letter).
sns.countplot(x='Cabin', data=train_data)
plt.show()

In [None]:
# Look at ten random raw Cabin values.
train_data['Cabin'].sample(10)

In [None]:
def extract_cabin(data):
    """Reduce every ``Cabin`` value in *data* to its first character, in place.

    Missing cabins are replaced by ``'N'`` before the first character is
    taken, so they end up as the letter ``'N'``. Returns None.
    """
    cabins = data['Cabin'].fillna('N')
    data['Cabin'] = cabins.map(lambda cabin: cabin[0])

In [None]:
# Collapse Cabin to its leading letter in both frames (in place).
extract_cabin(train_data)
extract_cabin(test_data)

# Spot-check ten random rows of the result.
train_data['Cabin'].sample(10)

In [None]:
# Look at ten random raw Fare values.
train_data['Fare'].sample(10)

In [None]:
def cat_fare(data, bins=None):
    """Replace the numeric ``Fare`` column of *data* with five labelled bands.

    The frame is modified in place. Missing fares are treated as 0.0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``Fare`` column.
    bins : sequence of 6 ascending numbers, optional
        Band edges to use. When omitted, the edges are the quintiles of
        ``data['Fare']`` itself (the original behaviour). Pass the edges
        returned for the training frame when converting another frame, so
        that '1st'..'5th' denote the same fare ranges in both; otherwise each
        frame gets its own, different, quintile boundaries.

    Returns
    -------
    numpy.ndarray or the *bins* argument
        The band edges that define the labels, for reuse on other frames.

    Raises
    ------
    ValueError
        Propagated from pandas when the quintile edges are not unique or
        when *bins* does not have exactly six increasing edges.
    """
    cat_names = ['1st', '2nd', '3rd', '4th', '5th']
    fare = data['Fare'].fillna(0.0)

    if bins is None:
        banded, bins = pd.qcut(fare, 5, labels=cat_names, retbins=True)
        data['Fare'] = banded
        return bins

    # Open the outermost edges so that fares below/above the range the edges
    # were computed from still land in the first/last band instead of NaN.
    edges = np.array(bins, dtype=float)
    edges[0], edges[-1] = -np.inf, np.inf
    data['Fare'] = pd.cut(fare, edges, labels=cat_names)
    return bins

In [None]:
# Convert Fare to five labelled bands in both frames (in place).
# NOTE(review): each call derives its quantile edges from the frame it is
# given, so the train and test bands are not guaranteed to share boundaries.
cat_fare(train_data)
cat_fare(test_data)

# Spot-check ten random rows of the result.
train_data['Fare'].sample(10)

In [None]:
# Look at ten random Name values to see how they are formatted.
train_data['Name'].sample(10)

In [None]:
def extract_title(data):
    """Add a ``Title`` column to *data*, derived from ``Name``, in place.

    Names are expected to look like ``"Surname, Title. Given names"``. The
    title is the text between the first comma and the first period after it,
    returned with its trailing period (e.g. ``'Mr.'``, ``'the Countess.'``).
    Parsing on the comma rather than on the second space-separated word keeps
    multi-word surnames ("Vander Planke, Mrs. ...") and multi-word titles
    from producing a fragment of the surname or of the title.

    Names that do not follow that pattern fall back to the second
    space-separated word (the previous behaviour), or ``'Unknown'`` when the
    name has only one word. Returns None.
    """
    def title_of(name):
        _, comma, after_surname = name.partition(',')
        title, period, _ = after_surname.partition('.')
        title = title.strip()
        if comma and period and title:
            return title + '.'
        # Not in "Surname, Title. Rest" form: keep the legacy guess.
        words = name.split(' ')
        return words[1] if len(words) > 1 else 'Unknown'

    data['Title'] = data['Name'].apply(title_of)

In [None]:
# Add the Title column (derived from Name) to both frames, in place.
extract_title(train_data)
extract_title(test_data)

# Spot-check ten random rows of the new column.
train_data['Title'].sample(10)

In [None]:
# Preview five random rows after the feature transformations above.
train_data.sample(5)

In [None]:
def drop_columns(data, columns):
    """Remove the column label(s) *columns* from *data* in place; returns None."""
    data.drop(columns=columns, inplace=True)

In [None]:
# Columns removed from both frames before modelling. Name has already been
# used to derive Title; Ticket and Embarked are simply discarded.
columns_to_drop = ['Name', 'Ticket', 'Embarked']

drop_columns(train_data, columns_to_drop)
drop_columns(test_data,columns_to_drop)

# Preview five random rows of the reduced training frame.
train_data.sample(5)

In [None]:
# Mean of Survived per age band, split by Sex. plt.show() is called as in the
# earlier plotting cells so the figure is rendered without the Axes repr.
sns.barplot(x='Age', y='Survived', hue='Sex', data=train_data)
plt.show()

In [None]:
# Mean of Survived per fare band, split by Sex. plt.show() is called as in the
# earlier plotting cells so the figure is rendered without the Axes repr.
sns.barplot(x='Fare', y='Survived', hue='Sex', data=train_data)
plt.show()

In [None]:
# First five rows before the categorical columns are label-encoded.
train_data.head(5)

In [None]:
from sklearn import preprocessing

In [None]:
def encode_features(df_train, df_test, features):
    """Label-encode the *features* columns of both frames, in place.

    For each column a ``LabelEncoder`` is fitted on the values of the train
    and test frames stacked together, so a given value receives the same
    integer code in both frames and every value present in either frame is
    known to the encoder. Returns None.
    """
    stacked = pd.concat([df_train[features], df_test[features]])

    for column in features:
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        # Train first, then test: same order as the frames were passed in.
        for frame in (df_train, df_test):
            frame[column] = encoder.transform(frame[column])

In [None]:
# Columns to turn into integer codes (in place, in both frames).
features = ['Sex', 'Age', 'Fare', 'Cabin', 'Title']
encode_features(train_data, test_data, features)
# First five rows after encoding.
train_data.head(5)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every remaining column except the target; note that this still
# includes PassengerId (a later cell repeats the experiment without it).
X = train_data.drop('Survived', axis=1)
Y = train_data['Survived']

# Hold out 15% of the rows for validation.
validation_size = 0.15
# The seed itself is drawn at random, so the split differs from run to run.
seed = np.random.randint(1000)
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Fit a decision tree with default hyper-parameters on the training part.
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Predict the held-out rows and report the fraction classified correctly.
prediction = model.predict(X_validation)

print(prediction)
# Arguments follow the documented (y_true, y_pred) order, matching the
# confusion_matrix / classification_report calls in the cells below.
print(accuracy_score(Y_validation, prediction))

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the validation predictions (true labels first, then predicted).
print(confusion_matrix(Y_validation, prediction))

In [None]:
from sklearn.metrics import classification_report

# Per-class text report for the validation predictions (true labels first, then predicted).
print(classification_report(Y_validation, prediction))

In [None]:
# Re-inspect the first five rows of the encoded training frame.
train_data.head(5)

In [None]:
# Same experiment as before, but PassengerId is left out of the features
# (presumably because it is only a row identifier — confirm).
X = train_data.drop(['Survived', 'PassengerId'], axis=1)
Y = train_data['Survived']

# Hold out 15% of the rows; the seed is random, so each run splits differently.
validation_size = 0.15
seed = np.random.randint(1000)
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

prediction = model.predict(X_validation)

print(prediction)
# accuracy_score is documented as (y_true, y_pred); keep that order so it
# matches the other metric calls in this notebook.
print(accuracy_score(Y_validation, prediction))

In [None]:
# Repeat the hold-out evaluation on n_iters different random splits and
# average the accuracies, to smooth out the variance of a single split.
# Uses X, Y and validation_size as defined in the previous cell.
n_iters = 10
results = []
for i in range(n_iters):
    # A fresh random seed per iteration gives a different split each time.
    seed = np.random.randint(1000)
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, test_size=validation_size, random_state=seed)

    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    prediction = model.predict(X_validation)
    # Documented argument order is (y_true, y_pred).
    results.append(accuracy_score(Y_validation, prediction))

print(results)
print(sum(results) / len(results))