<a href="https://colab.research.google.com/github/krymsza/Titanic-Machine-Learning-from-Disaster/blob/main/titanic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Titanic - Machine Learning from Disaster**

[Kaggle competition](https://www.kaggle.com/competitions/titanic)

**[Score: 0.78229]**

Data description


* **survival**	Survival	0 = No, 1 = Yes
* **pclass**	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
* **sex**	Sex
* **Age**	Age in years
* **sibsp**	# of siblings / spouses aboard the Titanic
* **parch**	# of parents / children aboard the Titanic
* **ticket**	Ticket number
* **fare**	Passenger fare
* **cabin**	Cabin number
* **embarked**	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

## prepare env

In [None]:
import math
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.svm as svm

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder

#from sklearn.model_selection import StratifiedKFold, cross_val_score
#from sklearn.metrics import  classification_report, confusion_matrix

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree


# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

In [None]:
test_data = pd.read_csv("./test.csv")
train_data = pd.read_csv("./train.csv")
#train_data.head()

# inspect & prepare data

## inspect

In [None]:
train_data.head()

In [None]:
train_data.describe()

In [None]:
#rows
train_data.shape[0]

In [None]:
train_data.dtypes

In [None]:
# Missing-value counts per column for both splits. A notebook cell only echoes
# its last expression, so each summary is printed explicitly.
print(test_data.isnull().sum())

print(train_data.isnull().sum())

In [None]:
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
data.head(2)

In [None]:
# Draw one count plot per categorical column, printing the column name first.
for column_name in ['Sex','Embarked','Pclass', 'SibSp', 'Parch']:
    print(column_name)
    # Bars are split by the value of the 'Survived' column.
    sns.countplot(data=data, x=column_name, hue='Survived')
    print("")
    # Render the current figure before moving on to the next column.
    plt.show()

## Filling missing values

#### **Age**

We need to fill in the missing values in the age column. Let's see how they compare to Pclass and Sex values

In [None]:
sns.boxplot(x='Pclass', y='Age', data = data)

In [None]:
g = sns.catplot(
    data=data, kind="box",
    x="Sex", y="Age", hue="Pclass"
)
g.despine(left=True)

As you can see in the chart above, there is a noticeable difference in age values across classes with respect to gender. We can complete the age values with reference to the Pclass value and gender.

In [None]:
data_mean = data.groupby(['Sex','Pclass'], as_index=False)['Age'].mean()
print (data_mean)

In [None]:
def fill_age(row):
    """Return the row's age, substituting a per-group value when it is missing.

    Parameters
    ----------
    row : sequence of three values ``(age, pclass, sex)``
        Read by position, e.g. a row of ``data[['Age', 'Pclass', 'Sex']]`` as
        handed over by ``DataFrame.apply(..., axis=1)``; a plain tuple or list
        in the same order works as well.

    Returns
    -------
    The original ``age`` when it is not null; otherwise the hard-coded value
    for the ``(pclass, sex)`` pair, or 22.18 for any pair not listed.
    """
    # Unpack by iteration instead of ``row[0]``/``row[1]``/``row[2]``: integer
    # lookups on a label-indexed Series are deprecated in pandas.
    age, pclass, sex = row

    if not pd.isnull(age):
        return age

    # Replacement ages keyed by (Pclass, Sex).
    # NOTE(review): presumably the rounded output of the groupby mean printed
    # earlier in the notebook -- confirm if the data changes.
    replacement_age = {
        (1, 'male'): 41.02,
        (1, 'female'): 37.03,
        (2, 'male'): 30.81,
        (2, 'female'): 27.49,
        (3, 'male'): 25.96,
    }
    # 22.18 covers every remaining combination (including 3rd-class females).
    return replacement_age.get((pclass, sex), 22.18)

In [None]:
data.isnull().sum()

In [None]:
data['Age'] =  data[['Age', 'Pclass','Sex']].apply(fill_age, axis=1)

#### **Fare**

In [None]:
data['Fare'] =  data['Fare'].fillna(data['Fare'].mean())

#### **Embarked**

In [None]:
# Report how many passengers boarded at each port. Every label is paired with
# its own port code, so the output does not depend on the order in which the
# codes happen to appear first in the data.
region = ['Southampton (S)','Cherbourg (C)','Queenstown (Q)']
port_codes = ['S', 'C', 'Q']
for port_name, port_code in zip(region, port_codes):
    print('Number of people embarking in {} : {}'.format(
        port_name, data[data.Embarked == port_code].shape[0]))

In [None]:
data.fillna({'Embarked':'S'},inplace=True)

In [None]:
data = data.drop(["PassengerId"], axis=1)

In [None]:
data.isnull().sum()

##  Categorical  values

In [None]:
def one_hot_encoder_two(data, feature, keep_first=True):
    """Replace one categorical column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified.
    feature : str
        Name of the column to encode.
    keep_first : bool, default True
        When False, the indicator column of the first category is left out,
        so the remaining indicators are not linearly dependent.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``feature`` is removed and the indicator columns,
        named ``<feature>_<category>``, are appended after the other columns.
    """
    # ``prefix`` builds the "<feature>_<category>" names for any category
    # type (string concatenation would fail on numeric categories), and
    # ``drop_first`` removes the first *indicator* column rather than the
    # first column of the whole frame.
    one_hot_cols = pd.get_dummies(
        data[feature], prefix=feature, drop_first=not keep_first
    )

    return pd.concat([data.drop(columns=feature), one_hot_cols], axis=1)

In [None]:
data = one_hot_encoder_two(data, 'Embarked')

### feature engineering

#### **Cabin -> Deck**

In [None]:
data.head(2)

In [None]:
def cabin_split(s):
    """Return the letters found in a cabin value, or "unknown" if there are none.

    Parameters
    ----------
    s : str or missing value
        Cabin entry such as ``"C85"``; may be NaN/None when no cabin is known.

    Returns
    -------
    str
        All alphabetic characters of ``s`` joined in order (``"C85"`` -> ``"C"``,
        ``"C23 C25"`` -> ``"CC"``), or ``"unknown"`` when ``s`` is missing or
        contains no letters.
    """
    # Handle missing values before str(): otherwise NaN/None would be turned
    # into the letters "nan"/"None" and never reach the "unknown" fallback.
    if pd.isnull(s):
        return "unknown"
    return ''.join(filter(str.isalpha, str(s))) or "unknown"

In [None]:
#Turning cabin into Deck
data["Deck"] = data["Cabin"].apply(cabin_split)

In [None]:
data.head(2)

In [None]:
data = one_hot_encoder_two(data, 'Deck')


#### **Ticket**

In [None]:
def ticket_number(x):
    """Return the part of a ticket string that follows its last space.

    The ticket ``'LINE'`` is special-cased and yields the integer ``0``; every
    other ticket yields a string (the whole ticket when it has no space).
    """
    return 0 if x == 'LINE' else x.rpartition(" ")[2]

def ticket_item(x):
    """Return everything before the last space-separated token of a ticket.

    The leading tokens are joined with underscores. A ticket made of a single
    token has no such prefix and yields ``None``.
    """
    # str.split always returns at least one piece, so the unpacking is safe.
    *prefix_parts, _last = x.split(" ")
    return "_".join(prefix_parts) if prefix_parts else None

In [None]:
print(ticket_item("A/5 21117"))
print(ticket_number("A/5 21117"))

In [None]:
data["Ticket_number"] = data["Ticket"].apply(ticket_number)
data["Ticket_item"] = data["Ticket"].apply(ticket_item)

Use one hot encoder to get rid of categorical values on chosen columns

In [None]:
data = one_hot_encoder_two(data, 'Ticket_item')

In [None]:
data.head(2)

#### **Name**

In [None]:
data["Name"][:5]

For the "Name" field to be used in our model, it must be transformed. As you can see, each passenger has a title; let's extract it into a separate column and see how many distinct titles there are.


In [None]:

# Extract the title from each name: the run of letters directly followed by a
# period (e.g. "Mr" in "Smith, Mr. John"). The pattern is a raw string so that
# the backslash reaches the regex engine instead of being an invalid escape.
data['Prefix'] = data.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
data.Prefix.unique()

Let's combine some of them together

In [None]:
# Collapse uncommon titles into broader groups. Each title appears in exactly
# one list; 'Lady' is grouped under 'Royal' together with 'Countess' and 'Sir'.
data['Prefix'] = data['Prefix'].replace(['Capt','Col','Don','Dr','Major','Rev','Jonkheer','Dona'],'Rare')
data['Prefix'] = data['Prefix'].replace(['Countess','Lady','Sir'],'Royal')
# French/alternative spellings are folded into their English equivalents.
data['Prefix'] = data['Prefix'].replace(['Mlle','Ms'],'Miss')
data['Prefix'] = data['Prefix'].replace(['Mme'],'Mrs')

In [None]:
print(data.Prefix.unique())

In [None]:
data.head(2)

In [None]:
data = one_hot_encoder_two(data, 'Prefix')

#### **Sex**

In [None]:
data = one_hot_encoder_two(data, 'Sex')

### prepare train & test data

#### train data


In [None]:
# prepare training data y
y = train_data["Survived"]
y

In [None]:
# Get the index where the train and test data were originally separated
# Number of leading rows of `data` that came from train_data.
train_data_index = len(train_data)
# Row position where the test rows start: total rows minus the test rows.
test_data_index = len(data) - len(test_data)

In [None]:
# Split the concatenated data back into train and test sets
X = data[:train_data_index].copy()
X_test = data[test_data_index:].copy()

In [None]:
X.head(2)

In [None]:
X_test.head(2)

In [None]:
X.drop(columns=["Survived", "Name", "Cabin", "Ticket", "Fare"], inplace=True)
X.head()

#### test data

In [None]:
X_test.head(2)

In [None]:
X_test.drop(columns=["Survived", "Name", "Cabin", "Ticket", "Fare"], inplace=True)
X_test.head()

# model

## Random Forest Classifier

In [None]:
rand_forest = RandomForestClassifier(random_state=42, n_jobs=-1, max_depth=5,
                                       n_estimators=100, oob_score=True)

In [None]:
%%time
rand_forest.fit(X, y)

In [None]:
# checking the oob score
rand_forest.oob_score_

In [None]:
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
# Hyper-parameter grid for the random-forest grid search.
# 'criterion' lists only 'gini' and 'entropy': scikit-learn documents
# 'log_loss' as the same Shannon information gain as 'entropy', so including
# it would repeat a third of the fits (and older releases reject the value).
params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [5, 10, 20, 50, 100, 200],
    'n_estimators': [10, 25, 30, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
}

In [None]:
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="accuracy")

In [None]:
%%time
grid_search.fit(X, y)


In [None]:
print(grid_search.best_score_)

In [None]:
rf_best = grid_search.best_estimator_
print(rf_best)

In [None]:

# Draw one tree (index 5) of the best forest found by the grid search.
plt.figure(figsize=(80,40))
# class_names must be given in ascending class order, i.e. the label for
# class 0 (did not survive) first and the label for class 1 (survived) second.
# feature_names is passed as a plain list of column names.
plot_tree(rf_best.estimators_[5], feature_names=list(X.columns), class_names=['no', 'Survived'], filled=True);

make predictions

In [None]:
model = RandomForestClassifier(max_depth=10,criterion='entropy', min_samples_leaf=5, n_estimators=25,
                       n_jobs=-1, random_state=42)
model.fit(X,y)

In [None]:
model.score(X,y)


In [None]:
predictions = model.predict(X_test)

In [None]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_random_forest.csv', index=False)
print("Your submission was successfully saved!")