# Titanic Kaggle competition 


## Objective 
---------
    Predict which passengers survived the sinking of the Titanic 

## Data
---------
    <b>Description</b> : The data has been split into two groups 
    - training set (train.csv) 
    - test set (test.csv)
    
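    <b>Shape</b> : 10 documented variables, listed below (the CSV files also contain the PassengerId and Name columns; survival is only present in the training set) 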
        survival	Survival	0 = No, 1 = Yes
        pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd
        sex	Sex	
        Age	Age in years	
        sibsp	number of siblings / spouses aboard the Titanic	
        parch	number of parents / children aboard the Titanic	
        ticket	Ticket number	
        fare	Passenger fare	
        cabin	Cabin number	
        embarked	Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)
    
## Model 
-----------
    Random forest classifier (scikit-learn RandomForestClassifier, 100 trees) trained on the Pclass and Sex features 
    
    
## Results 
------------
    Results

## Import modules 

In [30]:
import numpy as np 
import pandas as pd 

## Import data 

In [31]:
# Read the labelled training set, then preview its first rows.
data = pd.read_csv(filepath_or_buffer='Data/train.csv', sep=',')
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [55]:
# Read the unlabelled test set (no 'Survived' column), then preview its first rows.
test_data = pd.read_csv(filepath_or_buffer='Data/test.csv', sep=',')
test_data.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Data exploration 

In [37]:
# Row count of the training set and number of missing cells over all columns.
n_passengers = len(data)
n_missing_total = data.isnull().sum().sum()
print('Number of passengers :', n_passengers, ' Missing values (total) : ', n_missing_total)

Number of passengers : 891  Missing values (total) :  866


In [39]:
# Summary statistics for a subset of the numeric columns.
described_columns = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch']
data[described_columns].describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594
std,0.486592,0.836071,14.526497,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


In [40]:
# Mean of 'Survived' (i.e. survival rate) for each value of 'Sex'.
survival_by_sex = data.groupby('Sex')['Survived'].mean()
survival_by_sex

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [41]:
# Mean of 'Survived' (i.e. survival rate) for each value of 'SibSp'.
survival_by_sibsp = data.groupby('SibSp')['Survived'].mean()
survival_by_sibsp

SibSp
0    0.345395
1    0.535885
2    0.464286
3    0.250000
4    0.166667
5    0.000000
8    0.000000
Name: Survived, dtype: float64

In [42]:
# Mean of 'Survived' (i.e. survival rate) for each value of 'Parch'.
survival_by_parch = data.groupby('Parch')['Survived'].mean()
survival_by_parch

Parch
0    0.343658
1    0.550847
2    0.500000
3    0.600000
4    0.000000
5    0.200000
6    0.000000
Name: Survived, dtype: float64

In [43]:
# Survival rate broken down by 'Sex' (rows) and 'Pclass' (columns).
survival_by_sex_and_class = data.pivot_table(
    values='Survived',
    index='Sex',
    columns='Pclass',
    aggfunc='mean',
)
survival_by_sex_and_class

Pclass,1,2,3
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## Missing and aberrant values

In [16]:
# Number of missing values in each column of the training set
missing_mask = data.isnull()
missing_per_column = missing_mask.sum(axis=0)
missing_per_column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Feature selection & Feature engineering 

In [46]:
# Preview the first rows again before choosing which columns to keep as features.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Modelling 

In [69]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Columns fed to the model, and the seed that makes the forest reproducible
# from one "Restart & Run All" to the next.
FEATURES = ['Pclass', 'Sex']
SEED = 42

# Array generation for training the model.
# get_dummies one-hot encodes the categorical columns; dtype=int makes the dummy
# columns integers whatever the default dtype of the installed pandas version is.
X_train_frame = pd.get_dummies(data[FEATURES], dtype=int)
# The test set is encoded separately, then forced onto the training column layout
# so that both matrices always have the same columns in the same order.
X_test_frame = pd.get_dummies(test_data[FEATURES], dtype=int).reindex(
    columns=X_train_frame.columns, fill_value=0)
X = X_train_frame.values
X_test = X_test_frame.values
y = data['Survived'].values

# Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=SEED)

# Score : 5-fold cross-validated accuracy, printed explicitly because a bare
# expression in the middle of a cell is never displayed.
cv_scores = cross_val_score(clf, X, y, cv=5)
print('Cross-validated accuracy : %.3f (+/- %.3f)' % (cv_scores.mean(), cv_scores.std()))

# Fit on the whole training set; the training accuracy is optimistic and is only
# reported for comparison with the cross-validated figure above.
clf = clf.fit(X, y)
print('Training accuracy :', clf.score(X, y))

# Predict 
predictions = clf.predict(X_test)

# Saving results in the two-column layout PassengerId / Survived
results = pd.DataFrame({'PassengerId': test_data.PassengerId,
                        'Survived': predictions})
results.to_csv('Data/my_submission.csv', index=False)


## To do :

- Further exploration: correlation between variables, volume of missing values, aberrant values
- Feature selection (which features should I keep for my model?)
- Feature engineering (which features should I construct to improve my model?)
- Model selection (why this model? Which tests have been done?)