# Problem statement

> Using the provided data about the Titanic's passengers, we have to predict whether or not each passenger survived the accident (the sinking of the Titanic).

# Data

> The data for this prediction (classification) model is taken from the Kaggle "Titanic - Machine Learning from Disaster" competition dataset: https://www.kaggle.com/c/titanic/data


In [11]:
## Let's import the required tools

#Regular tools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ML models(scikit-learn)
from sklearn.ensemble import RandomForestClassifier


# Model evaluation tools
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [12]:
# Load the training CSV of the Kaggle Titanic dataset and display the frame.
df = pd.read_csv('../input/titanic/train.csv')
df

In [13]:
# Print the frame's summary (columns, non-null counts, dtypes).
df.info()

In [14]:
## Let's analyze the target values: how many rows fall into each `Survived` class
df['Survived'].value_counts()

In [15]:
# Bar chart of how many rows fall into each class of the target column.
survival_counts = df['Survived'].value_counts()
survival_counts.plot.bar(color=['salmon', 'cyan']);

In [16]:
# Descriptive statistics of the frame.
df.describe()

In [17]:
# Count the rows for each distinct value of the `Sex` column.
df['Sex'].value_counts()

In [18]:
### Cross-tabulate the Sex column (rows) against the Survived target (columns)
pd.crosstab(index=df['Sex'], columns=df['Survived'])

In [19]:
## Plot the Survived-by-Sex crosstab as grouped bars
sex_by_survival = pd.crosstab(index=df['Survived'], columns=df['Sex'])
ax = sex_by_survival.plot.bar(color=['salmon', 'lightblue'], figsize=(10, 6))

ax.set_title("Survival frequency for Sex")
ax.set_xlabel("0 = Not survived, 1 = survived")
ax.set_ylabel("Count of survival")
# NOTE(review): the labels are hard-coded and applied in drawing order; this
# assumes the crosstab columns come out as female, male — confirm.
ax.legend(['Female', 'Male']);

In [20]:
### Cross-tabulate the Pclass column (rows) against the Survived target (columns)
pd.crosstab(index=df['Pclass'], columns=df['Survived'])

In [21]:
## Plot the Survived-by-Pclass crosstab as grouped bars
pclass_by_survival = pd.crosstab(index=df['Survived'], columns=df['Pclass'])
ax = pclass_by_survival.plot.bar(color=['salmon', 'lightblue', 'cyan'],
                                 figsize=(10, 6))

ax.set_title("Survival frequency for Pclass")
ax.set_xlabel("0 = Not survived, 1 = survived")
ax.set_ylabel("Count of survival")
# NOTE(review): the labels are hard-coded and applied in drawing order; this
# assumes the crosstab columns come out as 1, 2, 3 — confirm.
ax.legend(['Pclass-1', 'Pclass-2', 'Pclass-3']);

In [22]:
## Now let's see the overall relations in the dataset (Correlation matrix)

# Restrict the frame to its numeric columns explicitly. At this point it still
# holds text columns (they are only encoded further down), and on
# pandas >= 2.0 `DataFrame.corr()` raises on them instead of silently skipping
# them. `select_dtypes` gives the same matrix on older pandas as well.
df_corr = df.select_dtypes(include='number').corr()
df_corr

In [23]:
## Render the correlation matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(15, 10))

# Each cell is annotated with its coefficient, formatted to two decimals.
heatmap_options = dict(annot=True, linewidths=0.5, fmt='.2f', cmap='YlGnBu')
ax = sns.heatmap(df_corr, **heatmap_options)

#### From the correlation heatmap above, we can see which features play a vital role in the chance of survival

In [24]:
## Let's deal with the missing values: count the nulls in every column

df.isna().sum()

## Let's fill the empty values

In [25]:
## List the columns whose dtype is numeric

numeric_labels = [name for name, column in df.items()
                  if pd.api.types.is_numeric_dtype(column)]
for name in numeric_labels:
    print(name)

In [26]:
## Print every numeric column that contains at least one null
for name in df.columns:
    column = df[name]
    if not pd.api.types.is_numeric_dtype(column):
        continue
    if column.isna().any():
        print(name)

In [27]:
## We have only one numeric column (Age) with null values, let's fill it with median values

# Take the list of numeric columns up front, so the indicator columns added
# inside the loop are not part of what is being walked.
numeric_cols = [name for name, column in df.items()
                if pd.api.types.is_numeric_dtype(column)]
for name in numeric_cols:
    column = df[name]
    was_null = column.isna()

    ## Indicator column: True where the value was missing before filling.
    ## It is added for every numeric column, not only those with nulls.
    df[name + '_is_missing'] = was_null

    if was_null.any():
        ## Fill with the median (it is more robust than the mean)
        df[name] = column.fillna(column.median())

In [28]:
## Let's see about missing values: re-count the nulls per column after the numeric fill

df.isna().sum()

In [29]:
## Let's find the non-numeric (text) columns that contain null values
# `is_string_dtype` is deliberately not used: on pandas >= 2.0 it returns False
# for an object column that holds NaN next to its strings, so exactly the
# columns this cell is looking for would be skipped. The categorical-encoding
# cell further down selects text columns with `not is_numeric_dtype`; the same
# test is used here so both cells agree on what counts as a text column.
for label, content in df.items():
    if pd.api.types.is_numeric_dtype(content):
        continue
    if content.isna().any():
        print(label)

In [30]:
# Count the rows for each distinct value of the `Cabin` column.
df['Cabin'].value_counts()

In [31]:
# Count the rows for each distinct value of the `Embarked` column.
df['Embarked'].value_counts()

In [32]:
## Replace every non-numeric column by integer category codes

# Take the list of non-numeric columns up front, so the indicator columns added
# inside the loop are not part of what is being walked.
text_columns = [name for name, column in df.items()
                if not pd.api.types.is_numeric_dtype(column)]
for name in text_columns:
    column = df[name]

    ## Indicator column: True where the value was missing
    df[name + '_is_missing'] = column.isna()

    ## Categorical codes are -1 for null, so +1 maps null to 0 and shifts
    ## the real categories to 1..n.
    ## NOTE(review): the codes come from the values present in this frame
    ## only; a frame encoded separately gets codes that need not line up.
    df[name] = pd.Categorical(column).codes + 1

In [33]:
# Print the frame's summary again, now that the columns have been encoded.
df.info()

In [34]:
# Re-count the nulls per column after both cleaning passes.
df.isna().sum()

In [35]:
# Remove the missing-value indicator that the cleaning pass created for the
# target column, so it is not carried along as a feature.
df = df.drop(columns='Survived_is_missing')

### Now we have a well-cleaned dataset

In [36]:
## Separate the target (y) from the features (x)

y = df['Survived']
x = df.drop(columns='Survived')

## Hold out 20% of the rows as a validation set.
## NumPy's global RNG is seeded first; no `random_state` is passed to the
## split, so presumably this seed is what makes the shuffle repeatable.
np.random.seed(25)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

In [37]:
## let's try with RandomForestClassifier
## (constructed with no arguments, i.e. with the library's default hyperparameters)

clf = RandomForestClassifier()

In [38]:
# Fit the baseline forest on the training split.
clf.fit(x_train, y_train)

In [39]:
# Score the baseline on the same rows it was fitted on.
clf.score(x_train, y_train)

In [40]:
## let's check accuracy for the validation set
## (`x_test` / `y_test` are the 20% held out from the training CSV)

clf.score(x_test, y_test)

## Let's Tune the model to gain better results

In [41]:
## Search space for tuning the random forest (one entry per hyperparameter)

rfc_h = dict(
    n_estimators=np.arange(5, 101, 5),        # 5, 10, ..., 100
    max_depth=[None, 3, 5, 7],
    min_samples_split=np.arange(2, 20, 2),    # 2, 4, ..., 18
)

In [42]:
## Grid search over every combination in `rfc_h`, with 5-fold cross-validation

rf_gs = GridSearchCV(
    estimator=RandomForestClassifier(n_jobs=-1),
    param_grid=rfc_h,
    cv=5,
)

In [43]:
%%time
# Run the grid search on the training split; the cell magic above times the cell.
rf_gs.fit(x_train, y_train)

In [44]:
# Hyperparameter combination selected by the grid search.
rf_gs.best_params_

In [45]:
# Score the fitted grid-search object on the held-out validation split.
rf_gs.score(x_test, y_test)

## Let's evaluate our model

In [46]:
# Predicted labels for the validation split, used by the evaluation cells below.
y_preds = rf_gs.predict(x_test)

In [47]:
# Display the predicted labels.
y_preds

In [48]:
# Plot ROC curve and calculate AUC metric
# `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `RocCurveDisplay.from_estimator` is its replacement and takes the same
# (estimator, X, y) arguments.
# NOTE(review): the import cell at the top of the notebook still imports
# `plot_roc_curve`, which fails on scikit-learn >= 1.2 — drop that name from
# the import when upgrading.
from sklearn.metrics import RocCurveDisplay

RocCurveDisplay.from_estimator(rf_gs, x_test, y_test);

In [49]:
# Confusion matrix of the validation labels against the predictions (raw array)
print(confusion_matrix(y_test, y_preds))

In [50]:
# Set seaborn's font scale to 1.5 for the plots that follow.
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """
    Plot the confusion matrix of the given labels as an annotated Seaborn heatmap.

    Parameters:
        y_test: the true labels.
        y_preds: the predicted labels, in the same order as `y_test`.

    Returns:
        None. The heatmap is drawn on a newly created 3x3-inch figure.
    """
    # Draw on the Axes created here rather than on whatever pyplot considers
    # the current one.
    _, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True,
                # The cells are integer counts; seaborn's default '.2g' format
                # would render a count such as 105 as "1e+02".
                fmt='d',
                cbar=False,
                ax=ax)
    ax.set_xlabel("Predicted label")  # Predictions go to the x-axis
    ax.set_ylabel("True label")  # True labels go to the y-axis
    
# Draw the confusion matrix for the validation predictions.
plot_conf_mat(y_test, y_preds)

In [51]:
## Let's analyze different metrics with "Classification Report"
## (computed from the validation labels and their predictions)

print(classification_report(y_test, y_preds))

## Now let's take test data

In [52]:
# Load the test CSV of the Kaggle Titanic dataset.
dft = pd.read_csv('../input/titanic/test.csv')

In [53]:
# Display the test frame.
dft

### Let's apply to the test dataset the same cleaning steps that we used for the train set

In [54]:
# Print the test frame's summary (columns, non-null counts, dtypes).
dft.info()

In [55]:
# Count the nulls in every column of the test frame.
dft.isna().sum()

In [56]:
## Fill the null values of the test frame's numeric columns with the column
## median, recording for each numeric column which rows were originally null

# Take the list of numeric columns up front, so the indicator columns added
# inside the loop are not part of what is being walked.
numeric_cols = [name for name, column in dft.items()
                if pd.api.types.is_numeric_dtype(column)]
for name in numeric_cols:
    column = dft[name]
    was_null = column.isna()

    ## Indicator column: True where the value was missing before filling.
    ## It is added for every numeric column, not only those with nulls.
    dft[name + '_is_missing'] = was_null

    if was_null.any():
        ## Fill with the median (it is more robust than the mean)
        dft[name] = column.fillna(column.median())

In [57]:
# Print the test frame's summary again after the numeric fill.
dft.info()

In [58]:
## Let's fill/change categorical values

## The integer codes must mean the same thing here as in the training data the
## model was fitted on. Encoding the test frame on its own derives the codes
## from the values present in the *test* frame, so a given Cabin or Ticket
## string would get a different code than it got in the train set. The train
## CSV is re-read here to recover the categories the training codes came from.
train_raw = pd.read_csv('../input/titanic/train.csv')

# Take the list of non-numeric columns up front, so the indicator columns added
# inside the loop are not part of what is being walked.
text_columns = [name for name, column in dft.items()
                if not pd.api.types.is_numeric_dtype(column)]
for name in text_columns:
    column = dft[name]

    ## Indicator column: True where the value was missing
    dft[name + '_is_missing'] = column.isna()

    ## Encode against the training categories. The code is -1 for a null and
    ## for a value that never occurred in the train set, so +1 maps both to 0
    ## and keeps known values on the same 1..n codes as in training.
    train_categories = pd.Categorical(train_raw[name]).categories
    dft[name] = pd.Categorical(column, categories=train_categories).codes + 1

#### Here, even though we don't have any null values in dft['Embarked'], we still need to add "Embarked_is_missing" so that the test set has the same features as the train set

In [59]:
# Print the test frame's summary once more, after the categorical encoding.
dft.info()

## Let's make predictions for the test set

In [60]:
# Predict with the fitted grid-search object on the cleaned test frame.
# NOTE(review): nothing here checks that `dft` carries the same columns, in the
# same order, as the frame the model was fitted on — confirm.
test_preds = rf_gs.predict(dft)

In [61]:
# Display the predictions for the test set.
test_preds

In [62]:
# Start from an empty frame; the next cell adds its columns.
Prediction = pd.DataFrame()

In [63]:
# Pair every test passenger id with the label predicted for it.
Prediction = Prediction.assign(
    PassengerId=dft['PassengerId'],
    Survived=test_preds,
)

In [64]:
# Display the final PassengerId / Survived table.
Prediction