# Titanic Survival Prediction


### The notebook uses the following techniques:
1) Replacing null values
<br>
2) Label Encoding
<br>
3) Standard Scaling
<br>
4) Train test split
<br>
5) GridSearchCV
<br>
6) RandomForestClassifier

In [1]:
# Library imports: standard library first, then third-party packages
import os

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Plot defaults: wide figures on a white grid
plt.rcParams['figure.figsize'] = (16, 6)
sns.set_theme(style="whitegrid")

In [2]:
# Load the training and test passenger tables from the Kaggle input directory
train_path = '/kaggle/input/titanic/train.csv'
test_path = '/kaggle/input/titanic/test.csv'
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/titanic/train.csv'

In [None]:
# Preview the first five rows of the training frame, then of the test frame
for frame in (train_df, test_df):
    display(frame.head(5))

In [None]:
# Summary statistics of the numeric columns (train first, then test)
train_numeric_summary = train_df.describe()
test_numeric_summary = test_df.describe()
display(train_numeric_summary, test_numeric_summary)

In [None]:
# Summary statistics of the object (string) columns: count, unique, top, freq
for frame in (train_df, test_df):
    display(frame.describe(include=['O']))

In [None]:
# Number of missing values per column (train first, then test)
train_null_counts = train_df.isna().sum()
test_null_counts = test_df.isna().sum()
display(train_null_counts, test_null_counts)

In [None]:
# Replace missing ages with the median age of passengers of the same sex.
# Each frame is imputed from its own per-sex medians.
#
# The write goes through a single .loc call instead of chained indexing
# (df['Age'][mask] = ...): the chained form assigns into a temporary selection,
# which triggers SettingWithCopyWarning and leaves the frame unchanged when
# pandas copy-on-write is active.
for frame in (train_df, test_df):
    for sex in ('female', 'male'):
        is_sex = frame['Sex'] == sex
        ages = frame.loc[is_sex, 'Age']
        frame.loc[is_sex, 'Age'] = ages.fillna(ages.median())


In [None]:
# Fill the remaining gaps: missing 'Embarked' values in the training frame get
# the most frequent port, and missing 'Fare' values in the test frame get the
# mean fare of the test frame.
#
# The filled column is assigned back to the frame. Calling fillna/replace with
# inplace=True on a column selection (df[col].fillna(..., inplace=True)) works
# on an intermediate object and does not update the frame under pandas
# copy-on-write.
most_common_port = train_df['Embarked'].mode()[0]  # mode() already ignores NaN
train_df['Embarked'] = train_df['Embarked'].fillna(most_common_port)
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

In [None]:
# Mean survival across the levels of each categorical / discrete feature,
# drawn as one point plot per feature
pointplot_style = dict(scale=1, errwidth=3, color='mediumseagreen')

g = sns.PairGrid(
    train_df,
    x_vars=['Pclass', 'Sex', 'Embarked', 'SibSp', 'Parch'],
    y_vars='Survived',
    height=7,
    aspect=0.4,
)
g.map(sns.pointplot, **pointplot_style)

In [None]:
# Keep both frames in one list so every feature-engineering step is applied to
# train and test alike, then add 'family': the passenger plus their
# siblings/spouses (SibSp) and parents/children (Parch)
combined_df = [train_df, test_df]
for frame in combined_df:
    frame['family'] = 1 + frame['SibSp'] + frame['Parch']

In [None]:
# Flag passengers travelling alone (family == 1) as is_alone = 1, everyone else
# as 0, then plot mean survival for the two groups
for frame in combined_df:
    frame['is_alone'] = (frame['family'] == 1).astype('int64')

g = sns.PairGrid(train_df, x_vars='is_alone', y_vars='Survived', height=7, aspect=0.7)
g.map(sns.pointplot, color='mediumseagreen', errwidth=3, scale=1)

In [None]:
# Extract the title from each name (the run of letters that follows a space and
# ends at a full stop), collapse the rarer titles into broader groups, and plot
# mean survival per title.
#
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence, which newer Python versions report as a SyntaxWarning.
title_pattern = r' ([A-Za-z]+)\.'

# Source title -> grouped title. No grouped title is itself a key, so a single
# replace() gives the same result as applying the replacements one by one.
title_groups = {
    'Jonkheer': 'CBD', 'Dona': 'CBD', 'Dr': 'CBD', 'Don': 'CBD', 'Major': 'CBD',
    'Col': 'CBD', 'Sir': 'CBD', 'Capt': 'CBD', 'Rev': 'CBD',
    'Mlle': 'Miss', 'Lady': 'Miss', 'Countess': 'Miss', 'Ms': 'Miss',
    'Mme': 'Mrs',
}

for frame in combined_df:
    extracted = frame['Name'].str.extract(title_pattern, expand=False)
    frame['title'] = extracted.replace(title_groups)

g = sns.PairGrid(train_df, y_vars='Survived',
                 x_vars='title',
                 height=7, aspect=.7)
g.map(sns.pointplot, scale=1, errwidth=3, color='mediumseagreen')

In [None]:
# Convert the categorical columns to integer labels with sklearn's LabelEncoder.
#
# Each encoder is fitted once on the values of BOTH frames and then used to
# transform each frame. Calling fit_transform separately per frame re-fits the
# encoder every time, so a category present in only one frame would shift the
# integer codes and the same label could mean different things in train and test.
from sklearn.preprocessing import LabelEncoder

label_sex = LabelEncoder()
label_embark = LabelEncoder()
label_title = LabelEncoder()

column_encoders = {'Sex': label_sex, 'Embarked': label_embark, 'title': label_title}
for column, encoder in column_encoders.items():
    encoder.fit(pd.concat([frame[column] for frame in combined_df]))
    for frame in combined_df:
        frame[column] = encoder.transform(frame[column])

In [None]:
# Keep the test passenger ids for the submission file, then remove the columns
# that are not fed to the model from both frames
passenger_id = test_df['PassengerId']
unused_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
for frame in combined_df:
    frame.drop(columns=unused_columns, inplace=True)
train_df.head()

In [None]:
# Pairwise correlation of the training columns, shown as an annotated heatmap
corr = train_df.corr()
sns.heatmap(data=corr, annot=True, cmap='Greens_r')

In [None]:
# Standardise the Fare column with sklearn's StandardScaler.
#
# The scaler learns its mean and standard deviation from the training fares
# only; the test fares are then transformed with those same statistics.
# Re-fitting on the test frame would standardise the two frames differently,
# so a given scaled value would correspond to different fares in each.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_fare_train_df = train_df['Fare'].values.reshape(-1, 1)
fare_scaled_train_df = scaler.fit_transform(data_fare_train_df)
train_df['scaled_fare'] = fare_scaled_train_df.ravel()

data_fare_test_df = test_df['Fare'].values.reshape(-1, 1)
fare_scaled_test_df = scaler.transform(data_fare_test_df)  # reuse train statistics
test_df['scaled_fare'] = fare_scaled_test_df.ravel()

# Check that no missing values remain in the training frame
display(train_df.isna().sum())


In [None]:
# Remove the raw columns that have been replaced by engineered features
# (scaled_fare, is_alone), together with the intermediate 'family' column
replaced_columns = ['Fare', 'SibSp', 'Parch', 'family']
for frame in combined_df:
    frame.drop(columns=replaced_columns, inplace=True)

In [None]:
# Standardise the Age column with sklearn's StandardScaler, overwriting 'Age'
# in both frames with the scaled values.
#
# As with any fitted preprocessing step, the scaler is fitted on the training
# ages only and the test ages are transformed with the same mean and standard
# deviation, so both frames share one scale.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_age_train_df = train_df['Age'].values.reshape(-1, 1)
age_scaled_train_df = scaler.fit_transform(data_age_train_df)
train_df['Age'] = age_scaled_train_df.ravel()

data_age_test_df = test_df['Age'].values.reshape(-1, 1)
age_scaled_test_df = scaler.transform(data_age_test_df)  # reuse train statistics
test_df['Age'] = age_scaled_test_df.ravel()

In [None]:
# Separate the predictors (every column except 'Survived') from the target
X = train_df.drop(columns='Survived')
Y = train_df['Survived'].to_numpy()

In [None]:
# Hold out 30% of the labelled rows; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

x_train_df, x_test_df, y_train_df, y_test_df = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.3,
)

In [None]:
# Tune a RandomForestClassifier with a 10-fold cross-validated grid search
# (scored on accuracy) and fit it on the full labelled data.
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# min_samples_leaf is given as a fraction of the samples. The earlier
# np.arange(0.1, 1) had an implicit step of 1 and therefore produced only
# [0.1], so this parameter was never searched. The list stops at 0.5 because a
# split needs both children to hold at least that fraction of the samples.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'min_samples_leaf': [0.1, 0.2, 0.3, 0.4, 0.5],
}

# A fixed seed makes the search, and the predictions that follow, repeatable
rf = RandomForestClassifier(random_state=2)
grid = GridSearchCV(estimator=rf, param_grid=param_grid, scoring='accuracy', cv=10)

grid.fit(X, Y)

In [None]:
# Predict survival for the test passengers and compare the predictions with the
# 'Survived' column of gender_submission.csv.
# NOTE(review): gender_submission.csv appears to be Kaggle's example submission
# rather than the true test labels - if so, these numbers measure agreement
# with that baseline, not real accuracy. Confirm before quoting them.
pred = grid.predict(test_df)
gender_df = pd.read_csv('/kaggle/input/titanic/gender_submission.csv' , usecols= ['Survived'])
reference_labels = gender_df['Survived']  # 1-D labels instead of a one-column frame

display(accuracy_score(reference_labels, pred))
# classification_report returns a plain string; print it so the table is laid
# out over several lines instead of being shown as a repr with literal '\n'
print(classification_report(reference_labels, pred))

In [None]:
# Write the predictions next to their passenger ids as the submission CSV
sub = pd.DataFrame(dict(PassengerId=passenger_id, Survived=pred))
sub.to_csv(path_or_buf='sub.csv', index=False)