#Background
The sinking of the Titanic remains one of the most infamous shipwrecks in recent history, and there is the hypothesis that certain groups of people were more likely to survive.

As such, below we apply mathematical models to try to predict which passengers survived and which did not.

#Preparing the Data

##Reading Data

In [None]:

import pandas as pd
import numpy as np
# Load the training and test CSV files into DataFrames.
# Both absolute paths point under /kaggle/input; adjust them if the files live elsewhere.

train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")


##Viewing the Data

In [None]:
# Shape of the training data as a (rows, columns) tuple
train_data.shape


In [None]:
# First 5 rows of the training data (head() defaults to 5)
train_data.head()

In [None]:
# Column labels of the training data
train_data.columns

#Processing the Data

##Dealing with missing values

In [None]:
# Are there null values?
# isnull() on its own returns a boolean frame as large as the data itself;
# summing it per column gives the number of missing entries in each column,
# which is what shows where the gaps are concentrated.
train_data.isnull().sum()

We see that we have several null values across the different rows though most are in the Cabin column.

In [None]:
#We can further check the number of null values
train_data['Cabin'].isnull().value_counts()

In [None]:
#Here, we will drop the Cabin column in the training data
train_data= train_data.drop('Cabin', axis=1)

train_data.head()

In [None]:
#Rechecking the shape of the training data
train_data.shape

In [None]:
#We can now drop the null values in the features matrix in the training data
train_data=train_data.dropna(axis=0)
train_data.shape

##Creating New Features

In [None]:
#Adding up the Family Members one had on board
train_data['Family_Members']= train_data['SibSp'] + train_data['Parch']
train_data.head()

In [None]:
#Creating the Alone column
train_data['Alone'] = (train_data['Family_Members']) == 0
train_data['Not_Alone'] = (train_data['Family_Members']) > 0
train_data.head()

In [None]:
# Age brackets and their labels. pd.cut uses right-closed intervals by
# default, i.e. (0, 18], (18, 30], (30, 50], (50, inf].
age_labels = ['Child', 'Young', 'Adult', 'Senior']
age_bins = [0, 18, 30, 50, np.inf]

# Label every passenger with the bracket their Age falls into
train_data['Age_Group'] = pd.cut(train_data['Age'], labels=age_labels, bins=age_bins)
train_data.head()

In [None]:
#Arranging passengers into different classes by fare
#First we check the different fares that were paid
Fares=train_data['Fare'].unique()
print(np.sort(Fares))

In [None]:
# Define the fare bins
fare_bins = [0, 20, 50, 80, 100, 150, 200, float('inf')]

# Create a new column 'Age_Group' based on the age groups
train_data['Fare_Group'] = pd.cut(train_data['Fare'], bins=fare_bins)
train_data.head()

In [None]:
#Extracting the titles in people's names
import re

# Take the title from each name: the first run of letters that follows a
# space and is immediately followed by a '.'.
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape. str.extract
# applies it to the whole column at once and yields NaN (rather than raising)
# for a name that has no such title.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
Titles=train_data['Title'].unique()
Titles

##Arranging the Data into Features Matrix and Target Vector
A view of the data reveals that the Survived column is the target array.

Likewise, not all of the other columns are essential for training a model, so we need to be selective.

In [None]:
# The target vector is the Survived column of the training data
y=train_data['Survived']
# Preview the first rows of the target vector
y.head()

In [None]:
# List the distinct values that occur in the target vector
y.unique()


We see that the target vector contains only the values 0 and 1.

In [None]:
# Columns used as model inputs; the same list is reused to select the test features
X_features = [
    'Title', 'Pclass', 'Sex',
    'Age', 'Age_Group',
    'SibSp', 'Parch', 'Family_Members',
    'Fare', 'Fare_Group',
    'Embarked', 'Alone', 'Not_Alone',
]
X = train_data[X_features]
# Preview the features matrix
X.head()

##One-hot encoding

In [None]:
#One-hot encoding the features matrix on Pclass
X_title=pd.get_dummies(X['Title'])

X_title=X_title.add_suffix('__title')
#Viewing the encoded data
X_title.head()

In [None]:
#Merging to the original data
X=X.join(X_title)

#Dropping the Pclass column
X=X.drop('Title', axis=1)
#Viewing the encoded data
X.shape

In [None]:
#One-hot encoding the features matrix on Pclass
X_pclass=pd.get_dummies(X['Pclass'], prefix='P_')

X_pclass=X_pclass.add_suffix('__class')
#Viewing the encoded data
X_pclass.head()

In [None]:
#Checking the shape of the encoded data
X_pclass.shape

In [None]:
#Merging to the original data
X=X.join(X_pclass)

#Dropping the Pclass column
X=X.drop('Pclass', axis=1)
#Viewing the encoded data
X.shape

In [None]:
#One-hot encoding the features matrix on Family_Members
X_family=pd.get_dummies(X['Family_Members'], prefix='Fam_')

X_family=X_family.add_suffix('__members')
#Viewing the encoded data
X_family.head()

In [None]:
#Merging to the original data
X=X.join(X_family)

#Dropping the Pclass column
X=X.drop('Family_Members', axis=1)
#Viewing the encoded data
X.shape

In [None]:
X.shape

In [None]:
#One-hot encoding the features matrix on Sex

X_sex=pd.get_dummies(X['Sex'])
X_sex=X_sex.add_suffix('__sex')

X_sex.head()

In [None]:
#Merging to the original data

X = X.join(X_sex)
#Dropping the Pclass column
X=X.drop('Sex', axis=1)
#Viewing the encoded data
X.shape


In [None]:
#One-hot encoding the features matrix on Embarked

X_embarked=pd.get_dummies(X['Embarked'])

X_embarked=X_embarked.add_suffix('__Embarked')
#Viewing the encoded data
X_embarked.head()

In [None]:
#Merging to the original data

X = X.join(X_embarked)
#Dropping the Pclass column
X=X.drop('Embarked', axis=1)
#Viewing the encoded data
X.shape

In [None]:
#One-hot encoding the features matrix on Age Group

X_age_group=pd.get_dummies(X['Age_Group'])

X_age_group=X_age_group.add_suffix('__Age_Group')
#Viewing the encoded data
X_age_group.head()

In [None]:
#Merging to the original data

X = X.join(X_age_group)
#Dropping the Pclass column
X=X.drop('Age_Group', axis=1)
#Viewing the encoded data
X.shape


In [None]:
#One-hot encoding the features matrix on Fare Group

X_fare_group=pd.get_dummies(X['Fare_Group'])

X_fare_group=X_fare_group.add_suffix('__Fare_Group')
#Viewing the encoded data
X_fare_group.head()

In [None]:
#Merging to the original data

X = X.join(X_fare_group)
#Dropping the Pclass column
X=X.drop('Fare_Group', axis=1)
#Viewing the encoded data
X.shape


In [None]:
X.head()

#Modeling the Data

In [None]:
from sklearn.model_selection import train_test_split
# Split the data: 80% of the rows are used for fitting and the rest are held
# out for evaluation. The fixed random_state makes the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.8, random_state=1
)

##Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier #Choose model class
# A forest of 500 trees; random_state is fixed so repeated runs build the same forest
model = RandomForestClassifier(n_estimators=500, random_state=1) #Instantiate model
# Fit on the training split only; Xtest/ytest stay unseen for evaluation
model.fit(Xtrain, ytrain) #Fit model

In [None]:
ypred=model.predict(Xtrain) #Predicting on the training data

In [None]:
# Assess model performnace
# We can then use the accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(ypred,ytrain)

In [None]:
ymodel=model.predict(Xtest) #Predicting on the test data

In [None]:
# Assess model performnace
# We can then use the accuracy score
from sklearn.metrics import accuracy_score
accuracy_score(ymodel,ytest)

In [None]:
# Using the confusion matrix
from sklearn.metrics import confusion_matrix
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns are the predicted labels. The true labels are passed first so the
# heatmap matches the axis labels set below (x = predicted, y = true).
mat = confusion_matrix(ytest, ymodel)
# fmt='d' prints the cell counts as plain integers
sns.heatmap(mat, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('Predicted Value')
plt.ylabel('True Value');

##Viewing the Test Data

In [None]:
# Shape of the test data as a (rows, columns) tuple
test_data.shape


In [None]:
# First 5 rows of the test data (head() defaults to 5)
test_data.head()


In [None]:
# Column labels of the test data
test_data.columns


Evidently, the test data only lacks the Survived column.

##Processing the Test Data

In [None]:
# Build the same derived columns on the test data as on the training data
# Family members on board = SibSp + Parch
test_data['Family_Members'] = test_data['SibSp'].add(test_data['Parch'])

# Complementary boolean flags: no family members on board vs. at least one
test_data['Alone'] = test_data['Family_Members'].eq(0)
test_data['Not_Alone'] = test_data['Family_Members'].gt(0)

# Create a new column 'Age_Group' using the age bins and labels defined for the training data
test_data['Age_Group'] = pd.cut(test_data['Age'], labels=age_labels, bins=age_bins)

# Create a new column 'Fare_Group' using the fare bins defined for the training data
test_data['Fare_Group'] = pd.cut(test_data['Fare'], bins=fare_bins)


In [None]:
#Extracting the titles in people's names
import re

# Take the title from each name: the first run of letters that follows a
# space and is immediately followed by a '.'.
# The pattern is a raw string so that '\.' reaches the regex engine as an
# escaped dot instead of being an invalid Python string escape. str.extract
# applies it to the whole column at once and yields NaN (rather than raising)
# for a name that has no such title.
test_data['Title'] = test_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
Test_Titles=test_data['Title'].unique()
Test_Titles

In [None]:
# Keep only the columns listed in X_features, the same list used to build
# the training features matrix
X_test=test_data[X_features]
X_test.head()

In [None]:
# Shape of X_test as a (rows, columns) tuple
X_test.shape

##One-hot encoding the test data

In [None]:
#One-hot encoding the features matrix on Pclass
X_test_title=pd.get_dummies(X_test['Title'])

X_test_title=X_test_title.add_suffix('__title')
#Viewing the encoded data
X_test_title.head()

In [None]:
#Merging to the original data
X_test=X_test.join(X_test_title)

#Dropping the Pclass column
X_test=X_test.drop('Title', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
#One-hot encoding the class column
X_test_pclass=pd.get_dummies(X_test['Pclass'], prefix='P_')
X_test_pclass=X_test_pclass.add_suffix('__class')

#Viewing the encoded data
X_test_pclass.head()

In [None]:
#Merging to the original data
X_test=X_test.join(X_test_pclass)

#Dropping the Pclass column
X_test=X_test.drop('Pclass', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
#One-hot encoding the features matrix on Family_Members
X_test_family=pd.get_dummies(X_test['Family_Members'], prefix='Fam_')

X_test_family=X_test_family.add_suffix('__members')
#Viewing the encoded data
X_test_family.head()

In [None]:
#Merging to the original data
X_test=X_test.join(X_test_family)

#Dropping the Pclass column
X_test=X_test.drop('Family_Members', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
X_test_sex=pd.get_dummies(X_test['Sex'])
X_test_sex=X_test_sex.add_suffix('__sex')

X_test_sex.head()

In [None]:
#Merging to the original data

X_test = X_test.join(X_test_sex)
#Dropping the Pclass column
X_test=X_test.drop('Sex', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
#One-hot encoding on Embarked

X_test_embarked=pd.get_dummies(X_test['Embarked'])

X_test_embarked=X_test_embarked.add_suffix('__Embarked')
#Viewing the encoded data
X_test_embarked.head()

In [None]:
#Merging

X_test=X_test.join(X_test_embarked)
#Dropping the Pclass column
X_test=X_test.drop('Embarked', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
#One-hot encoding the features matrix on Age Group

X_test_age_group=pd.get_dummies(X_test['Age_Group'])

X_test_age_group=X_test_age_group.add_suffix('__Age_Group')
#Viewing the encoded data
X_test_age_group.head()


In [None]:
#Merging to the original data

X_test = X_test.join(X_test_age_group)
#Dropping the Pclass column
X_test=X_test.drop('Age_Group', axis=1)
#Viewing the encoded data
X_test.shape

In [None]:
#One-hot encoding the features matrix on Fare Group

X_test_fare_group=pd.get_dummies(X_test['Fare_Group'])

X_test_fare_group=X_test_fare_group.add_suffix('__Fare_Group')
#Viewing the encoded data
X_test_fare_group.head()



In [None]:
#Merging to the original data

X_test = X_test.join(X_test_fare_group)
#Dropping the Pclass column
X_test=X_test.drop('Fare_Group', axis=1)
#Viewing the encoded data
X_test.shape

Some columns that exist in the training data (for example, titles that never occur in the test data) are missing from the encoded test data, so we add them to the test data filled with zeros.

In [None]:
# Get the missing titles in X_test
missing_titles = set(Xtrain.columns) - set(X_test.columns)

missing_titles

In [None]:
# Add columns with zeros for the missing titles in X_test
for title in missing_titles:
    X_test[title] = 0

# Reorder the columns in X_test to match the order in X_train
X_test = X_test[Xtrain.columns]

In [None]:
X_test.head()

In [None]:
X_test.shape

##Applying The Model on the Test Data

In [None]:
#Dealing with missing values
from sklearn.impute import SimpleImputer
# Fill the missing values in the test features (SimpleImputer's default
# strategy is the per-column mean). The imputer is fitted on the training
# split so that no statistic is derived from the rows being predicted.
# The result is wrapped back into a DataFrame so the model receives the same
# column names it was fitted with; a bare array would drop them.
imputer = SimpleImputer(keep_empty_features=True)
imputer.fit(Xtrain)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [None]:
#Making the Prediction
y_test_predictions=model.predict(X_test)

In [None]:
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': y_test_predictions})
output.to_csv('submission.csv', index=False)