# Building a model that predicts whether a passenger on the Titanic survived or not

In [38]:
#Importing all the libraries
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import math
import sklearn
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import model_selection 
from sklearn.linear_model import LogisticRegression


# Importing the data set


In [None]:
# Load the passenger records from 'tested.csv' into a DataFrame, then print
# each column's dtype and non-null count.
data=pd.read_csv('tested.csv')
data.info()

# The tested data set has 418 entries and 12 features. Below I list the features with a short description of each.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()

In [None]:
# Number of missing values in each column.
data.isnull().sum()

In [None]:
# Impute missing ages with the mean age.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the `data['Age']` selection: that chained in-place
# form is deprecated in pandas 2.x and leaves `data` unchanged once
# copy-on-write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data

In [None]:
# Preview the first eight rows.
data.head(8)

In [None]:
# Column names as a NumPy array.
data.columns.values

In [None]:
# Let's find out which features could contribute to a high survival rate.
# 1. Age and Sex
# Start with the class balance: number of passengers per 'Survived' value.
data.groupby('Survived').size()

In [None]:
# Bar chart of the number of passengers per 'Survived' value, with the two
# ticks relabelled to readable names.
sns.countplot(data=data, x='Survived')
plt.xticks(ticks=[0, 1], labels=['Not Survived', 'Survived'])
plt.show()

In [None]:
# Number of passengers of each sex.
data.groupby('Sex').size()

In [None]:
# Bar chart of the number of passengers per sex.
# The bar order is pinned explicitly: for a string column seaborn draws the
# categories in order of first appearance, so the hard-coded tick labels below
# would be swapped if the first row of the file happened to be 'female'.
sns.countplot(x='Sex', data=data, order=['male', 'female'])
plt.xticks([0, 1], ['Male', 'Female'])
plt.show()

In [None]:
# Histogram of the passenger ages.
data['Age'].hist()

# Pclass 

In [None]:
# Survival rate per passenger class: each bar is the mean of 'Survived'
# within that Pclass.
sns.barplot(x='Pclass',y='Survived',data=data)

In [None]:
# Grid of age histograms: one row per Pclass, one column per Survived value.
grid=sns.FacetGrid(data,col='Survived',row='Pclass')
grid.map(plt.hist,'Age',alpha=0.5,bins=20)
# NOTE(review): no `hue` is set on the grid, so this legend is probably empty — confirm.
grid.add_legend()

# SibSp and Parch
Finding out if someone is alone or not

In [None]:
# Family size aboard: siblings/spouses plus parents/children.
data['relatives'] = data['SibSp'] + data['Parch']
# Flag passengers travelling with at least one relative (1) versus alone (0).
# The earlier version had the two values swapped: it stored 1 when
# relatives == 0, i.e. for passengers who WERE alone, contradicting the
# column name.
data['not_alone'] = (data['relatives'] > 0).astype(int)
data['not_alone'].value_counts()

In [None]:
# Mean of 'Survived' for each value of 'relatives' (number of family members aboard).
axes=sns.pointplot(x='relatives',y='Survived', data=data)

# Data Preprocessing
Missing Data 
(Extracting the deck letter from each cabin value and encoding it as a new numeric feature. Missing values are converted to zero.)

In [None]:
import re

# Numeric code for each deck letter; any other letter ends up as 0 below.
deck = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7, 'H': 8}
leading_letters = re.compile("([a-zA-Z]+)")

# Missing cabins get the placeholder 'U1'; its letter 'U' is not in `deck`,
# so those rows map to NaN and are then filled with 0.
data['Cabin'] = data['Cabin'].fillna('U1')
deck_letter = data['Cabin'].map(lambda cabin: leading_letters.search(cabin).group())
data['Deck'] = deck_letter.map(deck).fillna(0).astype(int)

# Dropping the cabin feature
data = data.drop(columns=['Cabin'])
data


In [None]:
# Re-check column dtypes and non-null counts after replacing Cabin with Deck.
data.info()

# Converting Fare from float to int64

In [None]:
# Treat a missing fare as 0, then truncate every fare to a whole number.
data['Fare'] = data['Fare'].fillna(0).astype(int)
data

# Extracting the titles from the passengers' names and dropping the names

In [None]:
# Integer code for each title group; any title not listed here becomes 0 below.
titles = {'Mr': 1, 'Mrs': 2, 'Miss': 3, 'Master': 4, 'Rare': 5}

# The title is the first run of letters that is followed by a period.
# A raw string is used so that '\.' is not an invalid string escape.
data['Title'] = data['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)

# Collapse uncommon titles into a single 'Rare' group.
# 'Countess' was previously misspelled 'Countness', so it never matched and
# fell through to code 0 instead of 'Rare'.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
               'Rev', 'Sir', 'Jonkheer', 'Dona']
data['Title'] = data['Title'].replace(rare_titles, 'Rare')

# Fold spelling variants into their common equivalents.
data['Title'] = data['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Encode as integers; unmapped or missing titles become 0. The cast to int
# keeps the column integer-typed like the other encoded features (map() plus
# fillna() would otherwise leave it as float).
data['Title'] = data['Title'].map(titles).fillna(0).astype(int)

# The raw name is no longer needed once the title has been extracted.
data = data.drop(['Name'], axis=1)
data



# Converting the sex feature into numeric

In [None]:
# Encode sex numerically: 'male' -> 0, 'female' -> 1.
gender={'male':0,'female':1}
# Series.map turns any value that is not a key of `gender` into NaN, so
# running this cell a second time (on the already-encoded column) would
# replace every value with NaN.
data['Sex']=data['Sex'].map(gender)
data

In [None]:
# Summarise the Ticket column to see how many distinct values it contains.
data['Ticket'].describe()

In [None]:
# With 363 unique ticket values the column is close to one value per row,
# so the Ticket feature is removed from the dataset.
data = data.drop(columns=['Ticket'])
data

# Creating categories
For Age and Fare

In [None]:
#Converting the age feature to int from float
data['Age']=data['Age'].astype(int)
data.loc[data['Age']<=11,'Age']=0
data.loc[(data['Age']>11) & (data['Age']<=20),'Age']=1
data.loc[(data['Age']>20) & (data['Age']<=25),'Age']=2
data.loc[(data['Age']>25) & (data['Age']<=30),'Age']=3
data.loc[(data['Age']>30) & (data['Age']<=40),'Age']=4
data.loc[(data['Age']>40) & (data['Age']<=50),'Age']=5
data.loc[(data['Age']>50) & (data['Age']<=60),'Age']=6
data.loc[data['Age']>60,'Age']=7
data['Age'].value_counts()
                             

In [None]:
# Preview the first ten rows after the encoding steps above.
data.head(10)

# Creating new features
1. Fare per person
2. Class times age of the passengers 

In [None]:
# 1. Fare per person: the fare divided by the size of the travelling party
# (the passenger plus their relatives), truncated to a whole number.
party_size = data['relatives'] + 1
data['FarePerPerson'] = (data['Fare'] / party_size).astype(int)
data

In [None]:
# 2. Class times the age of the passengers.
# 'Age' already holds the age-bracket index (0-7) assigned earlier, not the
# age in years, so this is bracket index * Pclass.
data['AgeClass']=data['Age']*data['Pclass']
data

In [None]:
# Remove the Embarked column; it is not used as a model feature.
data = data.drop(columns=['Embarked'])
data

In [None]:
# Remove the PassengerId column; it is not used as a model feature.
data = data.drop(columns=['PassengerId'])
data

# Modeling the dataset
I will train a Logistic Regression machine learning model

In [None]:
# Separate the target label from the feature matrix (every remaining column).
y = data['Survived']
x = data.drop(columns='Survived')
x.head()

In [None]:
# Preview the first rows of the target.
y.head()

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible between runs.
x_train,x_test,y_train,y_test=model_selection.train_test_split(x,y,test_size=0.2,random_state=1)

In [None]:
# Fit a logistic-regression classifier (liblinear solver) on the training split.
model=LogisticRegression(solver='liblinear')
model.fit(x_train,y_train)

In [None]:
# Predicted 'Survived' label for each held-out test row.
prediction=model.predict(x_test)

In [None]:
# Per-class precision, recall, F1 and support on the test set.
report=classification_report(y_test,prediction)
print(report)

In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels.
print(confusion_matrix(y_test,prediction))

In [None]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test,prediction)