# Titanic Model

In [539]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression 
from sklearn import preprocessing
from sklearn.model_selection import train_test_split


Problem Statement: Create a binary classification model to predict if a passenger would have survived th titanic disaster or not. 

We start by loading the dataset and making sure each column has the proper datatype.

In [540]:
data_tr = pd.read_csv("train.csv")
data_te = pd.read_csv("test.csv")
data_tr.info()
data_tr.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


It looks like the data has been loaded up properly. After getting some information in the data we can see that two values are missing from the "embarked" column, "Age" and "Cabin" are missing a lot of values as well. Lets address those missing values as well as getting rid of the columns that wont give us much information for our model. 

In [541]:
data_tr.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)
data_tr['Age'].interpolate(inplace=True)
data_tr.dropna()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,22.5,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


Now we need to make the columns with strings into quantafiable data. 

In [542]:
dummies = []
columns = ['Sex', 'Embarked']
for col in columns:
    dummies.append(pd.get_dummies(data_tr[col]))
data2 = pd.concat(dummies, axis=1)
data2 = data2.astype('float')
data2.head()

Unnamed: 0,female,male,C,Q,S
0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0


Now that we have the 2 datasets lets combine them and get rid of the columns we dont need anymore. 

In [543]:
data_tr = pd.concat([data_tr,data2], axis=1)
data_tr.drop(['Sex', 'Embarked'], axis=1, inplace=True)
data_tr = data_tr.astype('float')
data_tr.set_index('PassengerId')


Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1.0,0.0,3.0,22.0,1.0,0.0,7.2500,0.0,1.0,0.0,0.0,1.0
2.0,1.0,1.0,38.0,1.0,0.0,71.2833,1.0,0.0,1.0,0.0,0.0
3.0,1.0,3.0,26.0,0.0,0.0,7.9250,1.0,0.0,0.0,0.0,1.0
4.0,1.0,1.0,35.0,1.0,0.0,53.1000,1.0,0.0,0.0,0.0,1.0
5.0,0.0,3.0,35.0,0.0,0.0,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
887.0,0.0,2.0,27.0,0.0,0.0,13.0000,0.0,1.0,0.0,0.0,1.0
888.0,1.0,1.0,19.0,0.0,0.0,30.0000,1.0,0.0,0.0,0.0,1.0
889.0,0.0,3.0,22.5,1.0,2.0,23.4500,1.0,0.0,0.0,0.0,1.0
890.0,1.0,1.0,26.0,0.0,0.0,30.0000,0.0,1.0,1.0,0.0,0.0


In [544]:
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Survived     891 non-null    float64
 2   Pclass       891 non-null    float64
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    float64
 5   Parch        891 non-null    float64
 6   Fare         891 non-null    float64
 7   female       891 non-null    float64
 8   male         891 non-null    float64
 9   C            891 non-null    float64
 10  Q            891 non-null    float64
 11  S            891 non-null    float64
dtypes: float64(12)
memory usage: 83.7 KB


In [545]:
data_te.drop(['Name','Ticket','Cabin'], axis=1, inplace=True)
data_te['Age'].fillna(data_te['Age'].mean(), inplace=True)

dummies2 = []
columns2 = ['Sex', 'Embarked']
for col in columns2:
    dummies2.append(pd.get_dummies(data_te[col]))
data3 = pd.concat(dummies2, axis=1)
data3 = data3.astype('float')

data_te = pd.concat([data_te,data3], axis=1)
data_te.drop(['Sex', 'Embarked'], axis=1, inplace=True)
data_te = data_te.astype('float')
data_te.set_index('PassengerId')


Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,female,male,C,Q,S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892.0,3.0,34.50000,0.0,0.0,7.8292,0.0,1.0,0.0,1.0,0.0
893.0,3.0,47.00000,1.0,0.0,7.0000,1.0,0.0,0.0,0.0,1.0
894.0,2.0,62.00000,0.0,0.0,9.6875,0.0,1.0,0.0,1.0,0.0
895.0,3.0,27.00000,0.0,0.0,8.6625,0.0,1.0,0.0,0.0,1.0
896.0,3.0,22.00000,1.0,1.0,12.2875,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
1305.0,3.0,30.27259,0.0,0.0,8.0500,0.0,1.0,0.0,0.0,1.0
1306.0,1.0,39.00000,0.0,0.0,108.9000,1.0,0.0,1.0,0.0,0.0
1307.0,3.0,38.50000,0.0,0.0,7.2500,0.0,1.0,0.0,0.0,1.0
1308.0,3.0,30.27259,0.0,0.0,8.0500,0.0,1.0,0.0,0.0,1.0


In [546]:
y_train = data_tr['Survived']
data_tr.drop(["Survived"], axis=1, inplace=True)

x_train = data_tr
x_strain, x_stest, y_strain, y_stest = train_test_split(x_train, y_train, test_size=0.22, random_state = 1)
x_scale = preprocessing.scale(x_strain)
x_2scale = preprocessing.scale(x_stest)



In [547]:
lr = LogisticRegression()
lr.fit(x_scale,y_strain)
lr.score(x_2scale, y_stest)


0.7868020304568528