# Data Introduction

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the passenger table from the CSV file that sits next to this notebook.
data = pd.read_csv(filepath_or_buffer='Titanic.csv')

In [3]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Understand the Features

###### PassengerId: A unique index for each passenger.
###### Survived: Shows if the passenger survived or not (1 = survived, 0 = did not survive).
###### Pclass: Passenger Class (1 = First class ticket, 2 = Second class ticket, 3 = Third class ticket).
###### Name: The passenger's name, which also contains the title (Mr., Mrs., Doctor, et cetera).
###### Sex: The passenger's gender (male or female).
###### Age: The passenger's age (NaN if the age was not recorded).
###### SibSp: The number of siblings and spouses traveling with the passenger.
###### Parch: The number of parents and children traveling with the passenger.
###### Ticket: The passenger's ticket number.
###### Fare: The price of the passenger's ticket.
###### Cabin: The passenger's cabin number (NaN if the cabin number was not recorded).
###### Embarked: The port where the passenger boarded (C = Cherbourg, Q = Queenstown, S = Southampton).

In [4]:
# (rows, columns) of the raw table.
data.shape

(1309, 12)

In [5]:
# Per-column dtypes and non-null counts; columns whose count is below the row
# total contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     1309 non-null   int64  
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(2), int64(5), object(5)
memory usage: 122.8+ KB


In [6]:
# Count the missing entries in every column of the raw table.
data.isna().sum()

PassengerId       0
Survived          0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [7]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.377387,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.484918,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [8]:
# Separate the label column from the predictor columns.
target = data['Survived']
features = data.drop(columns='Survived')

In [9]:
# Preview the predictor columns (the label column is no longer present).
features.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Preview the first five survival labels.
target.head(n=5)

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Data Cleaning

In [11]:
# Name and Ticket are not used as model inputs in this notebook, so remove them.
# Drop by column label: passing a DataFrame slice as `labels` only worked
# because iterating a DataFrame happens to yield its column names.
features = features.drop(columns=['Name', 'Ticket'])

In [12]:
# Check that the Name and Ticket columns are gone.
features.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.25,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.925,,S
3,4,1,female,35.0,1,0,53.1,C123,S
4,5,3,male,35.0,0,0,8.05,,S


In [13]:
# Missing-value count per predictor column before any imputation.
features.isna().sum()

PassengerId       0
Pclass            0
Sex               0
Age             263
SibSp             0
Parch             0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [14]:
# Fill the gaps in the two numeric columns with each column's own median.
# NOTE(review): the medians are computed over all rows, including rows that
# later land in the test split — consider imputing after the split.
median_fill = {
    'Age': features['Age'].median(),
    'Fare': features['Fare'].median(),
}
features = features.fillna(value=median_fill)

In [15]:
# Confirm that Age and Fare no longer contain missing values.
features.isna().sum()

PassengerId       0
Pclass            0
Sex               0
Age               0
SibSp             0
Parch             0
Fare              0
Cabin          1014
Embarked          2
dtype: int64

In [16]:
# Cabin is missing for most rows (see the null counts above), so remove the
# column instead of imputing it. Drop by column label rather than passing a
# DataFrame slice as `labels`.
features = features.drop(columns=['Cabin'])

In [17]:
# Only Embarked should still report missing values at this point.
features.isna().sum()

PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
dtype: int64

In [18]:
# Remove the rows whose boarding port is unknown from BOTH the predictors and
# the labels, using the same row labels for each so the two stay aligned.
rows_missing_port = features.index[features['Embarked'].isna()]
features = features.drop(index=rows_missing_port)
target = target.drop(index=rows_missing_port)

# Keep the original row labels on both objects. Renumbering only `target`
# (as this cell used to) leaves the two indexes disagreeing, so any
# label-aligned operation would silently pair the wrong rows.
assert features.index.equals(target.index), 'features and target are misaligned'

In [19]:
# Re-check dtypes and non-null counts after the cleaning steps above.
features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1307 entries, 0 to 1308
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1307 non-null   int64  
 1   Pclass       1307 non-null   int64  
 2   Sex          1307 non-null   object 
 3   Age          1307 non-null   float64
 4   SibSp        1307 non-null   int64  
 5   Parch        1307 non-null   int64  
 6   Fare         1307 non-null   float64
 7   Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(2)
memory usage: 91.9+ KB


In [20]:
# Preview the cleaned predictors before encoding the text columns.
features.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [21]:
# List the distinct labels in Sex so the numeric encoding below covers all of them.
features['Sex'].unique()

array(['male', 'female'], dtype=object)

In [22]:
# List the distinct labels in Embarked so the numeric encoding below covers all of them.
features['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

In [23]:
# Replace the text categories with integer codes so the estimator can use them.
sex_codes = {'male': 0, 'female': 1}
embarked_codes = {'C': 0, 'Q': 1, 'S': 2}

# `.get(value, value)` leaves anything that is not a known label unchanged.
# A plain `.map(dict)` turns unmapped values into NaN, so running this cell a
# second time (when the columns already hold codes) would wipe both columns.
features['Sex'] = features['Sex'].map(lambda value: sex_codes.get(value, value))
features['Embarked'] = features['Embarked'].map(
    lambda value: embarked_codes.get(value, value)
)

# Fail loudly if a label outside the mappings slipped through.
assert features['Sex'].isin(list(sex_codes.values())).all(), \
    'unexpected value in Sex'
assert features['Embarked'].isin(list(embarked_codes.values())).all(), \
    'unexpected value in Embarked'

# NOTE(review): Embarked is a nominal category; the codes 0/1/2 impose an
# ordering that a linear model will interpret numerically — consider one-hot
# encoding.

In [24]:
# Preview the predictors after Sex and Embarked were converted to integer codes.
features.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,0,22.0,1,0,7.25,2
1,2,1,1,38.0,1,0,71.2833,0
2,3,3,1,26.0,0,0,7.925,2
3,4,1,1,35.0,1,0,53.1,2
4,5,3,0,35.0,0,0,8.05,2


# Split the Data

In [25]:
from sklearn.model_selection import train_test_split

# Use 80% of the rows for training and keep the rest for evaluation; the fixed
# seed makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.8,
    random_state=42,
)

# Select and Train the Model

In [26]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression classifier created with the library's default settings.
model = LogisticRegression()

In [27]:
import warnings
from sklearn.exceptions import ConvergenceWarning

# Silence only the solver's convergence notice instead of every warning in the
# kernel; a blanket 'ignore' also hides deprecation and data-quality warnings
# raised by any library in later cells.
# NOTE(review): assumes the convergence warning from the fit in the next cell
# is what the blanket filter was meant to hide — confirm. Scaling the features
# or raising max_iter would remove that warning at its source.
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [28]:
# Fit the classifier on the training split.
model.fit(x_train, y_train)

LogisticRegression()

# Evaluate the Model

In [29]:
# Predict survival labels for the held-out rows.
prediction = model.predict(x_test)

In [30]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predictions against the held-out labels.
report_text = classification_report(y_true=y_test, y_pred=prediction)
print(report_text)

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       166
           1       0.90      0.73      0.80        96

    accuracy                           0.87       262
   macro avg       0.88      0.84      0.85       262
weighted avg       0.87      0.87      0.87       262



# Re-evaluate with a Different Train/Test Split

In [31]:
# Re-split with a larger training share and a different seed, then refit the
# same `model` object and score it on the new held-out rows.
# NOTE(review): train_size and random_state describe the split, not the
# estimator; scores measured on different test sets are not directly
# comparable, so a higher number here does not show the model improved.
# This refit overwrites the earlier fit, so it is the version that gets
# pickled in the last cell.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.88,
    random_state=3,
)
model.fit(x_train, y_train)
prediction = model.predict(x_test)
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       105
           1       0.82      0.81      0.82        52

    accuracy                           0.88       157
   macro avg       0.86      0.86      0.86       157
weighted avg       0.88      0.88      0.88       157



# Save the Model

In [32]:
import pickle

In [33]:
# Serialise the fitted estimator to disk so it can be reloaded without retraining.
# Only unpickle this file from a trusted location: loading a pickle can execute
# arbitrary code.
with open('Titanic.pkl', mode='wb') as model_file:
    pickle.dump(model, model_file)