In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split # to separe the test part from the train part
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer # to put sth in the empty places of the dataset with mean or median or...
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder # to do the preprocessing of the dataset
from sklearn.compose import ColumnTransformer

In [2]:
dataset=pd.read_csv("titanic_advanced.csv")
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 😊 We explore the dataset to learn more about it 

In [3]:
#Shape of the dataset
print("The shape of the dataset is :")
display(dataset.shape)
#The columns of the dataset
print("The columns of the dataset :")
display(dataset.columns)
#The type of the columns of the dataset 
print("The Type of columns of the dataset :")
display(dataset.dtypes)
#Some statistical information about the dataset
print(" Some statistical information about the dataset :")
display(dataset.describe(include="all"))
#The pourcentage of missing value in the columns of the dataset
print(" The pourcentage of missing value in the columns of the dataset:")
display(100*dataset.isnull().sum()/dataset.shape[0])

The shape of the dataset is :


(891, 12)

The columns of the dataset :


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

The Type of columns of the dataset :


PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

 Some statistical information about the dataset :


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


 The pourcentage of missing value in the columns of the dataset:


PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

### 😊Preprocessing with pandas

####  If we look at the columns of the dataset we see that we have some columns that should be dropped , as the column Name,PassegerId that are identifiers also we can see that the columns Ticket and Cabin have too much modalities so they should be dropped too .
#### But if we think wisely we can have information about the socio-economic background of the passenger thanks to the name column and we can also know if the passenger has a cabin number or not using the cabin column.

In [4]:
#We create a clumn HasCabin equal to 1
dataset["HasCabin"]=1
dataset.head()
#HasCabin should be equal to 0 when the cabin is missing 
dataset.loc[dataset["Cabin"].isnull(),"HasCabin"]=0
dataset.head()

#This part of the code checks if the cabin column in the dataset is null 'emtpy or missing value' , 
#the .loc method is used to access the rows that have NaN value in Cabin column and assign value 0 to the HasCabin
#rows that checks the first condition .




Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
#Extract the title from the person's name
def get_title(name):
    return name.split(', ')[1].split('.')[0]
#Splits the string where we have , and space then accesses to the next part of the string then split it where there is a .
#Create a new Column Title, containing the title of passenger name
dataset['Title'] = dataset['Name'].apply(get_title)
display(dataset.head())
dataset['Title'].value_counts()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,HasCabin,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0,Mr


Mr              517
Miss            182
Mrs             125
Master           40
Dr                7
Rev               6
Mlle              2
Major             2
Col               2
the Countess      1
Capt              1
Ms                1
Sir               1
Lady              1
Mme               1
Don               1
Jonkheer          1
Name: Title, dtype: int64

#### As you can see we have lots of passenger titles so we bring the smilar categoris under one name and create a new category for the rare titles

In [8]:
dataset['Title']=dataset['Title'].replace('Mlle','Miss')
dataset['Title']=dataset['Title'].replace('Ms','Miss')
dataset['Title']=dataset['Title'].replace('Mme','Mrs')
dataset['Title']=dataset['Title'].replace(['Lady', 'the Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],'Rare')
dataset['Title'].value_counts()

Mr        517
Miss      185
Mrs       126
Master     40
Rare       23
Name: Title, dtype: int64

#### We are done with extracting information from the columns ,let's finish our preprocessing with pandas

In [9]:
useless_cols=['PassengerId','Name','Ticket','Cabin']
print("Dropping useless columns")
dataset=dataset.drop(useless_cols, axis=1)
dataset.head()

Dropping useless columns


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,Title
0,0,3,male,22.0,1,0,7.25,S,0,Mr
1,1,1,female,38.0,1,0,71.2833,C,1,Mrs
2,1,3,female,26.0,0,0,7.925,S,0,Miss
3,1,1,female,35.0,1,0,53.1,S,1,Mrs
4,0,3,male,35.0,0,0,8.05,S,0,Mr


In [10]:
print("We separate the target variable from the other variables:")
X= dataset.loc[:, dataset.columns != "Survived"]
display(X)
Y=dataset.loc[:, dataset.columns == "Survived"]
display(Y)


We separate the target variable from the other variables:


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,Title
0,3,male,22.0,1,0,7.2500,S,0,Mr
1,1,female,38.0,1,0,71.2833,C,1,Mrs
2,3,female,26.0,0,0,7.9250,S,0,Miss
3,1,female,35.0,1,0,53.1000,S,1,Mrs
4,3,male,35.0,0,0,8.0500,S,0,Mr
...,...,...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S,0,Rare
887,1,female,19.0,0,0,30.0000,S,1,Miss
888,3,female,,1,2,23.4500,S,0,Miss
889,1,male,26.0,0,0,30.0000,C,1,Mr


Unnamed: 0,Survived
0,0
1,1
2,1
3,1
4,0
...,...
886,0
887,1
888,0
889,1


### Preprocessing with sklearn called also scikit-learn 
###### Scikit-Learn, also known as sklearn is a python library to implement machine learning models and statistical modelling

In [11]:
print("Dividing into train and test sets")
X_train, X_test, y_train, y_test= train_test_split(X, Y, test_size=0.15, random_state=0)
print("Done^^")
display(X_train)
X_train.dtypes

Dividing into train and test sets
Done^^


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,HasCabin,Title
545,1,male,64.0,0,0,26.0000,S,0,Mr
37,3,male,21.0,0,0,8.0500,S,0,Mr
214,3,male,,1,0,7.7500,Q,0,Mr
40,3,female,40.0,1,0,9.4750,S,0,Mrs
236,2,male,44.0,1,0,26.0000,S,0,Mr
...,...,...,...,...,...,...,...,...,...
835,1,female,39.0,1,1,83.1583,C,1,Miss
192,3,female,19.0,1,0,7.8542,S,0,Miss
629,3,male,,0,0,7.7333,Q,0,Mr
559,3,female,36.0,1,0,17.4000,S,0,Mrs


Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Embarked     object
HasCabin      int64
Title        object
dtype: object

In [12]:
print("Partial Creation of the preprocessing pipeline:")

print("Step 1 : for numerical features:")
num_features=["Pclass","Age","SibSp","Parch","Fare"]
num_transformer=Pipeline(steps=
                        [("num_imputer",SimpleImputer(strategy="median")),
                         ("scaler",StandardScaler())
                        ])
print("Done^^")

print("Step 2 : for categorical features:")
cat_features=["Sex","Embarked",'HasCabin','Title']
cat_transformer=Pipeline(steps=
                         [("cat_imputer",SimpleImputer(strategy="constant", fill_value="Unknown")),
                          ("cat_encoder",OneHotEncoder(drop="first"))
                         ])
print("Done^^")

print("Step 3 : Creation of the final pipeline ")
preprocessor=ColumnTransformer(transformers=[
    ("cat_transformer", cat_transformer, cat_features),
    ("num_transformer", num_transformer, num_features)
    ])
print("Done^^")

Partial Creation of the preprocessing pipeline:
Step 1 : for numerical features:
Done^^
Step 2 : for categorical features:
Done^^
Step 3 : Creation of the final pipeline 
Done^^


In [13]:
print("Transforming X_train and X_test with the pipline above:")
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test) #takes a little cause we reuse the pipeline
display(X_train[:5])
display(X_test[:5])
print("Done^^")

Transforming X_train and X_test with the pipline above:


array([[ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        , -1.60067161,
         2.62354063, -0.46346837, -0.46599785, -0.10960455],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.81068841,
        -0.66498389, -0.46346837, -0.46599785, -0.47113394],
       [ 1.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.81068841,
        -0.05316537,  0.4315458 , -0.46599785, -0.47717621],
       [ 0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.81068841,
         0.78808508,  0.4315458 , -0.46599785, -0.44243314],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        , -0.3949916 ,
         1.09399434,  0.43

array([[ 1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.81068841,
        -0.05316537, -0.46346837, -0.46599785, -0.34206493],
       [ 1.        ,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  0.81068841,
        -0.05316537, -0.46346837, -0.46599785, -0.4812044 ],
       [ 1.        ,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.81068841,
        -1.73566628,  3.11658831,  0.78050523, -0.0466642 ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ,  0.        ,  1.        ,  0.        , -1.60067161,
        -0.05316537,  0.4315458 , -0.46599785,  2.31779438],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  0.81068841,
        -0.05316537, -0.46

Done^^


### We train the model

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score
model=LogisticRegression()

print("We train the model:")
model.fit(X_train,y_train)
print("Done^^")

print("Model predictions on the training set")
y_train_pred=model.predict(X_train)
y_train_pred[:5]

print("Predictions on Test set")
y_test_pred=model.predict(X_test)
y_test_pred[:5]

from sklearn.metrics import accuracy_score
print("Accuracy on train set:",accuracy_score(y_train,y_train_pred))
print("Accuracy on test set:",accuracy_score(y_test,y_test_pred))
print("--------------------The end this exercice--------------------------")

We train the model:


  y = column_or_1d(y, warn=True)


Done^^
Model predictions on the training set
Predictions on Test set
Accuracy on train set: 0.8282694848084544
Accuracy on test set: 0.8208955223880597
--------------------The end this exercice--------------------------
