# Data Preprocessing Using scikit-learn (sklearn)

In [132]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [133]:
# Load seaborn's example "titanic" dataset into a DataFrame
titanic = sns.load_dataset("titanic")
# Preview the first five rows
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [134]:
# Summarize column dtypes and non-null counts to spot columns with missing values
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [135]:
# Handle the missing values in the 'age' column.
# SimpleImputer (already imported in the first cell) replaces each NaN with the
# mean of the column.

# initialize the imputer
imputer = SimpleImputer(strategy='mean')

# fit the imputer on 'age' and fill the gaps; fit_transform returns a 2-D
# (n_rows, 1) array, so flatten it to 1-D before storing it back as one column
titanic['age'] = imputer.fit_transform(titanic[['age']]).ravel()

# sanity check: number of ages that are still missing
titanic['age'].isnull().sum()

np.int64(0)

In [136]:
# 'deck' has only 203 non-null values out of 891 (see the info() output above),
# presumably why it is dropped instead of imputed.
# Rebinding the name (no inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
titanic = titanic.drop(columns="deck", errors="ignore")

In [137]:
# Largest fare in the data
titanic["fare"].max()

np.float64(512.3292)

In [138]:
# Smallest fare in the data
titanic['fare'].min()

np.float64(0.0)

In [139]:
# Frequency of each distinct fare value
titanic["fare"].value_counts()

fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 248, dtype: int64

In [140]:
# Remember the (rows, columns) shape before outlier removal so the number of dropped rows can be reported
shape_before = titanic.shape

# The maximum value of the "fare" column is 512.3292, which looks like an outlier

so we use the IQR (interquartile range) to detect and remove the outliers in this column.

In [141]:
# Outlier Detection and Removal
# Detect and remove outliers in 'fare' based on the Interquartile Range (IQR)
# NOTE: the quartiles are computed from the current `titanic`, so re-running this
# cell after it has already filtered the frame tightens the bounds further.
Q1 = titanic['fare'].quantile(0.25)
Q3 = titanic['fare'].quantile(0.75)

IQR = Q3 - Q1
# calculate the lower and upper bounds: 1.5 * IQR beyond the quartiles
upper_bound = Q3 + IQR * 1.5
lower_bound = Q1 - IQR * 1.5

# keep only the rows whose fare lies inside the bounds (both ends inclusive).
# .copy() makes the result an independent DataFrame, so adding columns to it
# later does not trigger pandas' SettingWithCopyWarning.
titanic = titanic[titanic['fare'].between(lower_bound, upper_bound)].copy()

# count how many rows were removed as outliers
shape_after = titanic.shape
total_outliers = shape_before[0] - shape_after[0]

print(f'Total number of outliers which were removed is :{total_outliers} ')

Total number of outliers which were removed is :116 


In [142]:
# Fare frequencies after the outlier filter
titanic['fare'].value_counts()

fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 203, dtype: int64

In [143]:
# Largest fare remaining after the outlier filter
titanic['fare'].max()

np.float64(65.0)

In [144]:
# Smallest fare remaining after the outlier filter
titanic['fare'].min()

np.float64(0.0)

In [145]:
# Normalization
# Rescale 'fare' to values between 0 and 1 with MinMaxScaler
# (already imported in the first cell)

# Create the MinMaxScaler object
min_max_scaler = MinMaxScaler()

# fit_transform returns a 2-D (n_rows, 1) array; flatten it to 1-D for a single column.
# assign() builds a new DataFrame instead of writing into the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
titanic = titanic.assign(
    fare_normalized=min_max_scaler.fit_transform(titanic[['fare']]).ravel()
)

# The original 'fare' column is left unchanged; the scaled values are in 'fare_normalized'
titanic['fare'].value_counts()


fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 203, dtype: int64

In [146]:
# Binning
# Transform the 'age' column into three discrete categories.
# pd.cut uses right-closed intervals by default:
#   (0, 18] -> Child, (18, 60] -> Adult, (60, 100] -> Senior
age_bin_edges = [0, 18, 60, 100]
age_bin_labels = ['Child', 'Adult', 'Senior']

# assign() builds a new DataFrame instead of writing into the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
titanic = titanic.assign(
    age_binned=pd.cut(titanic['age'], bins=age_bin_edges, labels=age_bin_labels)
)

# fixed random_state so the displayed sample is the same on every run
titanic.sample(20, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_normalized,age_binned
652,0,3,male,21.0,0,0,8.4333,S,Third,man,True,Southampton,no,True,0.129743,Adult
593,0,3,female,29.699118,0,2,7.75,Q,Third,woman,False,Queenstown,no,False,0.119231,Adult
636,0,3,male,32.0,0,0,7.925,S,Third,man,True,Southampton,no,True,0.121923,Adult
616,0,3,male,34.0,1,1,14.4,S,Third,man,True,Southampton,no,False,0.221538,Adult
217,0,2,male,42.0,1,0,27.0,S,Second,man,True,Southampton,no,False,0.415385,Adult
71,0,3,female,16.0,5,2,46.9,S,Third,woman,False,Southampton,no,False,0.721538,Child
343,0,2,male,25.0,0,0,13.0,S,Second,man,True,Southampton,no,True,0.2,Adult
735,0,3,male,28.5,0,0,16.1,S,Third,man,True,Southampton,no,True,0.247692,Adult
751,1,3,male,6.0,0,1,12.475,S,Third,child,False,Southampton,yes,False,0.191923,Child
298,1,1,male,29.699118,0,0,30.5,S,First,man,True,Southampton,yes,True,0.469231,Adult


In [147]:
# Feature Engineering
# Create a new feature 'family_size' = 'sibsp' + 'parch' + 1
# (the +1 presumably counts the passenger themself).
# assign() builds a new DataFrame instead of writing into the filtered frame,
# which avoids pandas' SettingWithCopyWarning.
titanic = titanic.assign(family_size=titanic['sibsp'] + titanic['parch'] + 1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_normalized,age_binned,family_size
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False,0.111538,Adult,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True,0.121923,Adult,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False,0.816923,Adult,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True,0.123846,Adult,1
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,Queenstown,no,True,0.130128,Adult,1


In [148]:
# Feature Selection
# Select the 3 features with the highest chi-squared score against 'survived'.
# chi2 is a chi-squared test statistic, not a correlation, and it requires
# non-negative feature values (true for the columns chosen here).
X = titanic[['pclass', 'age', 'sibsp', 'parch', 'fare_normalized']]
y = titanic['survived']
selector = SelectKBest(score_func=chi2, k=3)
X_selected = selector.fit_transform(X, y)

# X_selected carries no column names by default, so recover which columns were
# kept from the selector's boolean support mask
selected_features = X.columns[selector.get_support()].tolist()