# Data Preprocessing Using scikit-learn (sklearn)

In [43]:
import seaborn as sns
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [44]:
# Load the "titanic" dataset through seaborn's dataset loader
titanic = sns.load_dataset("titanic")
# Preview the first rows to get a feel for the columns and their values
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [45]:
# Summarize dtypes and non-null counts per column to spot columns with missing values
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [46]:
# Missing-value handling for 'age'
# Replace the missing ages with the column mean using SimpleImputer
# (SimpleImputer is already imported in the notebook's import cell).

# Mean-strategy imputer
imputer = SimpleImputer(strategy="mean")

# Learn the mean from 'age' and fill the gaps in one step; the column is
# selected with double brackets so it is passed as a one-column DataFrame
titanic["age"] = imputer.fit_transform(titanic[["age"]])

# Sanity check: number of missing ages left after imputation
titanic["age"].isna().sum()

np.int64(0)

In [47]:
# Drop the 'deck' column (only 203 of its 891 values are non-null).
# Reassigning instead of using inplace=True, together with errors="ignore",
# keeps this cell safe to re-run: a second execution is a no-op rather than
# a KeyError for the already-removed column.
titanic = titanic.drop(columns=["deck"], errors="ignore")

In [48]:
# Largest fare in the data, as a first look for extreme values
titanic["fare"].max()

np.float64(512.3292)

In [49]:
# Smallest fare in the data
titanic['fare'].min()

np.float64(0.0)

In [50]:
# Frequency of each distinct fare value before any outlier handling
titanic["fare"].value_counts()

fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 248, dtype: int64

In [51]:
# Remember the (rows, columns) shape so the number of rows dropped by the
# outlier filter can be computed afterwards
shape_before = titanic.shape

# The maximum value of the "fare" column is 512.3292, which looks like an outlier

So we use the IQR (interquartile range) rule to detect and remove the outliers from this column.

In [52]:
# Outlier Detection and Removal
# Detect and remove outliers in 'fare' based on the Interquartile Range (IQR)
Q1 = titanic['fare'].quantile(0.25)
Q3 = titanic['fare'].quantile(0.75)
IQR = Q3 - Q1

# 1.5 * IQR rule: anything further than 1.5 * IQR outside the quartiles is
# treated as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only the rows whose fare lies within [lower_bound, upper_bound]
# (both ends inclusive). The original row labels are kept; the index is not reset.
# .copy() turns the filtered result into an independent DataFrame so that the
# column assignments in later cells do not trigger SettingWithCopyWarning.
titanic = titanic[titanic['fare'].between(lower_bound, upper_bound)].copy()

# Count the removed rows; shape_before was captured in an earlier cell
shape_after = titanic.shape
total_outliers = shape_before[0] - shape_after[0]

print(f'Total number of outliers which were removed is :{total_outliers} ')

Total number of outliers which were removed is :116 


In [53]:
# Frequency of each distinct fare value after the outlier rows were removed
titanic['fare'].value_counts()

fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 203, dtype: int64

In [54]:
# Largest fare remaining after outlier removal
titanic['fare'].max()

np.float64(65.0)

In [55]:
# Smallest fare remaining after outlier removal
titanic['fare'].min()

np.float64(0.0)

In [56]:
# Normalization
# Rescale 'fare' into the 0-1 range with min-max scaling
# (MinMaxScaler is already imported in the notebook's import cell).

# Scaler instance used for the 'fare' column
min_max_scaler = MinMaxScaler()

# Fit and transform in one step; the column is selected with double brackets
# so it is passed as a one-column DataFrame. The result is stored in a new column.
titanic["fare_normalized"] = min_max_scaler.fit_transform(titanic[["fare"]])

# The original 'fare' column is left unchanged by the step above
titanic["fare"].value_counts()


fare
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
13.8583     1
50.4958     1
5.0000      1
9.8458      1
10.5167     1
Name: count, Length: 203, dtype: int64

In [57]:
# Standardization
# Standardize 'age' to have a mean of 0 and a standard deviation of 1.
# The scaled values go into a new column; 'age' itself is left unchanged.
scaler_std = StandardScaler()
titanic['age_standardized'] = scaler_std.fit_transform(titanic[['age']])

In [58]:
# Binning
# Transform the 'age' column into three discrete categories.
# pd.cut uses right-closed intervals by default, so the groups are
# (0, 18] -> Child, (18, 60] -> Adult, (60, 100] -> Senior
# (an 18-year-old is therefore labelled 'Child').
titanic['age_binned'] = pd.cut(titanic['age'], bins=[0, 18, 60, 100], labels=['Child', 'Adult', 'Senior'])

# Preview a random sample of rows; the fixed seed makes the preview show the
# same rows on every run
titanic.sample(20, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_normalized,age_standardized,age_binned
643,1,3,male,29.699118,0,0,56.4958,S,Third,man,True,Southampton,yes,True,0.869166,0.046606,Adult
157,0,3,male,30.0,0,0,8.05,S,Third,man,True,Southampton,no,True,0.123846,0.070164,Adult
398,0,2,male,23.0,0,0,10.5,S,Second,man,True,Southampton,no,True,0.161538,-0.477921,Adult
607,1,1,male,27.0,0,0,30.5,S,First,man,True,Southampton,yes,True,0.469231,-0.164729,Adult
616,0,3,male,34.0,1,1,14.4,S,Third,man,True,Southampton,no,False,0.221538,0.383356,Adult
556,1,1,female,48.0,1,0,39.6,C,First,woman,False,Cherbourg,yes,False,0.609231,1.479526,Adult
49,0,3,female,18.0,1,0,17.8,S,Third,woman,False,Southampton,no,False,0.273846,-0.86941,Child
810,0,3,male,26.0,0,0,7.8875,S,Third,man,True,Southampton,no,True,0.121346,-0.243027,Adult
751,1,3,male,6.0,0,1,12.475,S,Third,child,False,Southampton,yes,False,0.191923,-1.808984,Child
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,Southampton,no,True,0.0,1.557824,Adult


In [59]:
# Feature Engineering
# Derive 'family_size' as sibsp + parch + 1
# (the +1 presumably counts the passenger themselves)
titanic["family_size"] = titanic["sibsp"].add(titanic["parch"]).add(1)
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,embark_town,alive,alone,fare_normalized,age_standardized,age_binned,family_size
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,Southampton,no,False,0.111538,-0.556219,Adult,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,Southampton,yes,True,0.121923,-0.243027,Adult,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,Southampton,yes,False,0.816923,0.461654,Adult,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,Southampton,no,True,0.123846,0.461654,Adult,1
5,0,3,male,29.699118,0,0,8.4583,Q,Third,man,True,Queenstown,no,True,0.130128,0.046606,Adult,1


In [60]:
# Feature Selection
# Score each candidate feature against 'survived' with the chi-squared
# statistic (not a correlation coefficient) and keep the 3 best-scoring ones.
y = titanic["survived"]
X = titanic[["pclass", "age", "sibsp", "parch", "fare_normalized"]]

selector = SelectKBest(score_func=chi2, k=3)
# Fit the selector, then reduce X to the selected columns
X_selected = selector.fit(X, y).transform(X)

In [61]:
# Encoding Categorical Variables
# Turn the string values of 'sex' into integer codes with LabelEncoder
label_encoder = LabelEncoder()
sex_codes = label_encoder.fit_transform(titanic["sex"])
titanic["sex_encoded"] = sex_codes

# Count the rows per code (label_encoder.classes_ holds the labels behind the codes)
titanic["sex_encoded"].value_counts()

sex_encoded
1    531
0    244
Name: count, dtype: int64

In [62]:
# Convert 'embarked' into binary indicator columns using One-Hot Encoding
one_hot_encoder = OneHotEncoder()
encoded_embarked = one_hot_encoder.fit_transform(titanic[['embarked']]).toarray()
embarked_columns = one_hot_encoder.get_feature_names_out(['embarked'])

# Build the indicator frame on titanic's own index. The outlier filter left
# gaps in the row labels, so a default 0..n-1 index would make join() pair
# rows by the wrong label and leave NaN for every label above n-1.
embarked_dummies = pd.DataFrame(
    encoded_embarked, columns=embarked_columns, index=titanic.index
)

# Drop indicator columns left over from a previous run of this cell so that
# re-executing it does not fail on overlapping column names, then attach
# the freshly encoded columns.
titanic = titanic.drop(columns=list(embarked_columns), errors="ignore").join(embarked_dummies)

In [63]:
# Data Splitting
# Split the data into training (80%) and testing (20%) sets
X = titanic[['pclass', 'sex_encoded', 'age_standardized', 'sibsp', 'parch', 'fare_normalized', 'family_size']]

# Re-derive the target from the same frame as X so both are guaranteed to be
# row-aligned, instead of relying on the 'y' left over from the
# feature-selection cell.
y = titanic['survived']

# NOTE(review): the imputer and both scalers were fitted on the full dataset
# before this split, so statistics of the test rows leak into the training
# features. For a real evaluation, split first and fit them on the training
# set only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)