In [85]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [86]:
data={
    'PassengerId': [1, 2, 3, 4, 5, 6, 7, 8],
    'Pclass': [3, 1, 3, 1, 3, 1, 2, 3],
    'Sex': ['male', 'female', 'female', 'female', 'male', 'male', np.nan, 'male'],
    'Age': [22, 38, 26, 35, 35, np.nan, 54, 2],
    'Fare': [7.25, 71.28, 7.92, 53.1, 8.05, 8.45, np.nan, 21.07],
    'Embarked': ['S', 'C', 'S', 'S', 'S', 'Q', 'S', np.nan],
    'Survived': [0, 1, 1, 1, 0, 0, 0, 0]
}
df=pd.DataFrame(data)
df

Unnamed: 0,PassengerId,Pclass,Sex,Age,Fare,Embarked,Survived
0,1,3,male,22.0,7.25,S,0
1,2,1,female,38.0,71.28,C,1
2,3,3,female,26.0,7.92,S,1
3,4,1,female,35.0,53.1,S,1
4,5,3,male,35.0,8.05,S,0
5,6,1,male,,8.45,Q,0
6,7,2,,54.0,,S,0
7,8,3,male,2.0,21.07,,0


#EDA (Exploratory Data Analysis)

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  8 non-null      int64  
 1   Pclass       8 non-null      int64  
 2   Sex          7 non-null      object 
 3   Age          7 non-null      float64
 4   Fare         7 non-null      float64
 5   Embarked     7 non-null      object 
 6   Survived     8 non-null      int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 580.0+ bytes


In [88]:
df.describe()

Unnamed: 0,PassengerId,Pclass,Age,Fare,Survived
count,8.0,8.0,7.0,7.0,8.0
mean,4.5,2.125,30.285714,25.302857,0.375
std,2.44949,0.991031,16.090518,26.186018,0.517549
min,1.0,1.0,2.0,7.25,0.0
25%,2.75,1.0,24.0,7.985,0.0
50%,4.5,2.5,35.0,8.45,0.0
75%,6.25,3.0,36.5,37.085,1.0
max,8.0,3.0,54.0,71.28,1.0


In [89]:
df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Sex,1
Age,1
Fare,1
Embarked,1
Survived,0


In [90]:
x=df.drop(['PassengerId','Survived'],axis=1)
y=df['Survived']

#Handling Missing Values -

In [91]:
imputer_numaric=SimpleImputer(strategy='mean')
imputer_categorical=SimpleImputer(strategy='most_frequent')

In [92]:

numeric_cols=['Age','Fare']
categorical_cols=['Sex','Embarked']

In [93]:
x[numeric_cols] =imputer_numaric.fit_transform(x[numeric_cols] )
x[categorical_cols] =imputer_categorical.fit_transform(x[categorical_cols])

In [94]:
x.isnull().sum()

Unnamed: 0,0
Pclass,0
Sex,0
Age,0
Fare,0
Embarked,0


#Encoding


In [95]:
oe=OrdinalEncoder(categories=[['female','male']])
x['Sex']=oe.fit_transform(x[['Sex']])

In [96]:
ohe=OneHotEncoder(drop='first',sparse_output=False)
embarked_encoded=ohe.fit_transform(x[['Embarked']])

In [97]:
embarked_encoded

array([[0., 1.],
       [0., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [98]:
embarked_df = pd.DataFrame(embarked_encoded, columns=ohe.get_feature_names_out(['Embarked']))

In [99]:
embarked_df

Unnamed: 0,Embarked_Q,Embarked_S
0,0.0,1.0
1,0.0,0.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0
5,1.0,0.0
6,0.0,1.0
7,0.0,1.0


In [100]:
x=x.drop('Embarked',axis=1)
x=pd.concat([x,embarked_df],axis=1)

In [101]:
x

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_Q,Embarked_S
0,3,1.0,22.0,7.25,0.0,1.0
1,1,0.0,38.0,71.28,0.0,0.0
2,3,0.0,26.0,7.92,0.0,1.0
3,1,0.0,35.0,53.1,0.0,1.0
4,3,1.0,35.0,8.05,0.0,1.0
5,1,1.0,30.285714,8.45,1.0,0.0
6,2,1.0,54.0,25.302857,0.0,1.0
7,3,1.0,2.0,21.07,0.0,1.0


#Train-Test Split

In [102]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=42)

In [103]:
x_train.shape

(6, 6)

In [104]:
x_test.shape

(2, 6)

# Feature Scaling

In [105]:
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

In [106]:
X_train_final = pd.DataFrame(x_train_scaled, columns=x_train.columns)
X_test_final = pd.DataFrame(x_test_scaled, columns=x_test.columns)

In [107]:
X_train_final

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_Q,Embarked_S
0,0.654654,0.707107,-0.445099,-0.814659,0.0,0.0
1,0.654654,0.707107,-1.716811,0.038341,0.0,0.0
2,0.654654,-1.414214,-0.190757,-0.773305,0.0,0.0
3,0.654654,0.707107,0.381514,-0.765281,0.0,0.0
4,-1.963961,-1.414214,0.381514,2.015302,0.0,0.0
5,-0.654654,0.707107,1.58964,0.299602,0.0,0.0


In [108]:
X_test_final

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked_Q,Embarked_S
0,-1.963961,-1.414214,0.57227,3.137411,0.0,-1.0
1,-1.963961,0.707107,0.081753,-0.740592,1.0,-1.0
