# Data Cleaning and Preprocessing Pipeline
- Load Dataset
- Check for missing values (Data Imputation)
- Encode the Categorical data
- Feature scaling

In [108]:
import numpy as np
import pandas as pd

# Load Dataset

In [109]:
# Read the Titanic passenger records from the relative path 'titanic.csv'
# (resolved against the current working directory) and preview the first rows.
titanic_csv = 'titanic.csv'
df = pd.read_csv(titanic_csv)
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [110]:
# Print a summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### You can draw a correlation matrix and drop features based on it, or drop some features directly

In [111]:
# Columns removed up front, on the working assumption (intuition, not a
# measured result) that they do not contribute to predicting survival,
# which is what this dataset is about.
columns_to_drop = ['Cabin', 'Name', 'PassengerId', 'Ticket', 'Embarked']
df = df.drop(columns=columns_to_drop)
df.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05
5,0,3,male,,0,0,8.4583
6,0,1,male,54.0,0,0,51.8625
7,0,3,male,2.0,3,1,21.075
8,1,3,female,27.0,0,2,11.1333
9,1,2,female,14.0,1,0,30.0708


# Check for missing values (Data Imputation)
- Delete a particular row if it has a null value for a particular feature.
- Delete a particular column if it has more than 75% of missing values. This method is advised only when there are enough samples in the data set.
- Replace all the NaN values with the mean, median or most frequent value of the column. Replacing missing values with one of these three approximations is a statistical approach to handling them. Note that computing these statistics on the full dataset (before a train/test split) leaks information from the test data into training.

In [112]:
# Remove columns in which more than 75% of the values are missing.
MAX_MISSING_FRACTION = 0.75

# Number of missing values per column (a Series indexed by column name).
null_values_per_col = df.isnull().sum()
print(type(null_values_per_col),null_values_per_col)

# Largest missing-value count a column may have and still be kept.
# int() floors the bound; because the counts are integers, `count > max_NAN`
# is equivalent to `count > MAX_MISSING_FRACTION * number_of_rows`.
max_NAN = int(MAX_MISSING_FRACTION * df.shape[0])

columns = df.columns.to_list()
columns_to_drop = [col for col in columns if null_values_per_col[col] > max_NAN]
df = df.drop(columns=columns_to_drop)

<class 'pandas.core.series.Series'> Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64


In [113]:
from sklearn.impute import SimpleImputer

# Fill every NaN with the most frequent value of its column.
# Unlike mean or median, this strategy also works for non-numeric columns.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [114]:
# Interactive help lookup removed so that Restart & Run All does not open the
# pager. Run `SimpleImputer?` ad hoc, or see the scikit-learn SimpleImputer
# reference, for the available strategies ('mean', 'median', 'most_frequent',
# 'constant').

In [115]:
# Fit the imputer on df and fill in the missing values.
#
# The imputer returns a plain NumPy array. Because df mixes text and numeric
# columns, that array has dtype=object, and a frame rebuilt from it would have
# every column typed as `object`. The label-encoding step further down encodes
# all object columns, so without restoring the dtypes it would also replace
# numeric columns such as Age and Fare by category codes.
original_dtypes = df.dtypes.to_dict()
imputed_values = imputer.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns, index=df.index).astype(original_dtypes)
df.head(n=20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22,1,0,7.25
1,1,1,female,38,1,0,71.2833
2,1,3,female,26,0,0,7.925
3,1,1,female,35,1,0,53.1
4,0,3,male,35,0,0,8.05
5,0,3,male,24,0,0,8.4583
6,0,1,male,54,0,0,51.8625
7,0,3,male,2,3,1,21.075
8,1,3,female,27,0,2,11.1333
9,1,2,female,14,1,0,30.0708


# Encode the Categorical data
Categorical data are variables that contain label values rather than numeric values. The number of possible values is often limited to a fixed set.
Some examples include:
- A “pet” variable with the values: “dog” and “cat”.
- A “color” variable with the values: “red”, “green” and “blue”.
- A “place” variable with the values: “first”, “second” and “third”.
- **Label encoding** simply converts each distinct value in such a column to a number.

In [117]:
from sklearn.preprocessing import LabelEncoder

# Numeric columns may be stored with dtype=object at this point (a frame
# rebuilt from an object array keeps every column as object). Recover their
# real dtypes first so that only non-numeric columns are label-encoded.
df = df.infer_objects()

# One encoder per encoded column, kept so each mapping can be inspected or
# inverted later via label_encoders[col].classes_ / .inverse_transform().
label_encoders = {}
for col in df.columns.to_list():
    if not pd.api.types.is_numeric_dtype(df[col]):
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Feature scaling
Feature scaling or data normalization is a method used to normalize the range of independent variables or features of data. So when the values vary a lot in an independent variable, we use feature scaling so that all the values remain in the comparable range.
- It leads to faster convergence.

In [118]:
from sklearn.preprocessing import StandardScaler

df_values = df.values
Y = df_values[:, 0]   # Y - dependent variable: first column of df
X = df_values[:, 1:]  # X - matrix of independent variables: all remaining columns

# Standardise the features with a scaler fitted on X itself.
sc_X = StandardScaler()
X = sc_X.fit(X).transform(X)