In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
# enable_iterative_imputer must be imported before IterativeImputer: it is the
# opt-in switch for scikit-learn's experimental iterative imputer.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, KNNImputer

In [2]:
# Load seaborn's example Titanic passenger dataset into the working frame.
df = sns.load_dataset("titanic")

# Preview the first five rows to check the columns and typical values.
df.head(n=5)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [4]:
# Share of missing values per column, expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)

survived        0.000000
pclass          0.000000
sex             0.000000
age            19.865320
sibsp           0.000000
parch           0.000000
fare            0.000000
embarked        0.224467
class           0.000000
who             0.000000
adult_male      0.000000
deck           77.216611
embark_town     0.224467
alive           0.000000
alone           0.000000
dtype: float64

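As a rule of thumb, when more than 50% of a column's values are missing we drop the column instead of imputing it. Here `deck` is about 77% missing, so it is the candidate for removal.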

## Remove columns 

In [5]:


# List the column labels so the ones to keep can be copied from the output.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

## Option 1: select every column except `deck` by listing the columns to keep

In [6]:


# Alternative way to remove `deck` (intentionally left disabled): re-select the
# frame with an explicit list of every column except `deck`.
# df= df[['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
#        'embarked', 'class', 'who', 'adult_male', 'embark_town',
#        'alive', 'alone']]

# df.columns

In [7]:
# Show the current column labels of the working frame.
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

## Option 2: drop the `deck` column with `DataFrame.drop`

In [8]:


# Remove the `deck` column (about 77% of its values are missing).
# Rebinding `df` instead of using inplace=True, together with errors="ignore",
# keeps this cell idempotent: re-running it after `deck` is already gone no
# longer raises KeyError.
df = df.drop(columns="deck", errors="ignore")

# Confirm that `deck` is no longer among the columns.
df.columns


Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')

In [9]:
# Re-count missing values per column after the column removal.
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
embark_town      2
alive            0
alone            0
dtype: int64

## Fill nulls with the mode (most frequent value)

In [10]:


# Mode imputation: replace the missing values in each column with that
# column's most frequent value (mode() returns a Series; [0] takes the first mode).
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: that form is chained assignment, which
# emits a FutureWarning on recent pandas and does not update `df` at all once
# Copy-on-Write is enabled.
df["embarked"] = df["embarked"].fillna(df["embarked"].mode()[0])
df["embark_town"] = df["embark_town"].fillna(df["embark_town"].mode()[0])

## fill nulls with mean or median 

In [11]:


# Mean imputation: replace missing ages with the mean of the observed ages.
# Assigned back to the column rather than using inplace=True on df["age"],
# which is chained assignment and is not guaranteed to modify `df`
# (it is a no-op under pandas Copy-on-Write).
df["age"] = df["age"].fillna(df["age"].mean())

# Alternative: use the median instead of the mean.
# df["age"] = df["age"].fillna(df["age"].median())





In [11]:
# Print the mean of the `age` column; this is the value used for the mean fill.
# NOTE(review): if the mean fill has already run, the result is unchanged,
# because filling gaps with the mean does not move the mean.
print(df["age"].mean())

29.69911764705882


## Replace nulls with a constant value

In [12]:


# Constant-value imputation: replace missing ages with one fixed number.
# 29.69 is the mean age reported earlier in the notebook (29.699...), truncated.
AGE_FILL_VALUE = 29.69

# Assign the result back instead of using inplace=True on df["age"]: the
# in-place form is chained assignment and does not update `df` under pandas
# Copy-on-Write.
# NOTE(review): if an earlier fill already removed every null in `age`,
# this cell changes nothing and only demonstrates the technique.
df["age"] = df["age"].fillna(value=AGE_FILL_VALUE)


In [14]:
# Verify that no missing values remain in any column.
df.isna().sum()

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
embark_town    0
alive          0
alone          0
dtype: int64

## Linear interpolation

In [18]:
# Linear interpolation: each gap is filled from the neighbouring observed ages.
# interpolate() already returns the observed values unchanged with the gaps
# filled, so it can be assigned directly.
# NOTE(review): if `age` has no nulls left at this point, this is a no-op.
df["age"] = df["age"].interpolate(method="linear")

## KNN imputation 

In [16]:
# KNN imputation of `age`, demonstrated on a fresh copy of the raw dataset.
#
# KNNImputer finds neighbours using the features that are observed for a row.
# Given `age` alone, a row with a missing age has nothing to measure distance
# on and the imputer falls back to the column mean, i.e. plain mean imputation.
# The fully observed numeric columns below are therefore used as predictors.
KNN_PREDICTORS = ["pclass", "sibsp", "parch", "fare"]

# Load the Titanic dataset
titanic_data = sns.load_dataset("titanic")

# Print the initial count of null values in the 'age' column
print("Null values before imputation:")
print(titanic_data['age'].isnull().sum())

# Raw ages as an (n_rows, 1) array, NaNs included. Selecting with a list of
# columns already gives 2-D data, so no reshape is required. `age_column` and
# `age_2d` are kept under their original names for any cell that refers to them.
age_column = titanic_data[['age']]
age_2d = age_column.to_numpy()

# Standardise the predictors so that no single column (e.g. `fare`, which has
# the largest numeric range) dominates the Euclidean distance.
predictors = titanic_data[KNN_PREDICTORS]
predictors_scaled = (predictors - predictors.mean()) / predictors.std()
knn_features = predictors_scaled.assign(age=titanic_data['age'])

# Initialize the KNNImputer with the desired number of neighbors
knn_imputer = KNNImputer(n_neighbors=5)

# Impute on the full feature matrix; the result is a plain NumPy array whose
# columns follow the order of `knn_features`.
imputed_features = knn_imputer.fit_transform(knn_features)

# Keep only the imputed `age` column, as an (n_rows, 1) array like before.
age_position = knn_features.columns.get_loc('age')
imputed_age = imputed_features[:, [age_position]]

# Update the 'age' column with the imputed values
titanic_data['age'] = imputed_age.ravel()

# Print the count of null values after imputation
print("Null values after imputation:")
print(titanic_data['age'].isnull().sum())


Null values before imputation:
177
Null values after imputation:
0


## Multivariate imputation (IterativeImputer)

In [18]:


# Multivariate (iterative) imputation of `age`.
#
# IterativeImputer models each feature with missing values as a function of the
# other features, so it needs several columns: with a single column it simply
# returns the initial mean fill. The raw data is loaded here so the cell does
# not depend on arrays left behind by other cells and still sees the original
# missing ages.
ITERATIVE_FEATURES = ["pclass", "sibsp", "parch", "fare", "age"]
raw_features = sns.load_dataset("titanic")[ITERATIVE_FEATURES]

# The imputed ages are written back by position, so the row counts must match.
# NOTE(review): this also assumes `df` still has the raw dataset's row order.
assert len(raw_features) == len(df), "df no longer matches the raw dataset row-for-row"

# min_value=0 keeps the regression-based estimates from becoming negative ages;
# every feature used here is non-negative, so the bound is safe for all columns.
imputer = IterativeImputer(max_iter=10, random_state=0, min_value=0)
imputed_matrix = imputer.fit_transform(raw_features)

# Take the imputed `age` column out of the result matrix.
df["age"] = imputed_matrix[:, ITERATIVE_FEATURES.index("age")]