# 🧹 Task 1: Data Cleaning & Preprocessing
This notebook covers:
- Importing and exploring the dataset
- Handling missing values
- Encoding categorical features
- Normalizing/standardizing numerical features
- Detecting and removing outliers

Dataset: Titanic Dataset

In [5]:

# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scikit-learn.preprocessing import StandardScaler, MinMaxScaler

# Show all columns in pandas
pd.set_option("display.max_columns", None)


SyntaxError: invalid syntax (3575295231.py, line 6)

In [None]:

# Step 2: Load the dataset (download from Kaggle if not available)
 #Link: ://www.kaggle.com/dhttpsatasets/yasserh/titanic-dataset
# Make sure to keep 'titanic.csv' in the same folder as this notebook
df = pd.read_csv("titanic.csv")

# Display first few rows
df.head()


In [None]:

# Step 3: Basic Info about Dataset
print("Shape of dataset:", df.shape)
print("\nDataset Info:")
print(df.info())

print("\nMissing values per column:")
print(df.isnull().sum())


In [None]:

# Step 4: Handle Missing Values

# Fill missing 'Age' with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing 'Embarked' with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop 'Cabin' due to too many missing values
df.drop(columns=['Cabin'], inplace=True)

print("Missing values after cleaning:")
print(df.isnull().sum())


In [None]:

# Step 5: Encode Categorical Variables

# One-Hot Encoding for 'Sex' and 'Embarked'
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

df.head()


In [None]:

# Step 6: Normalize/Standardize Numerical Columns

scaler = StandardScaler()
numerical_features = ['Age', 'Fare']
df[numerical_features] = scaler.fit_transform(df[numerical_features])

df.head()


In [None]:

# Step 7: Outlier Detection using Boxplot

plt.figure(figsize=(12, 5))
for i, col in enumerate(numerical_features, 1):
    plt.subplot(1, 2, i)
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
plt.tight_layout()
plt.show()


In [None]:

# Step 8: Remove Outliers using IQR Method

for col in numerical_features:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

print("Shape after removing outliers:", df.shape)


In [None]:

# Step 9: Save Cleaned Dataset
df.to_csv("titanic_cleaned.csv", index=False)
print("✅ Cleaned dataset saved as 'titanic_cleaned.csv'")
