# Intermediate Tasks (Week 2) – Data Wrangling & Exploration

1. Data Cleaning

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Load seaborn's bundled "titanic" example dataset into a DataFrame.
titanic = sns.load_dataset(name="titanic")

# Preview the first five rows as the cell's displayed result.
titanic.head(n=5)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
# Column names, dtypes and non-null counts.
titanic.info()

# Number of missing entries in every column.
missing_per_column = titanic.isna().sum()
print("\nMissing values in each column:\n")
print(missing_per_column)

In [68]:
# Detect and filter outliers using IQR
def remove_outliers_iqr(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``; both ends
    are inclusive. The quartiles, IQR and fences are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the numeric column the fences are computed from.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with the original index
        labels. Returning a copy lets callers add or overwrite columns on the
        result without pandas' SettingWithCopyWarning.

    Notes
    -----
    Rows whose ``column`` value is missing (NaN) are dropped too, because a
    NaN never satisfies the bounds check.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR
    print(f"Q1:{Q1}, Q3: {Q3}, IQR: {IQR}, Lower Bound: {lower}, Upper Bound: {upper}")
    # between() is inclusive on both ends by default and evaluates to False for NaN.
    within_fences = df[column].between(lower, upper)
    return df[within_fences].copy()

# Filter 'fare' first, then 'age'. The order matters: the 'age' quartiles are
# computed on the rows that remain after the 'fare' filter.
for outlier_column in ('fare', 'age'):
    titanic = remove_outliers_iqr(titanic, outlier_column)

Q1:7.9104, Q3: 31.0, IQR: 23.0896, Lower Bound: -26.724, Upper Bound: 65.6344
Q1:20.0, Q3: 37.0, IQR: 17.0, Lower Bound: -5.5, Upper Bound: 62.5


In [None]:
# One box plot per filtered column, each shown as its own figure.
boxplot_specs = (
    ('fare', "Fare (After Removing Outliers)"),
    ('age', "Age (After Removing Outliers)"),
)
for plot_column, plot_title in boxplot_specs:
    sns.boxplot(x=titanic[plot_column])
    plt.title(plot_title)
    plt.show()

In [None]:
# Convert Data Types
# Build a dummy boarding date (the same string for every row) ...
boarding_dates = pd.Series('1912-04-10', index=titanic.index)

# ... and store it on the frame parsed as datetime values.
titanic['boarding_date'] = pd.to_datetime(boarding_dates)

# Show the dtype of every column, including the new one.
titanic.dtypes

In [71]:
# Rename Columns, Drop Columns, and Reset Index
# Give the abbreviated columns descriptive names.
descriptive_names = {
    'sex': 'gender',
    'pclass': 'passenger_class',
    'sibsp': 'siblings_spouses',
    'parch': 'parents_children',
}
titanic = titanic.rename(columns=descriptive_names)

In [72]:
# Discard the columns not needed (embark_town, who, alive), then renumber the
# rows from 0 and throw away the previous index labels.
unused_columns = ['embark_town', 'who', 'alive']
titanic = titanic.drop(columns=unused_columns).reset_index(drop=True)

 2. Exploratory Data Analysis (EDA)

In [None]:
# Analyze Feature Relationships Using Correlation Heatmap
# Keep only the int64 / float64 columns.
numeric_features = titanic.select_dtypes(include=['int64', 'float64'])

# Pairwise correlation between those columns.
corr_matrix = numeric_features.corr()

# Draw the annotated matrix on an explicitly created 10x6 axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Heatmap - Numerical Features")
plt.show()

In [74]:
# Create Pivot Tables and GroupBy Summaries
## Mean of 'survived' within each gender, i.e. the survival rate per gender.
titanic["survived"].groupby(titanic["gender"]).mean()

gender
female    0.688442
male      0.194030
Name: survived, dtype: float64

In [75]:
# Pivot table: survival rate by gender (rows) and passenger class (columns).
# margins=True appends the "All" row and column.
titanic.pivot_table(
    values='survived',
    index='gender',
    columns='passenger_class',
    aggfunc='mean',
    margins=True,
)

passenger_class,1,2,3,All
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.958333,0.918919,0.455446,0.688442
male,0.416667,0.163043,0.152,0.19403
All,0.571429,0.5,0.239316,0.357737


In [76]:
# Group by multiple columns
# Row count and mean of 'survived' for every (gender, class) pair.
group_keys = ['gender', 'passenger_class']
survived_by_group = titanic.groupby(group_keys)['survived']
survived_by_group.agg(['count', 'mean'])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean
gender,passenger_class,Unnamed: 2_level_1,Unnamed: 3_level_1
female,1,24,0.958333
female,2,74,0.918919
female,3,101,0.455446
male,1,60,0.416667
male,2,92,0.163043
male,3,250,0.152


In [None]:
# Plot Multiple Variables Using pairplot() and sns.heatmap()
selected_cols = ['survived', 'age', 'fare', 'passenger_class']

# Pairwise scatter / distribution grid, coloured by the 'survived' value.
pairplot_data = titanic[selected_cols]
sns.pairplot(pairplot_data, hue='survived', palette='Set1')
plt.suptitle("Pairplot - Age, Fare, Class vs Survival", y=1.02)  # y > 1 places the title above the grid
plt.show()

# Mean of 'survived' for every class (rows) x gender (columns) combination.
heatmap_data = pd.crosstab(
    index=titanic['passenger_class'],
    columns=titanic['gender'],
    values=titanic['survived'],
    aggfunc='mean',
)

# Render the crosstab as an annotated heatmap.
sns.heatmap(heatmap_data, annot=True, cmap='YlGnBu')
plt.title("Survival Rate Heatmap by Class and Gender")
plt.show()

3. Feature Engineering

In [78]:
# Create new features: split the boarding date into day / month / year columns.
boarding_parts = titanic['boarding_date'].dt
titanic['boarding_day'] = boarding_parts.day
titanic['boarding_month'] = boarding_parts.month
titanic['boarding_year'] = boarding_parts.year

# Creating Age Group (e.g., Child, Adult, Senior)
def categorize_age(age):
    """Map an age to an age-group label.

    Parameters
    ----------
    age : float or int
        The age to classify. May be missing (NaN / None).

    Returns
    -------
    str or None
        'Child' for ages below 18, 'Adult' for ages from 18 up to (but not
        including) 60, 'Senior' for 60 and above, and ``None`` when the age
        is missing.
    """
    # A missing age fails both "<" comparisons below and would otherwise fall
    # through to 'Senior', so report it as missing instead.
    if pd.isna(age):
        return None
    if age < 18:
        return 'Child'
    if age < 60:
        return 'Adult'
    return 'Senior'

# Label every passenger's age with its group.
titanic['age_group'] = titanic['age'].map(categorize_age)

# Family Size Feature: siblings/spouses + parents/children + the passenger.
relatives_aboard = titanic['siblings_spouses'].add(titanic['parents_children'])
titanic['family_size'] = relatives_aboard.add(1)


In [79]:
# Normalize fare and age using Min-Max Scaling

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Learn the min/max of 'age' and 'fare', then store the rescaled values in
# new columns so the original columns stay untouched.
columns_to_scale = titanic[['age', 'fare']]
scaled_values = scaler.fit(columns_to_scale).transform(columns_to_scale)
titanic[['age_scaled', 'fare_scaled']] = scaled_values

In [None]:
# Encode Categorical Variables
## One-hot encode gender, embarked and age_group. drop_first=True omits the
## dummy column for the first level of each encoded variable.
categorical_columns = ['gender', 'embarked', 'age_group']
titanic = pd.get_dummies(titanic, columns=categorical_columns, drop_first=True)

In [83]:
## Label Encoding 
from sklearn.preprocessing import LabelEncoder

# Label encode passenger_class into a new integer column.
# NOTE(review): scikit-learn describes LabelEncoder as meant for target
# values; confirm it is the intended tool for this feature column.
le = LabelEncoder()
le.fit(titanic['passenger_class'])
titanic['passenger_class_encoded'] = le.transform(titanic['passenger_class'])

In [87]:
# Show the original class next to its encoded value for the first five rows.
encoding_columns = ['passenger_class', 'passenger_class_encoded']
print(titanic[encoding_columns].head())

   passenger_class  passenger_class_encoded
0                3                        2
1                3                        2
2                1                        0
3                3                        2
4                1                        0
