In [1]:
#Exercise 1: Duplicate Detection and Removal

In [2]:
import pandas as pd

In [3]:
titanic_df = pd.read_csv('titanic.csv')

In [4]:
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Store the original number of rows
original_row_count = titanic_df.shape[0]

In [10]:
# Identify duplicate rows
duplicate_rows = titanic_df[titanic_df.duplicated()]

In [11]:
# Remove duplicate rows
titanic_df_cleaned = titanic_df.drop_duplicates()

In [12]:
# Store the new number of rows after removal
new_row_count = titanic_df_cleaned.shape[0]


In [14]:
# Prepare a summary of the operation
summary = {
    "Original Row Count": original_row_count,
    "Duplicate Rows Found": duplicate_rows.shape[0],
    "New Row Count After Removal": new_row_count
}

duplicate_rows.head(), summary

(Empty DataFrame
 Columns: [PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked]
 Index: [],
 {'Original Row Count': 891,
  'Duplicate Rows Found': 0,
  'New Row Count After Removal': 891})

In [17]:
#Exercise 2: Handling Missing Values
#Identify columns in the Titanic dataset with missing values.

from sklearn.impute import SimpleImputer

In [19]:
# 1. Identify columns with missing values
missing_summary = titanic_df.isnull().sum()
columns_with_missing = missing_summary[missing_summary > 0]

In [None]:
#Remove rows with missing values (example: Cabin, which has many NaNs)
df_dropped = titanic_df.dropna(subset=['Cabin'])

In [26]:
df_filled_constant = titanic_df.copy()
df_filled_constant['Embarked'].fillna('S', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled_constant['Embarked'].fillna('S', inplace=True)


In [29]:
imputer = SimpleImputer(strategy='median')
df_imputed = titanic_df.copy()
df_imputed['Age'] = imputer.fit_transform(titanic_df[['Age']])

In [30]:
# Summary of missing values before and examples of handling them
{
    "Missing Values Summary": columns_with_missing,
    "After Dropping 'Cabin' NaNs": df_dropped.shape[0],
    "After Filling 'Embarked'": df_filled_constant['Embarked'].isnull().sum(),
    "After Imputing 'Age'": df_imputed['Age'].isnull().sum()
}

{'Missing Values Summary': Age         177
 Cabin       687
 Embarked      2
 dtype: int64,
 "After Dropping 'Cabin' NaNs": 204,
 "After Filling 'Embarked'": np.int64(0),
 "After Imputing 'Age'": np.int64(0)}

In [None]:
#Exercise 3: Feature Engineering

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [34]:
# Make a copy to work on
titanic_df_features = titanic_df.copy()

In [37]:
# 1. Create 'FamilySize' = SibSp + Parch + 1 (including self)
titanic_df_features['FamilySize'] = titanic_df_features['SibSp'] + titanic_df_features['Parch'] + 1

In [38]:
# 2. Extract 'Title' from 'Name'
titanic_df_features['Title'] = titanic_df_features['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

In [39]:
# Simplify rare titles
rare_titles = titanic_df_features['Title'].value_counts()[titanic_df_features['Title'].value_counts() < 10].index
titanic_df_features['Title'] = titanic_df_features['Title'].replace(rare_titles, 'Rare')


In [40]:
# 3. Encode categorical features
# Label encode 'Sex' and 'Embarked' (simple)
label_encoder = LabelEncoder()
titanic_df_features['Sex'] = label_encoder.fit_transform(titanic_df_features['Sex'])
titanic_df_features['Embarked'] = titanic_df_features['Embarked'].fillna('S')  # Fill missing if any
titanic_df_features['Embarked'] = label_encoder.fit_transform(titanic_df_features['Embarked'])

In [41]:
# One-hot encode 'Title'
titanic_df_features = pd.get_dummies(titanic_df_features, columns=['Title'])

In [42]:
# 4. Normalize numerical features (example: Age, Fare, FamilySize)
scaler = StandardScaler()
features_to_scale = ['Age', 'Fare', 'FamilySize']
titanic_df_features[features_to_scale] = scaler.fit_transform(titanic_df_features[features_to_scale])

In [43]:
# Show final columns added and transformed
titanic_df_features[['Sex', 'Embarked', 'FamilySize'] + list(titanic_df_features.columns[titanic_df_features.columns.str.startswith("Title_")])].head()

Unnamed: 0,Sex,Embarked,FamilySize,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Rare
0,1,2,0.05916,False,False,True,False,False
1,0,0,0.05916,False,False,False,True,False
2,0,2,-0.560975,False,True,False,False,False
3,0,2,0.05916,False,False,False,True,False
4,1,2,-0.560975,False,False,True,False,False


In [45]:
#Exercise 4: Outlier Detection and Handling

#Use statistical methods to detect outliers in columns like Fare and Age.
import pandas as pd
import numpy as np


In [46]:
# 🔍 Fonction pour détecter les outliers avec la méthode IQR
def detect_iqr_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (column < lower_bound) | (column > upper_bound)


In [48]:
# Identifier les outliers dans 'Fare' et 'Age'
outliers_fare = detect_iqr_outliers(titanic_df['Fare'])
outliers_age = detect_iqr_outliers(titanic_df['Age'])

print("🎯 Outliers initiaux :")
print(f" - Fare : {outliers_fare.sum()} outliers")
print(f" - Age  : {outliers_age.sum()} outliers")


🎯 Outliers initiaux :
 - Fare : 116 outliers
 - Age  : 11 outliers


In [49]:
# ✂️ Fonction pour corriger les outliers (capping)
def cap_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return column.clip(lower=lower_bound, upper=upper_bound)

In [51]:
# Créer de nouvelles colonnes avec les valeurs corrigées
titanic_df['Fare_capped'] = cap_outliers(titanic_df['Fare'])
titanic_df['Age_capped'] = cap_outliers(titanic_df['Age'])

In [53]:
# Vérifier les nouveaux outliers
new_outliers_fare = detect_iqr_outliers(titanic_df['Fare_capped']).sum()
new_outliers_age = detect_iqr_outliers(titanic_df['Age_capped']).sum()


In [54]:
print("\n✅ Après traitement (capping) :")


✅ Après traitement (capping) :


In [55]:
print(f" - Fare : {new_outliers_fare} outliers restants")

 - Fare : 0 outliers restants


In [56]:
print(f" - Age  : {new_outliers_age} outliers restants")


 - Age  : 0 outliers restants
