<a href="https://colab.research.google.com/github/kaylaniBatin/DI-Bootcamp/blob/main/Week3Day3XPexercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Exercise 1: Duplicate Detection and Removal

In [1]:
from google.colab import files
uploaded = files.upload()


Saving titanic dataset.zip to titanic dataset.zip


In [2]:
import zipfile
import os

# Extract the zip file
with zipfile.ZipFile("titanic dataset.zip", "r") as zip_ref:
    zip_ref.extractall("titanic_data")

# List the extracted files
os.listdir("titanic_data")


['titanic dataset']

In [3]:
import os

# List files inside the extracted folder
os.listdir("titanic_data/titanic dataset")


['gender_submission.csv', 'test.csv', 'train.csv']

In [4]:
import pandas as pd

# Load the training dataset
df = pd.read_csv("titanic_data/titanic dataset/train.csv")

# Show the first few rows
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
print("Original shape:", df.shape)

duplicate_rows = df.duplicated()
print("Number of duplicate rows:", duplicate_rows.sum())

df = df.drop_duplicates()

print("New shape after removing duplicates:", df.shape)


Original shape: (891, 12)
Number of duplicate rows: 0
New shape after removing duplicates: (891, 12)


 Exercise 2: Handling Missing Values

In [6]:
df.isnull().sum()


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [7]:
df = df.dropna(subset=['Embarked'])


In [8]:
df['Age'] = df['Age'].fillna(df['Age'].median())


In [9]:
df.isnull().sum()


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


In [11]:

df['Cabin'] = df['Cabin'].fillna('Unknown')


In [12]:
df.isnull().sum()


Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,0
SibSp,0
Parch,0
Ticket,0
Fare,0


 Exercise 3: Feature Engineering

In [13]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1


In [14]:
df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


In [15]:
df = pd.get_dummies(df, columns=['Title', 'Embarked'], prefix=['Title', 'Embarked'])


In [16]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Sex_encoded'] = le.fit_transform(df['Sex'])


In [17]:
df[['Sex', 'Sex_encoded', 'FamilySize']].head()


Unnamed: 0,Sex,Sex_encoded,FamilySize
0,male,1,2
1,female,0,2
2,female,0,1
3,female,0,2
4,male,1,1


Exercise 4: Outlier Detection and Handling

In [18]:
def detect_outliers_iqr(column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return lower_bound, upper_bound

# For Fare
fare_low, fare_high = detect_outliers_iqr('Fare')
print(f"Fare bounds: {fare_low:.2f} to {fare_high:.2f}")

# For Age
age_low, age_high = detect_outliers_iqr('Age')
print(f"Age bounds: {age_low:.2f} to {age_high:.2f}")


Fare bounds: -26.76 to 65.66
Age bounds: 2.50 to 54.50


In [19]:
df['Fare'] = df['Fare'].clip(lower=fare_low, upper=fare_high)
df['Age'] = df['Age'].clip(lower=age_low, upper=age_high)


In [20]:
print("Max Fare after capping:", df['Fare'].max())
print("Max Age after capping:", df['Age'].max())


Max Fare after capping: 65.6563
Max Age after capping: 54.5


 Exercise 5: Data Standardization and Normalization

In [21]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Create scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Columns to transform
cols_to_scale = ['Age', 'Fare', 'FamilySize']

# Standardization
df_std = df.copy()
df_std[cols_to_scale] = standard_scaler.fit_transform(df_std[cols_to_scale])

# Normalization
df_norm = df.copy()
df_norm[cols_to_scale] = minmax_scaler.fit_transform(df_norm[cols_to_scale])


In [22]:
print("Standardized values:")
print(df_std[cols_to_scale].head())

print("\nNormalized values:")
print(df_norm[cols_to_scale].head())


Standardized values:
        Age      Fare  FamilySize
0 -0.581209 -0.818784    0.057853
1  0.747162  2.043778    0.057853
2 -0.249116 -0.785701   -0.561804
3  0.498092  1.428379    0.057853
4  0.498092 -0.779575   -0.561804

Normalized values:
        Age      Fare  FamilySize
0  0.375000  0.110424         0.1
1  0.682692  1.000000         0.1
2  0.451923  0.120704         0.0
3  0.625000  0.808757         0.1
4  0.625000  0.122608         0.0


 Exercise 6: Feature Encoding

In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Sex_encoded'] = le.fit_transform(df['Sex'])


In [25]:
print(df.columns)


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'FamilySize', 'Title_Capt',
       'Title_Col', 'Title_Countess', 'Title_Don', 'Title_Dr',
       'Title_Jonkheer', 'Title_Lady', 'Title_Major', 'Title_Master',
       'Title_Miss', 'Title_Mlle', 'Title_Mme', 'Title_Mr', 'Title_Mrs',
       'Title_Ms', 'Title_Rev', 'Title_Sir', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Sex_encoded'],
      dtype='object')


In [26]:
df = df.drop(columns=['Sex'])


In [27]:
print(df.head())
print(df.columns)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   
2                             Heikkinen, Miss. Laina  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   
4                           Allen, Mr. William Henry  35.0      0      0   

             Ticket     Fare    Cabin  ...  Title_Mme  Title_Mr  Title_Mrs  \
0         A/5 21171   7.2500  Unknown  ...      False      True      False   
1          PC 17599  65.6563      C85  ...      False     False       True   
2  STON/O2. 3101282   7.9250  Unknown  ...      False     False      False   
3            1

Exercise 7: Data Transformation for Age Feature

In [28]:
bins = [0, 12, 18, 35, 60, 80]
labels = ['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior']

df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

df = pd.get_dummies(df, columns=['AgeGroup'])


In [29]:
df.filter(like='AgeGroup').head()


Unnamed: 0,AgeGroup_Child,AgeGroup_Teenager,AgeGroup_Adult,AgeGroup_Middle-aged,AgeGroup_Senior
0,False,False,True,False,False
1,False,False,False,True,False
2,False,False,True,False,False
3,False,False,False,True,False
4,False,False,False,True,False


In [30]:
# Define bins and labels
bins = [0, 12, 18, 35, 60, 80]
labels = ['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior']

# Create age group column
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)

# One-hot encode age groups
df = pd.get_dummies(df, columns=['AgeGroup'])

# Preview the one-hot encoded age groups
print(df.filter(like='AgeGroup').head())


   AgeGroup_Child  AgeGroup_Teenager  AgeGroup_Adult  AgeGroup_Middle-aged  \
0           False              False            True                 False   
1           False              False           False                  True   
2           False              False            True                 False   
3           False              False           False                  True   
4           False              False           False                  True   

   AgeGroup_Senior  AgeGroup_Child  AgeGroup_Teenager  AgeGroup_Adult  \
0            False           False              False            True   
1            False           False              False           False   
2            False           False              False            True   
3            False           False              False           False   
4            False           False              False           False   

   AgeGroup_Middle-aged  AgeGroup_Senior  
0                 False            False  
1     