In [1]:
import pandas as pd

# Load the Titanic dataset
df = pd.read_csv('titanic.csv')


In [2]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 12)

In [5]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Age'].fillna(df['Age'].median(), inplace=True)

In [8]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

In [9]:
df.drop(columns=['Cabin'], inplace=True)

In [10]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [11]:
# Drop irrelevant columns
df.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)



In [12]:
# Create new feature 'FamilySize' from 'SibSp' and 'Parch'
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

In [13]:
# Drop the original 'SibSp' and 'Parch' columns
df.drop(columns=['SibSp', 'Parch'], inplace=True)

In [14]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,22.0,7.25,S,2
1,1,1,female,38.0,71.2833,C,2
2,1,3,female,26.0,7.925,S,1
3,1,1,female,35.0,53.1,S,2
4,0,3,male,35.0,8.05,S,1


In [15]:
from sklearn.preprocessing import StandardScaler

# Normalize 'Age' and 'Fare'
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])


In [16]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,0,3,male,-0.565736,-0.502445,S,2
1,1,1,female,0.663861,0.786845,C,2
2,1,3,female,-0.258337,-0.488854,S,1
3,1,1,female,0.433312,0.42073,S,2
4,0,3,male,0.433312,-0.486337,S,1


In [17]:
# One-hot encode 'Sex' and 'Embarked' columns
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)


In [18]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S
0,0,3,-0.565736,-0.502445,2,True,False,True
1,1,1,0.663861,0.786845,2,False,False,False
2,1,3,-0.258337,-0.488854,1,False,False,True
3,1,1,0.433312,0.42073,2,False,False,True
4,0,3,0.433312,-0.486337,1,True,False,True


In [19]:
df.shape

(891, 8)

In [24]:
# Define age bins and labels
age_bins = [0, 12, 20, 40, 60, 80]
age_labels = ['Child', 'Teenager', 'Adult', 'Middle-aged', 'Senior']

# Create age groups
df['AgeGroup'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels)

# One-hot encode the 'AgeGroup' feature
df = pd.get_dummies(df, columns=['AgeGroup'], drop_first=True)




In [25]:
df.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Sex_male,Embarked_Q,Embarked_S,IsAlone,AgeGroup_Teenager,AgeGroup_Adult,AgeGroup_Middle-aged,AgeGroup_Senior
0,0,3,-0.565736,-0.502445,True,False,True,1,False,False,False,False
1,1,1,0.663861,0.786845,False,False,False,1,False,False,False,False
2,1,3,-0.258337,-0.488854,False,False,True,1,False,False,False,False
3,1,1,0.433312,0.42073,False,False,True,1,False,False,False,False
4,0,3,0.433312,-0.486337,True,False,True,1,False,False,False,False
