In [18]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from scipy import stats

In [19]:
# Load Titanic dataset from dsLAB file
df = pd.read_csv("titanic.csv")


In [20]:

# Step 1: Handle missing values
print("Missing values before handling:\n", df.isnull().sum())
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.drop(columns=['Cabin'], inplace=True)
print("Missing values after handling:\n", df.isnull().sum())


Missing values before handling:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Missing values after handling:
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [21]:
# Step 2: Handle categorical values
print("Columns before encoding:", df.columns)
categorical_cols = ['Sex', 'Embarked']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)  # One-hot encoding
print("Columns after encoding:", df.columns)

Columns before encoding: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked'],
      dtype='object')
Columns after encoding: Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S'],
      dtype='object')


In [22]:
# Step 3: Scale the features
numerical_cols = ['Age', 'Fare', 'SibSp', 'Parch']
print("Before Scaling:\n", df[numerical_cols].head())
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
print("After Scaling:\n", df[numerical_cols].head())

Before Scaling:
     Age     Fare  SibSp  Parch
0  22.0   7.2500      1      0
1  38.0  71.2833      1      0
2  26.0   7.9250      0      0
3  35.0  53.1000      1      0
4  35.0   8.0500      0      0
After Scaling:
         Age      Fare     SibSp     Parch
0 -0.565736 -0.502445  0.432793 -0.473674
1  0.663861  0.786845  0.432793 -0.473674
2 -0.258337 -0.488854 -0.474545 -0.473674
3  0.433312  0.420730  0.432793 -0.473674
4  0.433312 -0.486337 -0.474545 -0.473674


In [23]:
# Step 4: Remove outliers using Z-score
print("Dataset size before removing outliers:", df.shape)
z_scores = np.abs(stats.zscore(df[numerical_cols]))
df = df[(z_scores < 3).all(axis=1)]
print("Dataset size after removing outliers:", df.shape)


Dataset size before removing outliers: (891, 12)
Dataset size after removing outliers: (820, 12)


In [24]:
# Step 5: Feature selection using ExtraTreesClassifier
X = df.drop(columns=['Survived', 'Name', 'Ticket', 'PassengerId'])
y = df['Survived']

model = ExtraTreesClassifier()
model.fit(X, y)

# Select important features
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
selected_features = feature_importances.nlargest(5).index.tolist()
print("Selected Features:", selected_features)
X_selected = X[selected_features]

Selected Features: ['Sex_male', 'Fare', 'Age', 'Pclass', 'Parch']


In [25]:
# Step 6: PCA (Principal Component Analysis)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_selected)

# Convert PCA result into DataFrame
df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
print("PCA Result:\n", df_pca.head())

PCA Result:
         PC1       PC2
0 -0.940655 -0.407143
1  1.622049  0.559314
2 -0.695756 -0.345756
3  1.336483  0.527158
4 -0.230642 -0.920442
