In [2]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
# Read the three CSV files from the working directory, in the same order as
# the names on the left-hand side.
train_df, test_df, gender_submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [4]:
# Show the first rows of each loaded frame under its own heading.
head_previews = [
    ("Train Dataset Head:", train_df),
    ("\nTest Dataset Head:", test_df),
    ("\nGender Submission Dataset Head:", gender_submission_df),
]
for heading, frame in head_previews:
    print(heading)
    print(frame.head())


Train Dataset Head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500

In [5]:
# Per-column count of missing entries in the training frame.
print("\nMissing Values in Train Dataset:", train_df.isna().sum(), sep="\n")


Missing Values in Train Dataset:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [6]:
# Per-column count of missing entries in the test frame.
print("\nMissing Values in Test Dataset:", test_df.isna().sum(), sep="\n")


Missing Values in Test Dataset:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [7]:
# Learn the median Age from the training frame only, then use that single
# fitted imputer to fill the gaps in both frames.
imputer_age = SimpleImputer(strategy='median')
imputer_age.fit(train_df[['Age']])
train_df['Age'] = imputer_age.transform(train_df[['Age']])
test_df['Age'] = imputer_age.transform(test_df[['Age']])

In [9]:
# Fill missing Embarked values in the training frame with the most frequent port.
imputer_embarked = SimpleImputer(strategy='most_frequent')
embarked_filled = imputer_embarked.fit_transform(train_df[['Embarked']])
# The imputer returns a 2D array; keep its only column as the new Series values.
train_df['Embarked'] = embarked_filled[:, 0]

In [10]:
# Fill the missing test Fare with the median learned from the TRAINING fares.
# Fitting on the training frame keeps the fill value independent of the test
# data and mirrors how the Age imputer is fitted; the test frame is only
# transformed, never used for fitting.
imputer_fare = SimpleImputer(strategy='median')
imputer_fare.fit(train_df[['Fare']])
test_df['Fare'] = imputer_fare.transform(test_df[['Fare']])

In [11]:
# Mark unknown cabins with an explicit 'Missing' label in both frames.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, which pandas warns about and which leaves the frame
# unchanged when Copy-on-Write is active.
train_df['Cabin'] = train_df['Cabin'].fillna('Missing')
test_df['Cabin'] = test_df['Cabin'].fillna('Missing')

In [12]:
# Re-count missing entries per column in each frame after the imputation cells.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\nMissing Values in {split_name} Dataset After Imputation:")
    print(frame.isnull().sum())


Missing Values in Train Dataset After Imputation:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

Missing Values in Test Dataset After Imputation:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [13]:
# Pull the title out of each name: the run of letters that follows a space and
# ends at a period (e.g. "Mr" in "Braund, Mr. Owen Harris").
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being an invalid string escape (which newer Python versions warn about).
title_pattern = r' ([A-Za-z]+)\.'
train_df['Title'] = train_df['Name'].str.extract(title_pattern, expand=False)
test_df['Title'] = test_df['Name'].str.extract(title_pattern, expand=False)

In [14]:
# Collapse infrequent titles into a single 'Rare' bucket.
# A title is "rare" when it occurs fewer than 10 times in the training frame;
# `rare_titles` keeps its original meaning (boolean Series indexed by title).
rare_titles = train_df['Title'].value_counts() < 10

# Build the set of titles to keep once, from the training counts only.
# Membership in this set replaces the per-row `rare_titles[x]` lookup, which
# raised KeyError for names without a matched title (NaN), and replaces the
# per-row recomputation of `train_df['Title'].unique()` for the test frame.
common_titles = set(rare_titles[~rare_titles].index)

# Anything outside the kept set (rare in train, unseen in train, or missing)
# becomes 'Rare' in both frames.
train_df['Title'] = train_df['Title'].apply(lambda title: title if title in common_titles else 'Rare')
test_df['Title'] = test_df['Title'].apply(lambda title: title if title in common_titles else 'Rare')

In [15]:
# FamilySize = siblings/spouses + parents/children + the passenger themself.
for frame in (train_df, test_df):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

In [16]:
# IsAlone is 1 when the passenger's FamilySize is exactly 1, otherwise 0.
for frame in (train_df, test_df):
    travels_alone = frame['FamilySize'] == 1
    frame['IsAlone'] = np.where(travels_alone, 1, 0)

In [17]:
# One-hot encode the categorical columns of both frames.
categorical_features = ['Sex', 'Embarked', 'Cabin', 'Title']

# Pin each column's category levels to those seen in the training frame before
# encoding. With drop_first=True applied to each frame independently, the
# dropped baseline level is whichever level sorts first in THAT frame, so the
# two frames could drop different levels and the same dummy column would then
# mean different things. Sharing the training levels makes both frames produce
# the same dummy columns and drop the same baseline. Test values that do not
# occur in training become missing and encode as all-False for that feature.
for feature in categorical_features:
    train_levels = pd.Categorical(train_df[feature]).categories
    train_df[feature] = pd.Categorical(train_df[feature], categories=train_levels)
    test_df[feature] = pd.Categorical(test_df[feature], categories=train_levels)

train_df = pd.get_dummies(train_df, columns=categorical_features, drop_first=True)
test_df = pd.get_dummies(test_df, columns=categorical_features, drop_first=True)

In [18]:
# Give the test frame exactly the training frame's columns (left join on the
# column axis): columns that exist only in test are dropped, and columns that
# exist only in train are added to test filled with 0.
# NOTE(review): 'Survived' exists only in the training frame, so after this
# line the test frame carries a 'Survived' column of zeros — confirm that
# downstream cells exclude it from the test features.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

In [19]:
# Standardize Age and Fare using statistics learned from the training frame
# only; the same fitted scaler is then applied to both frames.
numeric_columns = ['Age', 'Fare']
scaler = StandardScaler()
scaler.fit(train_df[numeric_columns])
train_df[numeric_columns] = scaler.transform(train_df[numeric_columns])
test_df[numeric_columns] = scaler.transform(test_df[numeric_columns])

In [20]:
# Show the first rows of each frame after feature engineering and scaling.
for split_name, frame in (("Train", train_df), ("Test", test_df)):
    print(f"\nProcessed {split_name} Dataset Head:")
    print(frame.head())


Processed Train Dataset Head:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name       Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris -0.565736      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  0.663861      1      0   
2                             Heikkinen, Miss. Laina -0.258337      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  0.433312      1      0   
4                           Allen, Mr. William Henry  0.433312      0      0   

             Ticket      Fare  FamilySize  ...  Cabin_F33  Cabin_F38  \
0         A/5 21171 -0.502445           2  ...      False      False   
1          PC 17599  0.786845           2  ...      False      False   
2  STON/O2. 3101282 -0.488854           1  ...      Fal