In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the training and test CSV files into DataFrames (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("D:/Downloads/train.csv", "D:/Downloads/test.csv")
)

# Mark every missing Age with the 'not provided' placeholder string.
# Note: mixing a string into the column makes it non-numeric until it is
# converted back later in the script.
train_df = train_df.assign(Age=train_df['Age'].fillna('not provided'))
test_df = test_df.assign(Age=test_df['Age'].fillna('not provided'))

# Function to generate unique alphanumeric codes for missing cabin values
def fill_missing_cabins(df, prefix='C'):
    """Replace missing ``Cabin`` values with unique generated placeholder codes.

    Codes have the form ``f'{prefix}{n}'`` with ``n`` counting up from 1.
    Any candidate that already appears as a real value in the ``Cabin``
    column is skipped, so a generated placeholder never duplicates an
    existing cabin.

    Args:
        df: DataFrame with a ``Cabin`` column. It is modified in place.
        prefix: Text placed before the running number (default ``'C'``).

    Returns:
        The same ``df`` object, with no missing values left in ``Cabin``.
    """
    missing_mask = df['Cabin'].isnull()
    n_missing = int(missing_mask.sum())
    if n_missing == 0:
        # Nothing to fill; avoid assigning an empty list through .loc.
        return df

    # Values already present, so generated codes cannot collide with them.
    taken = set(df.loc[~missing_mask, 'Cabin'].astype(str))

    placeholders = []
    counter = 1
    while len(placeholders) < n_missing:
        candidate = f'{prefix}{counter}'
        if candidate not in taken:
            placeholders.append(candidate)
        counter += 1

    # A column that is entirely missing is stored as float; make it able to
    # hold strings before writing the codes.
    if df['Cabin'].dtype.kind == 'f':
        df['Cabin'] = df['Cabin'].astype(object)

    # A boolean mask (rather than index labels) assigns positionally, which
    # stays correct even when the index contains duplicate labels.
    df.loc[missing_mask, 'Cabin'] = placeholders
    return df

# Replace missing cabins with generated placeholder codes in both datasets
train_df = fill_missing_cabins(train_df)
test_df = fill_missing_cabins(test_df)

# Fill missing Embarked values with the mode.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is deprecated chained assignment and does not update the
# DataFrame when pandas copy-on-write is active.
train_df['Embarked'] = train_df['Embarked'].fillna(train_df['Embarked'].mode()[0])

# Fill missing Fare value in the test set with the median fare
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Derive the engineered columns with one shared code path so that each
# dataset is always computed from its own rows (previously the test set's
# FamilySize was computed from the training set's SibSp/Parch by mistake).
for frame in (train_df, test_df):
    # Title: the text between the comma and the first period in the name,
    # e.g. "Braund, Mr. Owen Harris" -> "Mr"
    frame['Title'] = frame['Name'].apply(
        lambda name: name.split(',')[1].split('.')[0].strip()
    )

    # Family size: siblings/spouses + parents/children + the passenger
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1

    # Convert Age to numeric; anything non-numeric (including a
    # 'not provided' placeholder) becomes NaN. The flag is taken from the
    # converted values, so it does not depend on the exact placeholder text.
    numeric_age = pd.to_numeric(frame['Age'], errors='coerce')
    frame['Age_is_provided'] = numeric_age.notna()
    frame['Age'] = numeric_age

# Scaling numerical features (keeping the originals unscaled for readability)
scaler = StandardScaler()
numerical_features = ['Age', 'Fare', 'FamilySize']

train_df_scaled = train_df.copy()
test_df_scaled = test_df.copy()

# Fit on the training data only, then apply the same transform to the test
# data. NOTE(review): Age still contains NaN here; per the scikit-learn docs
# StandardScaler disregards NaN in fit and keeps it in transform — confirm
# that downstream steps can handle the remaining NaN values.
train_df_scaled[numerical_features] = scaler.fit_transform(train_df_scaled[numerical_features])
test_df_scaled[numerical_features] = scaler.transform(test_df_scaled[numerical_features])

# Display the first few rows of the modified training dataset for readability
print(train_df.head())

# Display the first few rows of the modified test dataset for readability
print(test_df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked Title  FamilySize  \
0      0         A/5 21171   7.2500    C1        S    Mr           2   
1      0          PC 17599  71.2833   C85        C   Mrs           2   
2      0  STON/O2. 3101282   7.9250    C2        S  Miss           1   
3      0            113803  53.1