## Pipeline for Advanced Imputation and Feature Engineering 

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Directory holding the Titanic train/test CSVs. The default is the original
# author's local checkout; set the TITANIC_DATA_DIR environment variable to run
# the notebook against a different location without editing this cell.
DATA_DIR = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/thananpornsethjinda/Desktop/internship/projects/titanic-survival-prediction/data',
))

# Raw frames are kept untouched; cleaned copies are derived from them later.
train_val_df_raw = pd.read_csv(DATA_DIR / 'train.csv')
test_df_raw = pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
def clean_dataframe(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` and optionally print a data-quality report.

    The input frame is never modified. The copy goes through these steps:

    1. Column names are stripped, lowercased and spaces become underscores.
    2. Exact duplicate rows are dropped.
    3. Object columns are cast to ``str``, stripped and lowercased.
    4. Placeholder strings (``'n/a'``, ``'nan'``, ``''`` ...) become ``NaN``.
    5. Constant columns, high-cardinality object columns (> 100 distinct
       values) and IQR outliers in numeric columns are reported only.
    6. Object columns whose distinct-value count is below 5% of the row count
       are converted to ``category`` dtype.

    Args:
        df: Frame to clean.
        verbose: When true, each step prints an ``[INFO]`` message.

    Returns:
        The cleaned copy.
    """
    df = df.copy()

    def log(msg):
        # Messages are suppressed entirely when verbose mode is off.
        if not verbose:
            return
        print(f"[INFO] {msg}")

    # 1. Normalise the header.
    stripped_names = df.columns.str.strip()
    df.columns = stripped_names.str.lower().str.replace(' ', '_')
    log("Standardized column names.")

    # 2. Drop exact duplicate rows (checked before any string normalisation).
    n_duplicates = df.duplicated().sum()
    if n_duplicates > 0:
        df.drop_duplicates(inplace=True)
        log(f"Removed {n_duplicates} duplicate rows.")

    # 3. Normalise text. Casting to str turns missing values into the literal
    #    'nan', which step 4 maps back to NaN.
    text_columns = list(df.select_dtypes(include='object'))
    for name in text_columns:
        as_text = df[name].astype(str)
        df[name] = as_text.str.strip().str.lower()
    log("Standardized string columns (lowercase + trimmed).")

    # 4. Treat common placeholder strings as missing and report the null counts.
    placeholder_values = ['n/a', 'na', '--', '-', 'none', 'null', '', 'nan']
    df.replace(placeholder_values, np.nan, inplace=True)
    missing_per_column = df.isnull().sum().loc[lambda counts: counts > 0]
    if not missing_per_column.empty:
        log(f"Missing values found in columns:\n{missing_per_column}")

    # 5. Columns holding a single distinct (non-null) value carry no signal.
    constant_cols = []
    for name in df.columns:
        if df[name].nunique() == 1:
            constant_cols.append(name)
    if constant_cols:
        log(f"Constant columns (consider removing): {constant_cols}")

    # 6. Object columns with more than 100 distinct values.
    high_card_cols = []
    for name in df.select_dtypes(include='object'):
        if df[name].nunique() > 100:
            high_card_cols.append(name)
    if high_card_cols:
        log(f"High-cardinality columns (consider encoding strategies): {high_card_cols}")

    # 7. Count values outside the 1.5 * IQR fences for every numeric column.
    outlier_report = {}
    for name in df.select_dtypes(include=np.number).columns:
        values = df[name]
        q1, q3 = values.quantile([0.25, 0.75])
        spread = q3 - q1
        low_fence = q1 - 1.5 * spread
        high_fence = q3 + 1.5 * spread
        beyond_fences = (values < low_fence) | (values > high_fence)
        n_outliers = values[beyond_fences].count()
        if n_outliers > 0:
            outlier_report[name] = n_outliers
    if outlier_report:
        log(f"Potential numeric outliers detected:\n{outlier_report}")

    # 8. Low-cardinality object columns become categoricals.
    category_threshold = len(df) * 0.05
    for name in list(df.select_dtypes(include='object')):
        if df[name].nunique() < category_threshold:
            df[name] = df[name].astype('category')
    log("Converted suitable object columns to category dtype.")

    log("Data cleaning complete.")
    return df


In [5]:

# Clean the labelled data and the test data independently; each call prints
# its own [INFO] report because verbose defaults to True.
train_val_df = clean_dataframe(train_val_df_raw)

# Visual separator between the two cleaning reports.
print("-" * 100)

test_df = clean_dataframe(test_df_raw)

[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age         177
cabin       687
embarked      2
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket', 'cabin']
[INFO] Potential numeric outliers detected:
{'age': np.int64(11), 'sibsp': np.int64(46), 'parch': np.int64(213), 'fare': np.int64(116)}
[INFO] Converted suitable object columns to category dtype.
[INFO] Data cleaning complete.
----------------------------------------------------------------------------------------------------
[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age       86
fare       1
cabin    327
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket']
[INFO] Potential numeric outliers detected:
{'age': np.int64(2), 'sibsp': np.int64(11), 'parch': np.int64(94), 'fare': n

In [6]:
# Stack the labelled and test passengers. preprocessing_feature_eng reads
# combined['ticket'] to count how many passengers share each ticket.
# ignore_index is not set, so both frames keep their original row labels.
combined = pd.concat([train_val_df, test_df])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split identical between runs.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

In [9]:
# Raw input columns carried into feature engineering, listed once so the
# three splits cannot drift apart.
FEATURE_COLS = ['pclass', 'sex', 'age', 'embarked', 'sibsp', 'parch', 'name', 'ticket', 'fare']

X_train = train_df[FEATURE_COLS]
y_train = train_df['survived']

X_val = val_df[FEATURE_COLS]
y_val = val_df['survived']

# The test frame is only sliced for features; no target is read from it.
X_test = test_df[FEATURE_COLS]

### Manual Age Imputation

In [10]:
# Maps a lowercase honorific extracted from the passenger name to a coarser
# title group. "dona" is included so that every honorific maps to a group
# instead of leaving a NaN title that must be patched by hand afterwards.
TitleDict = {
    "capt": "officer", "col": "officer", "major": "officer",
    "jonkheer": "royalty", "don": "royalty", "dona": "royalty",
    "sir": "royalty", "dr": "royalty", "rev": "royalty",
    "countess": "royalty", "lady": "royalty",
    "mme": "mrs", "ms": "mrs", "mrs": "mrs",
    "mlle": "miss", "miss": "miss",
    "mr": "mr", "master": "master",
}


def preprocessing_feature_eng(df, ticket_counts=None):
    """Return a copy of ``df`` with title and shared-ticket features added.

    Added columns, in this order:

    * ``title`` - honorific taken from ``name`` (the word before the first
      ``.`` that follows a space), lowercased and mapped through ``TitleDict``.
      Honorifics missing from the map become NaN. A ``miss`` travelling with
      parents/children (``parch != 0``) on a ticket shared by more than one
      passenger is relabelled ``female_child``.
    * ``ticket_people`` - number of passengers holding the same ticket.
    * ``ticket_per_person`` - ``fare`` divided by ``ticket_people``.

    Args:
        df: Frame with at least ``name``, ``ticket``, ``fare`` and ``parch``.
        ticket_counts: Optional Series mapping ticket -> passenger count.
            Defaults to the counts over the module-level ``combined`` frame,
            which is the original behaviour.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()

    if ticket_counts is None:
        ticket_counts = combined['ticket'].value_counts()

    # Lowercasing makes the lookup work even if names were not lowercased
    # upstream; it is a no-op for already-cleaned data.
    extracted = df['name'].str.extract(r' ([A-Za-z]+)\.', expand=False)
    df['title'] = extracted.str.lower()

    df['ticket_people'] = df['ticket'].map(ticket_counts)
    df['ticket_per_person'] = df['fare'] / df['ticket_people']

    df['title'] = df['title'].map(TitleDict)

    is_female_child = (
        (df['title'] == 'miss')
        & (df['parch'] != 0)
        & (df['ticket_people'] > 1)
    )
    df.loc[is_female_child, 'title'] = "female_child"

    return df

In [17]:
# Apply the same feature engineering to every split. The function works on a
# copy, so X_train / X_val / X_test stay unchanged.
X_train_eg = preprocessing_feature_eng(X_train)

X_val_eg = preprocessing_feature_eng(X_val)

X_test_eg = preprocessing_feature_eng(X_test)

In [29]:
# Any test passenger whose honorific is not in TitleDict ends up with a NaN
# title. Select those rows by mask instead of a hard-coded row label so the
# patch survives re-indexing and is a no-op when nothing is missing.
# NOTE(review): the original patched row 414 only (presumably the "dona"
# honorific) - confirm 'royalty' is right for any other unmapped title.
unmapped_title = X_test_eg['title'].isnull()
X_test_eg.loc[unmapped_title, 'title'] = 'royalty'

# Last expression of the cell, so the remaining null counts are displayed.
X_test_eg.isnull().sum()

In [21]:
# Mean age of every (pclass, sex, title) group seen in the training split.
# observed=True keeps only combinations that actually occur when a key is
# categorical. fill_age looks missing ages up in this table.
age_by_group = X_train_eg.groupby(['pclass', 'sex', 'title'], observed=True)['age'].mean()
imp_table = age_by_group.reset_index()[['pclass', 'sex', 'title', 'age']]

In [22]:
def fill_age(x, table=None):
    """Look up an imputed age for passenger row ``x``.

    The lookup tries the exact ``(pclass, sex, title)`` group first. If that
    group is absent from the table (or its mean age is NaN), it falls back to
    ``(sex, title)``, then ``title`` alone, then the mean of all group ages.
    The original indexed ``.values[0]`` directly and raised ``IndexError`` for
    any combination not present in the training split.

    Args:
        x: Row with ``pclass``, ``sex`` and ``title`` entries.
        table: Frame with ``pclass``, ``sex``, ``title`` and ``age`` columns.
            Defaults to the module-level ``imp_table``.

    Returns:
        The imputed age, or NaN when the table holds no usable age at all.
    """
    if table is None:
        table = imp_table

    # Progressively relax the grouping until some group with a known age matches.
    for keys in (('pclass', 'sex', 'title'), ('sex', 'title'), ('title',), ()):
        candidates = table
        for key in keys:
            candidates = candidates[candidates[key] == x[key]]
        known_ages = candidates['age'].dropna()
        if not known_ages.empty:
            # An exact match is a single row, so its mean is that row's value.
            return known_ages.mean()

    return np.nan


In [23]:
def _age_or_imputed(row):
    """Keep a known age; otherwise look one up with fill_age."""
    if np.isnan(row['age']):
        return fill_age(row)
    return row['age']


# The three engineered frames are updated in place, one row at a time.
for df in (X_train_eg, X_val_eg, X_test_eg):
    df['age'] = df.apply(_age_or_imputed, axis=1)

In [None]:
# Remaining missing values per column in the test split after age imputation.
X_test_eg.isnull().sum()

pclass               0
sex                  0
age                  0
embarked             0
sibsp                0
parch                0
name                 0
ticket               0
fare                 1
title                0
ticket_people        0
ticket_per_person    1
dtype: int64

In [31]:
# Remaining missing values per column in the validation split.
X_val_eg.isnull().sum()

pclass               0
sex                  0
age                  0
embarked             0
sibsp                0
parch                0
name                 0
ticket               0
fare                 0
title                0
ticket_people        0
ticket_per_person    0
dtype: int64

In [28]:
# Remaining missing values per column in the training split.
X_train_eg.isnull().sum()

pclass               0
sex                  0
age                  0
embarked             2
sibsp                0
parch                0
name                 0
ticket               0
fare                 0
title                0
ticket_people        0
ticket_per_person    0
dtype: int64

In [32]:
from sklearn.impute import SimpleImputer

# Fill missing 'embarked' values with the most frequent value of the training split.
impute = SimpleImputer(strategy='most_frequent')

# Only the training frame is transformed here; the null counts recorded above
# show no missing 'embarked' values in the validation or test split.
X_train_eg[['embarked']] = impute.fit_transform(X_train_eg[['embarked']])

In [34]:
# Show the test rows whose fare is missing.
X_test_eg[X_test_eg['fare'].isnull()]

Unnamed: 0,pclass,sex,age,embarked,sibsp,parch,name,ticket,fare,title,ticket_people,ticket_per_person
152,3,male,60.5,s,0,0,"storey, mr. thomas",3701,,mr,1,


In [36]:
# Impute missing test fares from the training split's mean per-person fare.
# Rows are selected by mask rather than a hard-coded row label, and 'fare' is
# rebuilt as per-person fare * ticket_people so the two columns stay
# consistent. For a passenger travelling alone (ticket_people == 1) this gives
# the same values as assigning the mean to both columns.
fare_missing = X_test_eg['fare'].isnull()
mean_fare_per_person = X_train_eg['ticket_per_person'].mean()

X_test_eg.loc[fare_missing, 'ticket_per_person'] = mean_fare_per_person
X_test_eg.loc[fare_missing, 'fare'] = (
    mean_fare_per_person * X_test_eg.loc[fare_missing, 'ticket_people']
)

### Pipeline 

In [40]:
# Inspect the engineered training frame before building the pipeline.
X_train_eg

Unnamed: 0,pclass,sex,age,embarked,sibsp,parch,name,ticket,fare,title,ticket_people,ticket_per_person
331,1,male,45.500000,s,0,0,"partner, mr. austen",113043,28.5000,mr,1,28.500000
733,2,male,23.000000,s,0,0,"berriman, mr. william john",28425,13.0000,mr,1,13.000000
382,3,male,32.000000,s,0,0,"tikkanen, mr. juho",ston/o 2. 3101293,7.9250,mr,1,7.925000
704,3,male,26.000000,s,1,0,"hansen, mr. henrik juul",350025,7.8542,mr,1,7.854200
813,3,female,6.000000,s,4,2,"andersson, miss. ebba iris alfrida",347082,31.2750,female_child,7,4.467857
...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,female,21.000000,s,0,0,"salkjelsvik, miss. anna kristine",343120,7.6500,miss,1,7.650000
270,1,male,40.431818,s,0,0,"cairns, mr. alexander",113798,31.0000,mr,2,15.500000
860,3,male,41.000000,s,2,0,"hansen, mr. claus peter",350026,14.1083,mr,2,7.054150
435,1,female,14.000000,s,1,2,"carter, miss. lucile polk",113760,120.0000,female_child,4,30.000000


In [41]:
# Column groups consumed by the ColumnTransformer below. Columns not listed in
# any group (e.g. name, ticket, fare, title) are dropped by remainder='drop'.
PASSTHROUGH_COLS = ['pclass']  # forwarded to the model unchanged
NUMERIC_COLS = ['age', 'sibsp', 'parch', 'ticket_people', 'ticket_per_person']  # standard-scaled
CATEGORICAL_COLS = ['sex', 'embarked']  # one-hot encoded

In [42]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: a single standard-scaling step.
num_pipeline = Pipeline(steps=[
    ('scale', StandardScaler())
])

# Categorical branch: dense one-hot encoding with the first level of each
# feature dropped.
# NOTE(review): with handle_unknown='ignore' an unseen category presumably
# encodes as all zeros, i.e. the same as the dropped level - confirm against
# the scikit-learn version in use.
cat_pipeline = Pipeline(steps=[
    ('one-hot-encoding', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False))
])

In [43]:
from sklearn.compose import ColumnTransformer

# Route each column group to its branch; any column not named in a group is
# discarded (remainder='drop').
col_transform = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, NUMERIC_COLS), 
    ('cat_pipeline', cat_pipeline, CATEGORICAL_COLS), 
    ('passthrough_cols', 'passthrough', PASSTHROUGH_COLS)
], remainder='drop')

In [52]:
from sklearn.metrics import confusion_matrix, f1_score

def fit_and_print(p, X_train=X_train_eg, y_train=y_train, X_val=X_val_eg, y_val=y_val):
    """Fit ``p`` on the training data and print train/validation metrics.

    For each split the row-normalised confusion matrix and the F1 score are
    printed. The default arguments are bound to the notebook's engineered
    frames when this function is defined.

    Args:
        p: Estimator or pipeline exposing ``fit`` and ``predict``.
        X_train, y_train: Data used to fit and to score the training split.
        X_val, y_val: Data used to score the validation split.
    """
    p.fit(X_train, y_train)

    # Both prediction passes run before anything is printed.
    fitted_preds = p.predict(X_train)
    holdout_preds = p.predict(X_val)

    sections = (
        ("Training Error:", y_train, fitted_preds),
        ("Validation Error:", y_val, holdout_preds),
    )
    for position, (heading, truth, predicted) in enumerate(sections):
        if position > 0:
            print("-" * 30)
        print(heading)
        print(confusion_matrix(truth, predicted, normalize='true'))
        print(f1_score(truth, predicted))

def test_predictions(p, X_train=X_train_eg, y_train=y_train, X_test=X_test_eg):
    """Fit ``p`` on the training data and return its predictions for ``X_test``.

    Args:
        p: Estimator or pipeline exposing ``fit`` and ``predict``.
        X_train, y_train: Data the model is (re)fitted on.
        X_test: Features to predict.

    Returns:
        Whatever ``p.predict(X_test)`` returns.
    """
    # Refit on every call so the predictions come from the training data passed in.
    p.fit(X_train, y_train)
    return p.predict(X_test)

In [57]:
# Linear Discriminant Analysis

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.pipeline import make_pipeline

ldr = LinearDiscriminantAnalysis()

# Chain the shared preprocessing (col_transform) with the classifier so both
# are fitted together on the same data.
ldr_pipeline = make_pipeline(col_transform, ldr)

# Fit on the training split and print train/validation metrics.

fit_and_print(ldr_pipeline)

Training Error:
[[0.86261261 0.13738739]
 [0.30970149 0.69029851]]
0.7198443579766537
------------------------------
Validation Error:
[[0.83809524 0.16190476]
 [0.28378378 0.71621622]]
0.7361111111111112


In [53]:
# Random Forest 

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Seed the forest so the metrics below are reproducible between runs; 42
# matches the seed already used for the train/validation split.
rfc = RandomForestClassifier(random_state=42)

# Chain the shared preprocessing (col_transform) with the classifier.
rfc_pipeline = make_pipeline(col_transform, rfc)

# Fit on the training split and print train/validation metrics.

fit_and_print(rfc_pipeline)



Training Error:
[[0.99324324 0.00675676]
 [0.04104478 0.95895522]]
0.9734848484848485
------------------------------
Validation Error:
[[0.84761905 0.15238095]
 [0.22972973 0.77027027]]
0.7755102040816326


In [58]:
# Build the submission frame. test_predictions refits ldr_pipeline on the
# training split before predicting. PassengerId comes from the raw test frame,
# so this relies on the cleaned test frame having the same rows in the same
# order (the cleaning log above reports no duplicates removed).
output = pd.DataFrame({'PassengerId': test_df_raw.PassengerId, 'Survived': test_predictions(ldr_pipeline)})



In [60]:
# Write the submission file to the current working directory, without the index column.
output.to_csv('submission5.csv', index=False)