## Prediction Pipeline with Iterative Imputation and Random Forests

In [165]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

In [166]:
def clean_dataframe(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` and optionally log data-quality findings.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:
      1. Column names are stripped, lowercased, and spaces become underscores.
      2. Exact duplicate rows are dropped.
      3. Non-null values in object columns are converted to ``str``, stripped
         and lowercased. Missing values are left missing.
      4. Common textual placeholders ('n/a', '--', '', ...) become ``NaN`` and
         the per-column missing counts are logged.
      5. Columns with a single distinct non-null value are reported.
      6. Object columns with more than 100 distinct values are reported.
      7. Numeric values outside 1.5 * IQR of the quartiles are counted and
         reported (nothing is removed).
      8. Object columns whose distinct-value count is below 5% of the row
         count are converted to ``category`` dtype.

    Args:
        df: Frame to clean. Column labels must be strings.
        verbose: When True, findings are printed with an ``[INFO]`` prefix.

    Returns:
        The cleaned copy of ``df``.
    """
    df = df.copy()

    # log helper
    def log(msg):
        if verbose:
            print(f"[INFO] {msg}")

    # 1. standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    log("Standardized column names.")

    # 2. remove exact duplicates
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        df.drop_duplicates(inplace=True)
        log(f"Removed {dup_count} duplicate rows.")

    # 3. trim and lowercase all string (object) values
    for col in df.select_dtypes(include='object'):
        # Remember which entries are real values *before* stringifying:
        # astype(str) would otherwise turn missing markers into text such as
        # "<na>" or "nat" that the placeholder list below does not cover.
        present = df[col].notna()
        normalized = df[col].astype(str).str.strip().str.lower()
        df[col] = normalized.where(present, np.nan)
    log("Standardized string columns (lowercase + trimmed).")

    # 4. detect missing values (including blanks and placeholders)
    placeholder_values = ['n/a', 'na', '--', '-', 'none', 'null', '', 'nan']
    df.replace(placeholder_values, np.nan, inplace=True)
    null_report = df.isnull().sum()
    null_report = null_report[null_report > 0]
    if not null_report.empty:
        log(f"Missing values found in columns:\n{null_report}")

    # 5. flag constant columns (nunique ignores missing values)
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        log(f"Constant columns (consider removing): {constant_cols}")

    # 6. flag high cardinality categorical columns
    high_card_cols = [col for col in df.select_dtypes(include='object') if df[col].nunique() > 100]
    if high_card_cols:
        log(f"High-cardinality columns (consider encoding strategies): {high_card_cols}")

    # 7. detect numeric outliers using IQR (report only; rows are kept)
    num_cols = df.select_dtypes(include=np.number).columns
    outlier_report = {}
    for col in num_cols:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        outliers = df[(df[col] < lower) | (df[col] > upper)][col].count()
        if outliers > 0:
            outlier_report[col] = outliers
    if outlier_report:
        log(f"Potential numeric outliers detected:\n{outlier_report}")

    # 8. convert applicable columns to category
    for col in df.select_dtypes(include='object'):
        n_unique = df[col].nunique()
        # "few distinct values" means fewer than 5% of the row count
        if n_unique < len(df) * 0.05:
            df[col] = df[col].astype('category')
    log("Converted suitable object columns to category dtype.")

    log("Data cleaning complete.")
    return df


In [167]:
# Directory holding train.csv and test.csv. Set the TITANIC_DATA_DIR environment
# variable to run the notebook from another machine or checkout; the default is
# the original location, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    'TITANIC_DATA_DIR',
    '/Users/thananpornsethjinda/Desktop/internship/projects/titanic-survival-prediction/data',
))

train_val_df_raw = pd.read_csv(DATA_DIR / 'train.csv')
test_df_raw = pd.read_csv(DATA_DIR / 'test.csv')

In [168]:
# Clean both raw frames with the same routine. The dashed rule only separates
# the two [INFO] log streams in the cell output.
train_val_df = clean_dataframe(df=train_val_df_raw)

print("-" * 100)

test_df = clean_dataframe(df=test_df_raw)

[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age         177
cabin       687
embarked      2
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket', 'cabin']
[INFO] Potential numeric outliers detected:
{'age': np.int64(11), 'sibsp': np.int64(46), 'parch': np.int64(213), 'fare': np.int64(116)}
[INFO] Converted suitable object columns to category dtype.
[INFO] Data cleaning complete.
----------------------------------------------------------------------------------------------------
[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age       86
fare       1
cabin    327
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket']
[INFO] Potential numeric outliers detected:
{'age': np.int64(2), 'sibsp': np.int64(11), 'parch': np.int64(94), 'fare': n

In [169]:
# (Currently disabled) Split the provided training data into train and validation sets.
# The test set remains untouched.

# from sklearn.model_selection import train_test_split

# train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

In [170]:
# X_train, y_train = train_df[['pclass', 'sex', 'age', 'embarked']], train_df['survived']
# X_val, y_val = val_df[['pclass', 'sex', 'age', 'embarked']], val_df['survived']
# X_test = test_df[['pclass', 'sex', 'age', 'embarked']]

In [171]:
# Features fed to the models, and the label column.
FEATURE_COLUMNS = ['pclass', 'sex', 'age', 'embarked']
TARGET_COLUMN = 'survived'

X_train = train_val_df[FEATURE_COLUMNS]
y_train = train_val_df[TARGET_COLUMN]
X_test = test_df[FEATURE_COLUMNS]

In [172]:
# Column groups; each group is handled by its own branch of the preprocessing step.
NUMERICAL_COLUMNS = ['age', 'pclass']

CATEGORICAL_COLUMNS = ['sex', 'embarked']

In [173]:
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [174]:
# num_pipeline = Pipeline(steps=[
#     ('impute', IterativeImputer(max_iter=10, random_state=42)),
#     ('scaler', StandardScaler())
# ])

In [None]:
# Categorical branch: fill gaps with each column's most frequent value, then
# one-hot encode. SimpleImputer only consults `fill_value` when
# strategy='constant', so the former fill_value='unknown' had no effect; it is
# dropped so the code no longer suggests that missing categories become an
# 'unknown' level.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot-encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

In [176]:
from sklearn.compose import ColumnTransformer

# Numeric columns pass through unchanged at this stage; categorical columns are
# imputed and one-hot encoded by cat_pipeline.
column_routes = [
    ('num', 'passthrough', NUMERICAL_COLUMNS),
    ('cat_pipeline', cat_pipeline, CATEGORICAL_COLUMNS),
]
preprocessor = ColumnTransformer(transformers=column_routes)

preprocessor

In [177]:
from sklearn.linear_model import BayesianRidge

# Encode first, then run IterativeImputer (BayesianRidge estimator, fixed seed)
# over the preprocessor's output to fill the remaining gaps.
iterative_imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('imputer', iterative_imputer),
])

pipeline

In [178]:
# from sklearn.compose import ColumnTransformer

# col_transformer = ColumnTransformer(transformers=[ 
#     ('cat_pipeline', cat_pipeline, CATEGORICAL_COLUMNS), 
#     ('impute', IterativeImputer(max_iter=10, random_state=42)),
#     ('scaler', StandardScaler())

# ], 
#     remainder='drop',
#     n_jobs=1
# )

# col_transformer

In [179]:
from sklearn.metrics import confusion_matrix, f1_score

def fit_and_print(p, X_train=X_train, y_train=y_train, X_val=None, y_val=None):
    """Fit ``p`` and print its confusion matrix and F1 score.

    Training-set metrics are always printed. Validation metrics are printed
    only when both ``X_val`` and ``y_val`` are supplied.

    Args:
        p: Estimator or pipeline exposing ``fit`` and ``predict``.
        X_train: Training features. The default is bound to the notebook-level
            ``X_train`` at the time this cell is executed.
        y_train: Training labels, bound the same way as ``X_train``.
        X_val: Optional validation features. Defaults to None because no
            validation split exists in the notebook right now; the previous
            default referenced an undefined global and broke the definition
            on a fresh kernel.
        y_val: Optional validation labels.
    """
    p.fit(X_train, y_train)
    train_preds = p.predict(X_train)
    print("Training Error:")
    print(confusion_matrix(y_train, train_preds))
    print(f1_score(y_train, train_preds))

    # Only score a validation set when the caller actually provides one.
    if X_val is not None and y_val is not None:
        val_preds = p.predict(X_val)
        print("-" * 30)
        print("Validation Error:")
        print(confusion_matrix(y_val, val_preds))
        print(f1_score(y_val, val_preds))

def test_predictions(p, X_train=X_train, y_train=y_train, X_test=X_test):
    """Refit ``p`` on the training data and return its predictions for ``X_test``.

    The defaults are bound to the notebook-level ``X_train``, ``y_train`` and
    ``X_test`` at the time this cell is executed.
    """
    # Every call refits the model, even if ``p`` was fitted before.
    p.fit(X_train, y_train)
    return p.predict(X_test)


In [180]:
# Random Forests 

from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier

# Depth-capped forest of 100 trees with a fixed seed.
rfc = RandomForestClassifier(random_state=42, max_depth=10, n_estimators=100)

# Put the preprocessing + imputation pipeline in front of the classifier.
rfc_pipeline = make_pipeline(pipeline, rfc)

# Fit and report training-set metrics.
fit_and_print(rfc_pipeline)



Training Error:
[[518  31]
 [ 60 282]]
0.8610687022900764


In [181]:
#Test 

# Take the ids from the cleaned frame that X_test was built from rather than
# from test_df_raw: clean_dataframe may drop duplicate rows, in which case the
# raw id column would no longer match the prediction array in length.
# clean_dataframe lowercases column names, hence 'passengerid'.
output = pd.DataFrame({
    'PassengerId': test_df['passengerid'].to_numpy(),
    'Survived': test_predictions(rfc_pipeline),
})

output.to_csv('submission2.3.csv', index=False)


In [183]:
# Show the submission frame built above as a final visual check.
output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [182]:
# LDA 

from sklearn.pipeline import make_pipeline

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Linear discriminant analysis with scikit-learn's default settings.
ldr = LinearDiscriminantAnalysis()

# Same preprocessing + imputation front end, with LDA as the final step.
ldr_pipeline = make_pipeline(pipeline, ldr)

# Fit and report training-set metrics.
fit_and_print(ldr_pipeline)

Training Error:
[[468  81]
 [100 242]]
0.7278195488721805
