# Baseline Model Predictions with Pipelines

In [None]:
import pandas as pd 
import numpy as np

In [2]:
# Single place to point the notebook at the data directory.
# TODO(review): this is a machine-specific absolute path; change it here when
# running the notebook on another machine.
DATA_DIR = '/Users/thananpornsethjinda/Desktop/internship/projects/titanic-survival-prediction/data'

# Raw, untouched copies of the two input files.
train_val_df_raw = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df_raw = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:

def clean_dataframe(df: pd.DataFrame, verbose: bool = True) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` and optionally log data-quality findings.

    The input frame is never modified. Steps, in order:

    1. Column names are stripped, lower-cased, and spaces become underscores.
    2. Exact duplicate rows are dropped.
    3. Object columns are cast to ``str``, stripped and lower-cased.
    4. Placeholder strings (``'n/a'``, ``''``, ``'nan'``, ...) become ``NaN``.
    5. Constant columns are reported (not removed).
    6. Object columns with more than 100 distinct values are reported.
    7. Numeric values outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` are counted and
       reported (not removed).
    8. Object columns whose number of distinct values is below 5% of the row
       count are converted to ``category`` dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean.
    verbose : bool, default True
        When True, progress and findings are printed with an ``[INFO]`` prefix.

    Returns
    -------
    pd.DataFrame
        The cleaned copy. The original index labels are kept, so rows removed
        as duplicates leave gaps in the index.
    """
    df = df.copy()

    # log helper
    def log(msg):
        if verbose:
            print(f"[INFO] {msg}")

    # 1. standardize column names
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    log("Standardized column names.")

    # 2. remove exact duplicates
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        df = df.drop_duplicates()
        log(f"Removed {dup_count} duplicate rows.")

    # 3. trim and lowercase all string (object) values
    # astype(str) turns missing values into the literal text 'nan' / 'none';
    # step 4 maps those back to NaN, so both must stay in the placeholder list.
    for col in df.select_dtypes(include='object').columns:
        df[col] = df[col].astype(str).str.strip().str.lower()
    log("Standardized string columns (lowercase + trimmed).")

    # 4. detect missing values (including blanks and placeholders)
    placeholder_values = ['n/a', 'na', '--', '-', 'none', 'null', '', 'nan']
    df = df.replace(placeholder_values, np.nan)
    null_report = df.isnull().sum()
    null_report = null_report[null_report > 0]
    if not null_report.empty:
        log(f"Missing values found in columns:\n{null_report}")

    # 5. flag constant columns
    constant_cols = [col for col in df.columns if df[col].nunique() == 1]
    if constant_cols:
        log(f"Constant columns (consider removing): {constant_cols}")

    # 6. flag high cardinality categorical columns
    high_card_cols = [
        col
        for col in df.select_dtypes(include='object').columns
        if df[col].nunique() > 100
    ]
    if high_card_cols:
        log(f"High-cardinality columns (consider encoding strategies): {high_card_cols}")

    # 7. detect numeric outliers using IQR
    outlier_report = {}
    for col in df.select_dtypes(include=np.number).columns:
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # Comparisons against NaN are False, so missing values are never counted.
        # Cast to a plain int so the logged dict reads {'age': 11} rather than
        # {'age': np.int64(11)}.
        outliers = int(((df[col] < lower) | (df[col] > upper)).sum())
        if outliers > 0:
            outlier_report[col] = outliers
    if outlier_report:
        log(f"Potential numeric outliers detected:\n{outlier_report}")

    # 8. convert applicable columns to category
    for col in df.select_dtypes(include='object').columns:
        n_unique = df[col].nunique()
        if n_unique < len(df) * 0.05:
            df[col] = df[col].astype('category')
    log("Converted suitable object columns to category dtype.")

    log("Data cleaning complete.")
    return df


In [None]:

# Clean the labelled data; clean_dataframe prints its [INFO] report as it runs.
train_val_df = clean_dataframe(train_val_df_raw)

# Visual separator between the two cleaning reports.
print("-" * 100)

# Clean the unlabelled test data with the same function.
test_df = clean_dataframe(test_df_raw)

[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age         177
cabin       687
embarked      2
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket', 'cabin']
[INFO] Potential numeric outliers detected:
{'age': np.int64(11), 'sibsp': np.int64(46), 'parch': np.int64(213), 'fare': np.int64(116)}
[INFO] Converted suitable object columns to category dtype.
[INFO] Data cleaning complete.
----------------------------------------------------------------------------------------------------
[INFO] Standardized column names.
[INFO] Standardized string columns (lowercase + trimmed).
[INFO] Missing values found in columns:
age       86
fare       1
cabin    327
dtype: int64
[INFO] High-cardinality columns (consider encoding strategies): ['name', 'ticket']
[INFO] Potential numeric outliers detected:
{'age': np.int64(2), 'sibsp': np.int64(11), 'parch': np.int64(94), 'fare': n

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned labelled data for validation; random_state fixes the shuffle.
# NOTE(review): no `stratify=` is passed, so the class balance of 'survived'
# may differ between the two splits.
train_df, val_df = train_test_split(train_val_df, test_size=0.2, random_state=42)

In [None]:
# Columns fed to the models and the label column, defined once so the
# train / validation / test selections cannot drift apart.
FEATURE_COLUMNS = ['pclass', 'sex', 'age', 'embarked']
TARGET_COLUMN = 'survived'

X_train, y_train = train_df[FEATURE_COLUMNS], train_df[TARGET_COLUMN]
X_val, y_val = val_df[FEATURE_COLUMNS], val_df[TARGET_COLUMN]
# The test split is only sliced for features; no target column is read from it.
X_test = test_df[FEATURE_COLUMNS]

In [6]:
# Preview the training feature matrix (bare last expression -> rich display).
X_train

Unnamed: 0,pclass,sex,age,embarked
331,1,male,45.5,s
733,2,male,23.0,s
382,3,male,32.0,s
704,3,male,26.0,s
813,3,female,6.0,s
...,...,...,...,...
106,3,female,21.0,s
270,1,male,,s
860,3,male,41.0,s
435,1,female,14.0,s


In [7]:
# Feature groups used to route columns to the matching preprocessing branch.
# 'pclass' is listed with the categorical features.
CATEGORICAL_COLUMNS = [
    'sex',
    'embarked',
    'pclass',
]

NUMERICAL_COLUMNS = [
    'age',
]

In [8]:
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from copy import deepcopy

In [9]:
# Numeric branch: fill missing values with the column mean, then standardize.
num_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

In [10]:
# Categorical branch: fill missing values with the most frequent level, then
# one-hot encode. Levels not seen during fit are ignored at transform time,
# and the encoder returns a dense array.
cat_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='most_frequent')),
        (
            'one-hot-encoder',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
        ),
    ]
)

In [11]:
from sklearn.compose import ColumnTransformer

# Route each feature group through its own preprocessing branch; any column
# not listed in either group is dropped.
col_transformer = ColumnTransformer(
    [
        ('num-pipeline', num_pipeline, NUMERICAL_COLUMNS),
        ('cat-pipeline', cat_pipeline, CATEGORICAL_COLUMNS),
    ],
    n_jobs=-1,
    remainder='drop',
)

In [12]:
from sklearn.metrics import confusion_matrix, f1_score

def fit_and_print(p, X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val):
    """Fit ``p`` on the training split and print metrics for both splits.

    For the training split and then the validation split this prints a
    heading, the row-normalized confusion matrix and the F1 score. Nothing is
    returned; ``p`` is fitted in place.

    Note: the default arguments are bound to the notebook-level splits once,
    when this cell is executed. Re-run this cell after re-creating the splits,
    or pass the data explicitly.
    """
    p.fit(X_train, y_train)

    # Predict for both splits up front, before anything is printed.
    evaluated = [
        (heading, labels, p.predict(features))
        for heading, features, labels in (
            ("Training Error:", X_train, y_train),
            ("Validation Error:", X_val, y_val),
        )
    ]

    for position, (heading, labels, predicted) in enumerate(evaluated):
        if position > 0:
            # separator line between the two reports
            print("-" * 30)
        print(heading)
        print(confusion_matrix(labels, predicted, normalize='true'))
        print(f1_score(labels, predicted))

In [13]:
from sklearn.pipeline import make_pipeline

In [14]:
### Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver = 'liblinear')

# make_pipeline keeps the estimator objects it is given rather than copying
# them. Give this pipeline its own deep copy of the preprocessing transformer
# so that fitting another pipeline built from col_transformer cannot refit
# the preprocessing used by p_lr.
p_lr = make_pipeline(deepcopy(col_transformer), lr)

### Performance 

fit_and_print(p_lr)

Training Error:
[[0.85135135 0.14864865]
 [0.29850746 0.70149254]]
0.7203065134099617
------------------------------
Validation Error:
[[0.83809524 0.16190476]
 [0.27027027 0.72972973]]
0.7448275862068966


In [15]:
### LDA 

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

lda = LinearDiscriminantAnalysis(solver='svd')

# make_pipeline keeps the estimator objects it is given rather than copying
# them. Give this pipeline its own deep copy of the preprocessing transformer
# so that fitting it does not refit the shared col_transformer instance.
p_lda = make_pipeline(deepcopy(col_transformer), lda)

### Performance 

fit_and_print(p_lda)

Training Error:
[[0.85135135 0.14864865]
 [0.31716418 0.68283582]]
0.7079303675048356
------------------------------
Validation Error:
[[0.83809524 0.16190476]
 [0.27027027 0.72972973]]
0.7448275862068966


In [16]:
### Random Forests 

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

# make_pipeline keeps the estimator objects it is given rather than copying
# them. Give this pipeline its own deep copy of the preprocessing transformer
# so that fitting it does not refit the shared col_transformer instance.
p_rfc = make_pipeline(deepcopy(col_transformer), rfc)

### Performance 

fit_and_print(p_rfc)

Training Error:
[[0.95495495 0.04504505]
 [0.19029851 0.80970149]]
0.8594059405940594
------------------------------
Validation Error:
[[0.80952381 0.19047619]
 [0.25675676 0.74324324]]
0.738255033557047


In [17]:
### KNN 

from sklearn.neighbors import KNeighborsClassifier

knnc = KNeighborsClassifier()

# make_pipeline keeps the estimator objects it is given rather than copying
# them. Give this pipeline its own deep copy of the preprocessing transformer
# so that fitting it does not refit the shared col_transformer instance.
p_knnc = make_pipeline(deepcopy(col_transformer), knnc)

### Performance 

fit_and_print(p_knnc)

Training Error:
[[0.90315315 0.09684685]
 [0.2238806  0.7761194 ]]
0.8015414258188824
------------------------------
Validation Error:
[[0.84761905 0.15238095]
 [0.31081081 0.68918919]]
0.723404255319149


In [18]:
# Predict the test split with the logistic-regression pipeline.
test_predictions = p_lr.predict(X_test)

In [19]:
# Read the ids from the cleaned test frame (clean_dataframe lower-cases column
# names, hence 'passengerid'). X_test is sliced from test_df, so these ids are
# row-aligned with test_predictions even if cleaning dropped duplicate rows,
# which would make test_df_raw longer than the prediction array.
output = pd.DataFrame({'PassengerId': test_df['passengerid'], 'Survived': test_predictions})