In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.pipeline import Pipeline # Import Pipeline
import warnings
warnings.filterwarnings('ignore')

In [None]:
DATA_PATH = 'data.csv' # <-- change to your CSV
TARGET_COL = 'Survived' # <-- change to your target column name
RANDOM_STATE = 42
TEST_SIZE = 0.2

In [None]:
# IQR-based outlier detection (returns boolean mask)
def iqr_outlier_mask(series, k=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return (series < lower) | (series > upper)

# Cap outliers using IQR bounds (winsorize)
def cap_outliers_iqr(df, cols, k=1.5):
    df = df.copy()
    for c in cols:
        if not np.issubdtype(df[c].dtype, np.number):
            continue
        q1 = df[c].quantile(0.25)
        q3 = df[c].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - k * iqr
        upper = q3 + k * iqr
        df[c] = np.where(df[c] < lower, lower, df[c])
        df[c] = np.where(df[c] > upper, upper, df[c])
    return df

# Remove rows with outliers in selected columns
def remove_outliers_iqr(df, cols, k=1.5):
    df = df.copy()
    mask = pd.Series(False, index=df.index)
    for c in cols:
        if not np.issubdtype(df[c].dtype, np.number):
            continue
        mask = mask | iqr_outlier_mask(df[c], k=k)
    return df.loc[~mask]


In [None]:
# VIF calculation (expects only numeric features, no constant column)
def calculate_vif(df_numeric):
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    X = df_numeric.copy()
    # Add small constant if needed
    X = X.fillna(0)
    vif_data = pd.DataFrame()
    vif_data['feature'] = X.columns
    vif_data['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    return vif_data

# Remove features by VIF threshold iteratively
def remove_high_vif_features(df_numeric, threshold=5.0):
    df = df_numeric.copy()
    dropped_features = []  # Initialize list to store dropped features
    while True:
        vif = calculate_vif(df)
        max_vif = vif['VIF'].max()
        if max_vif <= threshold:
            break
        feature_to_drop = vif.sort_values('VIF', ascending=False).iloc[0]['feature']
        print(f"Dropping '{feature_to_drop}' with VIF={max_vif:.2f}")
        df = df.drop(columns=[feature_to_drop])
        dropped_features.append(feature_to_drop) # Add dropped feature to list
    return df, dropped_features

In [None]:
def evaluate_logistic(df, features, target, random_state=RANDOM_STATE, scale=True, C=1.0):
    # Expects df contains target (no NaNs in target)
    X = df[features].copy()
    y = df[target].copy().astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state, stratify=y
    )
    steps = []
    if scale:
        steps.append(('scaler', StandardScaler()))
    steps.append(('clf', LogisticRegression(max_iter=2000, C=C)))
    pipe = Pipeline(steps)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    probs = pipe.predict_proba(X_test)[:,1] if hasattr(pipe, "predict_proba") else None

    results = {
        'accuracy': accuracy_score(y_test, preds),
        'roc_auc': roc_auc_score(y_test, probs) if probs is not None and len(np.unique(y_test))>1 else None,
        'classification_report': classification_report(y_test, preds, digits=4)
    }
    return results

def print_eval(name, eval_dict):
    print(f"\n---- {name} ----")
    print("Accuracy :", round(eval_dict['accuracy'], 4))
    if eval_dict['roc_auc'] is not None:
        print("ROC AUC  :", round(eval_dict['roc_auc'], 4))
    print("Classification report:\n", eval_dict['classification_report'])

In [None]:
# Main
if __name__ == '__main__':
  # Load data
  df = pd.read_csv("/content/Titanic_test.csv")
  df1 = pd.read_csv("/content/Titanic_train.csv")
  display(df.head())
  display(df1.head())

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
df_train = pd.read_csv("/content/Titanic_train.csv")
df_test = pd.read_csv("/content/Titanic_test.csv")

df_combined = pd.concat([df_train, df_test], ignore_index=True)

display(df_combined)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
1304,1305,,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
1305,1306,,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
1306,1307,,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
1307,1308,,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [None]:
#To check dataframe
df_combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 122.8+ KB


In [None]:
#Calculate Statistics for numerical columns
df_combined.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,1309.0,891.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,0.383838,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.486592,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,0.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,1.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,1.0,3.0,80.0,8.0,9.0,512.3292


In [None]:
print("\nMissing value counts (combined):")
print(df.isnull().sum().sort_values(ascending=False))



Missing value counts (combined):
Cabin          327
Age             86
Fare             1
Name             0
Pclass           0
PassengerId      0
Sex              0
Parch            0
SibSp            0
Ticket           0
Embarked         0
dtype: int64


In [None]:
#To display the rows that have atleast 1 missing values
#df_combined[df_combined.isnull().any(axis=1)]


In [None]:
#removing rows that have atleast 1 missing value across any column
df_combined = df_combined.dropna()
df_combined.shape

(183, 12)

In [None]:
df_prep = df.copy()

In [None]:
# HasCabin flag
df_prep['HasCabin'] = df_prep['Cabin'].notna().astype(int)

# Drop raw Cabin column
if 'Cabin' in df_prep.columns:
    df_prep = df_prep.drop(columns=['Cabin'])

In [None]:
# Impute Age (median).
# Option: median by Pclass+Sex for better imputation
df_prep['Age'] = df_prep['Age'].fillna(df_prep['Age'].median())

In [None]:
# Imput Fare
if 'Fare' in df_prep.columns:
  df_prep['Fare'] = df_prep['Fare'].fillna(df_prep['Fare'].median())

In [None]:
# Impute Embarked(mode)
if 'Embarked' in df_prep.columns:
  df_prep['Embarked'] = df_prep['Embarked'].fillna(df_prep['Embarked'].mode()[0])

# Drop columns not used
to_drop = [c for c in ['PassengerID', 'Name', 'Ticket'] if c in df_prep.columns]
df_prep = df_prep.drop(columns=to_drop)

In [None]:
# One-hot encode categorical columns
categorical = []
if 'Sex' in df_prep.columns:
  categorical.append('Sex')
if 'Embarked' in df_prep.columns:
  categorical.append('Embarked')
df_prep = pd.get_dummies(df_prep, columns=categorical, drop_first=True)

In [None]:
# Prepare dataset for modeling (use only rows with Survived not null)

# Apply the same preprocessing steps to df_train as applied to df_test (df_prep)
df_train_preprocessed = df_train.copy()

# HasCabin flag
df_train_preprocessed['HasCabin'] = df_train_preprocessed['Cabin'].notna().astype(int)

# Drop raw Cabin column
if 'Cabin' in df_train_preprocessed.columns:
    df_train_preprocessed = df_train_preprocessed.drop(columns=['Cabin'])

# Impute Age (median).
df_train_preprocessed['Age'] = df_train_preprocessed['Age'].fillna(df_train_preprocessed['Age'].median())

# Impute Embarked (mode)
if 'Embarked' in df_train_preprocessed.columns:
  df_train_preprocessed['Embarked'] = df_train_preprocessed['Embarked'].fillna(df_train_preprocessed['Embarked'].mode()[0])

# Drop columns not used
to_drop = [c for c in ['PassengerId', 'Name', 'Ticket'] if c in df_train_preprocessed.columns]
df_train_preprocessed = df_train_preprocessed.drop(columns=to_drop)

# One-hot encode categorical columns
categorical = []
if 'Sex' in df_train_preprocessed.columns:
  categorical.append('Sex')
if 'Embarked' in df_train_preprocessed.columns:
  categorical.append('Embarked')
df_train_preprocessed = pd.get_dummies(df_train_preprocessed, columns=categorical, drop_first=True)


df_labeled = df_train_preprocessed[df_train_preprocessed[TARGET_COL].notna()].copy()
df_labeled[TARGET_COL] = df_labeled[TARGET_COL].astype(int)  # ensure int

print("\nLabeled rows (used for training):", df_labeled.shape)

# Identify numeric feature columns for model (exclude TARGET_COL)
feature_cols = [c for c in df_labeled.columns if c != TARGET_COL and np.issubdtype(df_labeled[c].dtype, np.number)]
print("\nNumeric feature columns used for modeling:")
print(feature_cols)


Labeled rows (used for training): (891, 10)

Numeric feature columns used for modeling:
['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'HasCabin']


In [None]:
# Baseline evaluation (before outlier/VIF handling)

baseline_eval = evaluate_logistic(df_labeled, feature_cols, TARGET_COL)
print_eval("Baseline (before outlier/VIF changes)", baseline_eval)

# Save baseline metrics for comparison
results_summary = []
results_summary.append({
    'stage': 'baseline',
    'accuracy': baseline_eval['accuracy'],
    'roc_auc': baseline_eval['roc_auc'],
    'features': feature_cols.copy()
})



---- Baseline (before outlier/VIF changes) ----
Accuracy : 0.6704
ROC AUC  : 0.6886
Classification report:
               precision    recall  f1-score   support

           0     0.6861    0.8545    0.7611       110
           1     0.6190    0.3768    0.4685        69

    accuracy                         0.6704       179
   macro avg     0.6526    0.6157    0.6148       179
weighted avg     0.6603    0.6704    0.6483       179



In [None]:
# Outlier treatment

df_capped = cap_outliers_iqr(df_labeled, feature_cols, k=1.5)
capped_eval = evaluate_logistic(df_capped, feature_cols, TARGET_COL)
print_eval("After capping outliers (IQR winsorize)", capped_eval)
results_summary.append({
    'stage': 'cap_outliers',
    'accuracy': capped_eval['accuracy'],
    'roc_auc': capped_eval['roc_auc'],
    'features': feature_cols.copy()
})


---- After capping outliers (IQR winsorize) ----
Accuracy : 0.676
ROC AUC  : 0.6732
Classification report:
               precision    recall  f1-score   support

           0     0.6940    0.8455    0.7623       110
           1     0.6222    0.4058    0.4912        69

    accuracy                         0.6760       179
   macro avg     0.6581    0.6256    0.6268       179
weighted avg     0.6663    0.6760    0.6578       179



In [None]:
#  Multicollinearity (VIF) check on original numeric features

vif_before = calculate_vif(df_labeled[feature_cols])
print("\nVIF before removal (top 15):\n", vif_before.sort_values('VIF', ascending=False).head(15))

# Remove high VIF features iteratively (use threshold 5.0)
vif_threshold = 5.0
reduced_df, dropped_features = remove_high_vif_features(df_labeled[feature_cols], threshold=vif_threshold)
reduced_features = reduced_df.columns.tolist()

print("\nFeatures dropped by VIF removal:", dropped_features)
print("Features kept after VIF removal:", reduced_features)


VIF before removal (top 15):
     feature       VIF
1       Age  4.866118
0    Pclass  4.141126
5  HasCabin  2.031801
4      Fare  1.994727
2     SibSp  1.559542
3     Parch  1.549741

Features dropped by VIF removal: []
Features kept after VIF removal: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'HasCabin']


In [None]:
vif_reduced_eval = evaluate_logistic(df_labeled, reduced_features, TARGET_COL)
print_eval(f"After VIF-based feature removal (threshold={vif_threshold})", vif_reduced_eval)
results_summary.append({
    'stage': f'vif_removed_{vif_threshold}',
    'accuracy': vif_reduced_eval['accuracy'],
    'roc_auc': vif_reduced_eval['roc_auc'],
    'features': reduced_features.copy()
})




---- After VIF-based feature removal (threshold=5.0) ----
Accuracy : 0.6704
ROC AUC  : 0.6886
Classification report:
               precision    recall  f1-score   support

           0     0.6861    0.8545    0.7611       110
           1     0.6190    0.3768    0.4685        69

    accuracy                         0.6704       179
   macro avg     0.6526    0.6157    0.6148       179
weighted avg     0.6603    0.6704    0.6483       179



In [None]:
#  Combined: cap outliers AND remove high-VIF features then evaluate

df_combined = cap_outliers_iqr(df_labeled, feature_cols, k=1.5)
# compute VIF on df_combined numeric columns and remove
numeric_combined = [c for c in df_combined.columns if c != TARGET_COL and np.issubdtype(df_combined[c].dtype, np.number)]
reduced_df_combined, dropped_combined = remove_high_vif_features(df_combined[numeric_combined], threshold=vif_threshold)
final_features_combined = reduced_df_combined.columns.tolist()

combined_eval = evaluate_logistic(df_combined, final_features_combined, TARGET_COL)
print_eval("Combined: cap outliers + VIF-based removal", combined_eval)
results_summary.append({
    'stage': 'combined_cap_vif',
    'accuracy': combined_eval['accuracy'],
    'roc_auc': combined_eval['roc_auc'],
    'features': final_features_combined.copy()
})

Dropping 'Age' with VIF=5.63

---- Combined: cap outliers + VIF-based removal ----
Accuracy : 0.6425
ROC AUC  : 0.6746
Classification report:
               precision    recall  f1-score   support

           0     0.6716    0.8182    0.7377       110
           1     0.5556    0.3623    0.4386        69

    accuracy                         0.6425       179
   macro avg     0.6136    0.5903    0.5882       179
weighted avg     0.6269    0.6425    0.6224       179



In [None]:
#  Summary table of results

summary_df = pd.DataFrame([{
    'stage': r['stage'],
    'accuracy': r['accuracy'],
    'roc_auc': r['roc_auc'],
    'n_features': len(r['features'])
} for r in results_summary])

print("\n=== Summary of evaluations ===")
print(summary_df.sort_values('stage'))


=== Summary of evaluations ===
              stage  accuracy   roc_auc  n_features
0          baseline  0.670391  0.688603           6
1      cap_outliers  0.675978  0.673188           6
3  combined_cap_vif  0.642458  0.674572           5
2   vif_removed_5.0  0.670391  0.688603           6


In [None]:
# Final notes & save cleaned dataset

# Save the preprocessed combined dataset (optional)
cleaned_path = "/content/Titanic_preprocessed_combined.csv"
df_prep.to_csv(cleaned_path, index=False)
print(f"\nSaved preprocessed combined dataset to: {cleaned_path}")

# Print final recommended features (combined option)
print("\nFinal recommended features (combined pipeline):")
print(final_features_combined)


Saved preprocessed combined dataset to: /content/Titanic_preprocessed_combined.csv

Final recommended features (combined pipeline):
['Pclass', 'SibSp', 'Parch', 'Fare', 'HasCabin']


In [None]:
import joblib
final_features = reduced_df.columns.tolist()  # from previous step

X_final = df_labeled[final_features]
y_final = df_labeled['Survived']

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

final_pipe = Pipeline([('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=2000))])
final_pipe.fit(X_final, y_final)

artifact = {'pipeline': final_pipe, 'features': final_features}
joblib.dump(artifact, '/content/model.pkl')   # in Colab; use /mnt/data/model.pkl in other env
print("Saved model.pkl to /content/model.pkl")
print("Final features saved:", final_features)


Saved model.pkl to /content/model.pkl
Final features saved: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'HasCabin']


**1. What is the difference between precision and recall?**

- Precision = TP / (TP + FP). It measures how many of the positive predictions are actually positive. High precision means when the model predicts "positive", it's usually correct.

- Recall (Sensitivity) = TP / (TP + FN). It measures how many of the actual positive cases the model correctly identified. High recall means the model finds most of the positive cases. Tradeoff: increasing recall often lowers precision and vice versa. Choose based on whether false positives or false negatives are more costly

**2. What is cross-validation, and why is it important in binary classification?**

Cross-validation is a method to estimate the generalization performance of a model by splitting the dataset into multiple train/test folds (e.g., k-fold CV) and averaging performance across folds.
Its important because it gives a more robust estimate of model performance than a sigle train/test split ,helps detect overfitting, and can be used to tune hyperparameters reliably.