In [19]:
import pandas as pd
import numpy as np
import feather

import warnings
warnings.filterwarnings("ignore")

from pycaret.classification import *

In [20]:
# Read the training and test tables from CSV files; the relative names are
# resolved against the notebook's current working directory.
train = pd.read_csv(filepath_or_buffer='train_df.csv')
test = pd.read_csv(filepath_or_buffer='test_df.csv')

In [21]:
# Show the column labels of the training frame as the cell output.
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked', 'Title', 'age_group', 'travel_party', 'not_alone'],
      dtype='object')

In [26]:
# Bring the PyCaret classification API into the notebook namespace.
from pycaret.classification import *

# Configure the experiment on `train` with 'Survived' as the target.
# The columns listed under ignore_features are excluded from modelling,
# multicollinearity filtering is enabled with a 0.9 threshold, and the
# session id is fixed at 42.
clf = setup(
    data=train,
    target='Survived',
    session_id=42,
    ignore_features=[
        'PassengerId',
        'age_group',
        'Fare',
        'Title',
        'Embarked',
        'SibSp',
        'not_alone',
        'Parch',
    ],
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(891, 13)"
4,Transformed data shape,"(891, 5)"
5,Transformed train set shape,"(623, 5)"
6,Transformed test set shape,"(268, 5)"
7,Ignore features,8
8,Numeric features,3
9,Categorical features,1


In [27]:
# Run PyCaret's model comparison and keep what it returns as `best_model`
# (presumably the top row of the leaderboard shown in the output — verify
# which metric the ranking uses).
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8314,0.8618,0.6946,0.8404,0.7594,0.6316,0.6393,0.079
lightgbm,Light Gradient Boosting Machine,0.8187,0.8478,0.7074,0.8025,0.7492,0.6084,0.6138,0.133
gbc,Gradient Boosting Classifier,0.8123,0.8537,0.6822,0.8064,0.735,0.5917,0.6,0.013
xgboost,Extreme Gradient Boosting,0.8089,0.8329,0.6947,0.7884,0.737,0.5879,0.5923,0.009
qda,Quadratic Discriminant Analysis,0.8074,0.8562,0.7033,0.7784,0.735,0.5851,0.5903,0.006
rf,Random Forest Classifier,0.8041,0.8337,0.7076,0.7698,0.7345,0.5802,0.5844,0.031
ada,Ada Boost Classifier,0.801,0.8248,0.7284,0.7558,0.736,0.5776,0.5834,0.013
lda,Linear Discriminant Analysis,0.7994,0.856,0.6824,0.7704,0.7215,0.5662,0.5706,0.007
ridge,Ridge Classifier,0.7978,0.0,0.6783,0.7695,0.7185,0.5623,0.5671,0.006
nb,Naive Bayes,0.7946,0.8458,0.6866,0.7578,0.7175,0.5574,0.5615,0.007


In [28]:
# Produce the deployable model from the comparison winner.
# NOTE(review): PyCaret documents finalize_model as refitting on the full
# dataset (training split plus hold-out) — confirm for the installed version.
final_model = finalize_model(best_model)

In [29]:
# Get the feature matrix after PyCaret's preprocessing pipeline has run.
# In PyCaret 3.x the config key 'X' is the feature set *before* transformation;
# 'X_transformed' is the pipeline output, which is what this cell is meant to
# inspect.
processed_data = get_config('X_transformed')

# Show the feature names the model is actually trained on
print(processed_data.columns)

Index(['Pclass', 'Sex', 'Age', 'travel_party'], dtype='object')


In [30]:
# Score the external test frame with the finalized model. The returned frame
# holds the input columns plus 'prediction_label' and 'prediction_score'.
predictions = predict_model(final_model, data=test)

# Preview the first rows as the cell's last expression so the notebook renders
# a rich table instead of the wrapped plain text that print() produces.
predictions.head()

   PassengerId  Pclass     Sex   Age  SibSp  Parch     Fare Embarked Title  \
0          892       3    male  34.5      0      0   7.8292        Q    Mr   
1          893       3  female  47.0      1      0   7.0000        S   Mrs   
2          894       2    male  62.0      0      0   9.6875        Q    Mr   
3          895       3    male  27.0      0      0   8.6625        S    Mr   
4          896       3  female  22.0      1      1  12.2875        S   Mrs   

     age_group  travel_party  not_alone  prediction_label  prediction_score  
0        Adult             0          1                 0            0.9287  
1        Adult             1          0                 0            0.8384  
2       Senior             0          1                 0            0.7247  
3  Young Adult             0          1                 0            0.7861  
4  Young Adult             2          0                 1            0.5422  


In [32]:
# List the columns of the prediction frame (inputs plus the prediction outputs).
print(predictions.columns)

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked', 'Title', 'age_group', 'travel_party', 'not_alone',
       'prediction_label', 'prediction_score'],
      dtype='object')


In [33]:
# Open PyCaret's interactive evaluation widget for the model fitted on the
# training split only, so the hold-out diagnostics are unbiased.
# `final_model` comes from finalize_model(), which PyCaret documents as
# refitting on the whole dataset including the hold-out rows; evaluating it
# here would score the model on data it has already seen.
# Note: this uses the hold-out split made by setup(), not the `test` frame,
# which carries no 'Survived' labels.
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [34]:
# Persist the finalized pipeline and model under the name 'best_titanic_model'
# in the working directory; the cell output echoes the saved pipeline.
save_model(final_model, 'best_titanic_model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Pclass', 'Age', 'travel_party'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean',
                                                               verbose='deprecated'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None,...
                                                                handle_unknown='value',
                            

In [None]:
from pathlib import Path

from pycaret.classification import save_model

# Make sure the target folder exists first: the save would otherwise fail on a
# fresh checkout where 'model/' has not been created yet.
Path('model').mkdir(parents=True, exist_ok=True)

# Save the model as 'best_titanic_model' inside the 'model' directory
save_model(final_model, 'model/best_titanic_model')
