# Model Development
By: Jonathan Lo<br>
Date: 8/14/23

## Overhead

In [1]:
# Imports
import pandas as pd
import os

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from joblib import dump, load

In [2]:
# Ensure the models folder exists before any model is serialized into it.
# makedirs(exist_ok=True) is a single call that does nothing when the folder
# is already present, so there is no gap between "check" and "create", and it
# also creates any missing parent directories.
os.makedirs(r'./../models', exist_ok=True)

In [3]:
# Load the Titanic table and immediately discard the identifier / free-text
# columns that are not used as model inputs.
titanic_df = pd.read_csv("titanic.csv").drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin']
)

# Column groups, split by how they are preprocessed
categorical_cols = ['Sex', 'Embarked']
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

# Target is the 'Survived' column; every remaining column is a feature
y = titanic_df['Survived']
X = titanic_df.drop(columns='Survived')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preprocessing setup
# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown='ignore' avoids errors on categories
# that were not present when the encoder was fitted).
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Numerical columns: fill missing values with the column mean.
numerical_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='mean'))]
)

# Send each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('cat', categorical_transformer, categorical_cols),
        ('num', numerical_transformer, numerical_cols),
    ]
)

## Random Forest

In [5]:
# Train RFC: preprocessing followed by a seeded random forest.
# Pipeline.fit returns the pipeline itself, so construction and fitting
# can be written as one expression.
pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=1)),
    ]
).fit(X_train, y_train)

# Accuracy on the held-out test split
test_accuracy = pipeline.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.8044692737430168


In [6]:
# Serialize the random forest pipeline into the models folder
dump(value=pipeline, filename="./../models/RandomForestClassifier.pkl")

['./../models/RandomForestClassifier.pkl']

In [35]:
import requests

# Smoke-test the prediction API by posting a single passenger record.
# NOTE(review): this assumes an API server is listening on 127.0.0.1:5000 and
# can already find "AdaBoostClassifier.pkl". In notebook order this cell comes
# before the AdaBoost training/serialization cells, so confirm it still works
# after Restart Kernel -> Run All.
res = requests.post(
    url="http://127.0.0.1:5000/api/model",
    params = {
        "model": "AdaBoostClassifier.pkl"
    },
    headers = {
        "Content-Type": "application/json"
    },
    json = {
        "PassengerId": "7",
        "Pclass": "1",
        "Name": "McCarthy, Mr. Timothy J",
        "Sex": "male",
        "Age": "54",
        "SibSp": "0",
        "Parch": "0",
        "Ticket": "17463",
        "Fare": "51.8625",
        "Cabin": "E46",
        "Embarked": "S"
    },
    # Without a timeout the call can block the kernel indefinitely when the
    # server is down or unresponsive.
    timeout=10,
)

# Show the decoded JSON body as the cell output; the bare assignment above
# displays nothing on its own.
res.json()

{'message': 'Model prediction for model: AdaBoostClassifier.pkl', 'prediction': 0, 'success': 1}


## AdaBoost

In [7]:
# Train AdaBoost: same preprocessing, followed by a seeded AdaBoost classifier.
adaboost_steps = [
    ('preprocessor', preprocessor),
    ('classifier', AdaBoostClassifier(random_state=1)),
]
pipeline = Pipeline(steps=adaboost_steps)
pipeline.fit(X_train, y_train)

# Accuracy on the held-out test split
test_accuracy = pipeline.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.8044692737430168


In [8]:
# Serialize the AdaBoost pipeline into the models folder
dump(value=pipeline, filename="./../models/AdaBoostClassifier.pkl")

['./../models/AdaBoostClassifier.pkl']