# Model Development
This notebook trains several classifier models and serializes them for later use. The serialized models are used in a demonstration of calling a model via HTTPS.<br>
By: Jonathan Lo<br>
Date: 8/14/23

## Overhead

In [1]:
# Imports
import numpy as np
import pandas as pd
import os
from joblib import dump, load

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

from keras.models import Sequential
from keras.layers import Dense
import tensorflow as tf

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from util import *

In [2]:
# Ensure the models folder exists before any model is serialized into it.
# exist_ok=True makes this a single idempotent call: there is no gap between
# an existence check and the creation, and re-running the cell never raises.
os.makedirs(r'./../models', exist_ok=True)

In [3]:
# Read the raw passenger records
titanic_df = pd.read_csv("titanic.csv")

# Discard the columns that are not used as model features
titanic_df = titanic_df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# Column groups consumed by the preprocessing pipeline
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
categorical_cols = ['Sex', 'Embarked']

# Separate the target column from the features
y = titanic_df['Survived']
X = titanic_df.drop(columns='Survived')

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Preprocessing shared by the scikit-learn models below.
# Numerical columns: fill missing values with the column mean.
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer
preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numerical_transformer, numerical_cols),
])

## Random Forest

In [5]:
# Random forest: chain the shared preprocessing with the classifier and fit
# the whole thing on the training split
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=1)),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split
test_accuracy = pipeline.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.8044692737430168


In [6]:
# Write the fitted random-forest pipeline to disk with joblib
dump(value=pipeline, filename="./../models/RandomForestClassifier.pkl")

['./../models/RandomForestClassifier.pkl']

## AdaBoost

In [7]:
# AdaBoost: same preprocessing step, boosted classifier on top
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', AdaBoostClassifier(random_state=1)),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split
test_accuracy = pipeline.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Test set accuracy: 0.8044692737430168


In [8]:
# Write the fitted AdaBoost pipeline to disk with joblib
dump(value=pipeline, filename="./../models/AdaBoostClassifier.pkl")

['./../models/AdaBoostClassifier.pkl']

## Support Vector Machine

In [9]:
# Support vector classifier (C=19) behind the shared preprocessing step
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC(C=19, random_state=13)),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split
test_accuracy = pipeline.score(X_test, y_test)
print(test_accuracy)

0.8156424581005587


In [10]:
# Write the fitted support-vector pipeline to disk with joblib
dump(value=pipeline, filename="./../models/SupportVectorClassifier.pkl")

['./../models/SupportVectorClassifier.pkl']

## Gaussian Naive Bayes

In [11]:
# Gaussian Naive Bayes behind the shared preprocessing step
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', GaussianNB()),
    ]
).fit(X_train, y_train)

# Mean accuracy on the held-out split
test_accuracy = pipeline.score(X_test, y_test)
print(test_accuracy)

0.776536312849162


In [12]:
# Write the fitted Gaussian Naive Bayes pipeline to disk with joblib
dump(value=pipeline, filename="./../models/GaussianNaiveBayes.pkl")

['./../models/GaussianNaiveBayes.pkl']

## Gradient Boost Classifier

## Neural Network (Keras)

In [13]:

# Seed Python, NumPy and TensorFlow so that weight initialisation and batch
# shuffling are repeatable when the notebook is re-run from a fresh kernel.
# (Seeding alone may not give bit-identical results on every backend/device.)
tf.keras.utils.set_random_seed(42)

# Load the Titanic dataset
titanic_data = pd.read_csv("titanic.csv")

# Preprocess the data
# NOTE(review): preproccess_nn is not defined in this notebook; presumably it
# comes from `from util import *` — confirm what columns/encoding it returns.
titanic_data = preproccess_nn(titanic_data)

# Split the data into features (X) and target (y)
# These reuse (and overwrite) the X / y / train / test names of the
# scikit-learn section above.
X = titanic_data.drop("Survived", axis=1)
y = titanic_data["Survived"]

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Create a simple neural network model: 32 -> 16 -> 1 units, with a sigmoid
# output producing a value in (0, 1) per passenger
model = Sequential()
model.add(Dense(32, activation="relu", input_dim=X_train.shape[1]))
model.add(Dense(16, activation="sigmoid"))
model.add(Dense(1, activation="sigmoid"))

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model (verbose=0 keeps per-epoch logs out of the notebook output)
model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)

# Make predictions: threshold the sigmoid output at 0.5 to get class labels
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8156424581005587


In [14]:
# Serialize
# The network was trained on StandardScaler output, so the fitted scaler is
# saved next to the model; without it the training-time scaling cannot be
# reapplied to new inputs.
dump(scaler, "./../models/KerasFNNScaler.pkl")

# NOTE(review): the Keras model is pickled through joblib here; Keras also has
# a native model.save() format. Path and format are left unchanged on purpose.
dump(model, "./../models/KerasFNN.pkl")

['./../models/KerasFNN.pkl']