In [92]:
# Libraries
import os
import sys
import pickle
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif, chi2

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

In [93]:
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(MODELS_PATH, 'encoders')
LOGS_PATH = os.path.join(MODELS_PATH, 'logs')
LOG_CURRENT = os.path.join(LOGS_PATH, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 
                              'data', 'raw', 'carInsurance_train.csv')

CATEG_PATH = os.path.join(PROJECT_DIR, 'references', 'categorical_columns.txt')
CONTI_PATH = os.path.join(PROJECT_DIR, 'references', 'continous_columns.txt')

PROJECT_NAME = '2.1-ie-Linear-SVC-model'
MODEL_NAME = 'LinearSVC-v1.0'

In [94]:
# adding system path
sys.path.insert(0, PROJECT_DIR)

In [95]:
# import internal function
from src.data import process_pipeline, encoder_pipeline, feature_selection_pipeline

In [109]:
# %% Helper Function
def get_content(txt_file):
    contents = []
    with open(txt_file) as file:
        for line in file:
            contents.append(line.strip())
            
    return contents

# Function to save a trained model
def save_model(model, model_name, folderPath):
    filename = os.path.join(folderPath, f"{model_name}_model.pkl")
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Function to load a saved model
def load_model(model_name):
    filename = f"{model_name}_model.pkl"
    if os.path.exists(filename):
        with open(filename, 'rb') as file:
            return pickle.load(file)
    else:
        return None

# Function to evaluate a model
def evaluate_model(model, model_name, X_test, y_test):
    y_pred = model.predict(X_test)

    # DL model return float type
    if y_pred.dtype != 'int':
        y_pred = y_pred.astype('int')
        
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_name} - Test Accuracy: {accuracy:.4f}")
    return accuracy

In [97]:
df = pd.read_csv(TEST_DATA_PATH)
df = process_pipeline.process_data(df)
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance,CallDuration
0,1,32,management,single,tertiary,0,1218,1,0,1,28,jan,2,-1,0,0,1900-01-01 13:45:20,1900-01-01 13:46:30,0,70.0
1,2,32,blue-collar,married,primary,0,1156,1,0,0,26,may,5,-1,0,0,1900-01-01 14:49:03,1900-01-01 14:52:08,0,185.0
2,3,29,management,single,tertiary,0,637,1,0,1,3,jun,1,119,1,0,1900-01-01 16:30:24,1900-01-01 16:36:04,1,340.0
3,4,25,student,single,primary,0,373,1,0,1,11,may,2,-1,0,0,1900-01-01 12:06:43,1900-01-01 12:20:22,1,819.0
4,5,30,management,married,tertiary,0,2694,0,0,1,3,jun,1,-1,0,0,1900-01-01 14:35:44,1900-01-01 14:38:56,0,192.0


In [98]:
# Get list of categorical & continous variable
categ = get_content(CATEG_PATH)
conti = get_content(CONTI_PATH)

In [99]:
# making sure there's no target
categ

['Age',
 'Job',
 'Marital',
 'Education',
 'Default',
 'HHInsurance',
 'CarLoan',
 'Communication',
 'LastContactDay',
 'LastContactMonth',
 'NoOfContacts',
 'PrevAttempts',
 'Outcome']

In [100]:
# label
X = df.drop(columns=['CarInsurance'])

#target
y = df['CarInsurance']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [101]:
from sklearn.preprocessing import FunctionTransformer

# Function to select non-datetime columns
def select_non_datetime(X):
    return X.select_dtypes(exclude='datetime')

def time_to_numeric(X):
    for col in X:
        if X[col].dtype != 'int':
            X[col] = pd.to_numeric(X[col])
    return X

In [102]:
# Numeric feature processing
numeric_transformer = make_pipeline(
    FunctionTransformer(time_to_numeric, validate=False)
)

# Categorical feature processing
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_transformer = make_pipeline(ordinal_encoder)

# combine pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, conti),
        ("cat", categorical_transformer, categ)
    ]
)

In [103]:
scaler_process = make_pipeline(preprocessor, StandardScaler())

In [104]:
x_train = scaler_process.fit_transform(X_train)
x_test = scaler_process.transform(X_test)

In [105]:
# create a input and output shape
input_shape = np.shape(x_train)[1:]
model_output = 1 # len(np.unique(y_train, axis=0)) # alternative len(df.target.unique)

In [106]:
input_shape

(18,)

In [107]:
model = Sequential()
model.add(Input(shape=input_shape))

model.add(Dense(2048, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(4096, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Dense(model_output, activation='sigmoid'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(
    x_train, y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test, y_test)
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x17712870550>

In [110]:
evaluate_model(model, 'manual_keras', x_test, y_test)

float32
manual_keras - Test Accuracy: 0.8163


0.81625

In [None]:
def create_model(input_shape, model_output):
    model = Sequential()
    # input layer
    model.add(Input(shape=input_shape))

    
    # layer 1
    model.add(Dense(2048, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    # layer 2
    model.add(Dense(4096, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # output
    model.add(Dense(model_output, activation='sigmoid'))
    model.compile(optimizer='adam', 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'],
                 )
    return model

In [None]:
# call backs
def callbacks(LOGS_PATH, patience):
    tensorboard_callbacks = TensorBoard(log_dir=LOGS_PATH, histogram_freq=1)
    early_callback = EarlyStopping(monitor='accuracy', patience=patience)
    return [early_callback, tensorboard_callbacks]

In [None]:
# callbacks
early_callback, tensorboard_callbacks = callbacks(LOGS_PATH, patience=100)

In [None]:
clf = KerasClassifier(
    build_fn=lambda : create_model(input_shape, model_output),
    epochs=10,
    batch_size=32,
    callbacks=[early_callback, tensorboard_callbacks],
    verbose=0
)

In [None]:
clf.build_fn().summary()

In [None]:
pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    clf
)

In [None]:
# model training
pipeline.fit(X_train, y_train)

In [None]:
evaluate_model(pipeline, 'keras', X_test, y_test)