In [215]:
# Libraries
import os
import sys
import pickle
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif, chi2

import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Conv1D, Reshape, Flatten

In [216]:
# Project layout, resolved relative to the kernel's working directory.
# PROJECT_DIR is the parent of the working directory, so this assumes the
# notebook is run from a direct sub-folder of the project root.
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(MODELS_PATH, 'encoders')
LOGS_PATH = os.path.join(MODELS_PATH, 'logs')
# Timestamped sub-folder of LOGS_PATH; the timestamp is taken when this cell runs.
LOG_CURRENT = os.path.join(LOGS_PATH, datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# NOTE(review): despite the name, this points at the *training* CSV, which is
# later split into train and test parts.
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 
                              'data', 'raw', 'carInsurance_train.csv')

# Text files listing feature column names, one name per line.
CATEG_PATH = os.path.join(PROJECT_DIR, 'references', 'categorical_columns.txt')
CONTI_PATH = os.path.join(PROJECT_DIR, 'references', 'continous_columns.txt')

# NOTE(review): these identifiers mention LinearSVC although this notebook
# trains a Keras CNN -- confirm whether they are leftovers from another notebook.
PROJECT_NAME = '2.1-ie-Linear-SVC-model'
MODEL_NAME = 'LinearSVC-v1.0'

In [217]:
# Put the project root first on the import path so the local `src` package can be imported
sys.path.insert(0, PROJECT_DIR)

In [218]:
# import internal function
from src.data import process_pipeline, encoder_pipeline, feature_selection_pipeline

In [219]:
# %% Helper Function
def get_content(txt_file):
    """Read a text file and return its non-empty lines, whitespace-stripped.

    Parameters
    ----------
    txt_file : str or path-like
        Path of the text file to read (one entry per line).

    Returns
    -------
    list of str
        The stripped lines in file order. Lines that are empty after
        stripping are skipped.
    """
    with open(txt_file) as file:
        stripped_lines = (line.strip() for line in file)
        # Blank lines (e.g. a trailing empty line) would otherwise come back
        # as '' entries, which are not valid column names downstream.
        return [entry for entry in stripped_lines if entry]

# Function to save a trained model
def save_model(model, model_name, folderPath):
    """Pickle `model` to ``<folderPath>/<model_name>_model.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (typically a fitted estimator).
    model_name : str
        Base name of the file; ``_model.pkl`` is appended.
    folderPath : str
        Destination folder. It is created (including parents) if missing.
    """
    # Create the destination on demand so a fresh checkout without the
    # folder does not fail with FileNotFoundError. An empty folderPath means
    # "current directory" and must not be passed to makedirs.
    if folderPath:
        os.makedirs(folderPath, exist_ok=True)
    filename = os.path.join(folderPath, f"{model_name}_model.pkl")
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Function to load a saved model
def load_model(model_name, folderPath=None):
    """Load a model previously written by `save_model`.

    Parameters
    ----------
    model_name : str
        Base name of the file; ``_model.pkl`` is appended.
    folderPath : str, optional
        Folder to look in, mirroring the `folderPath` argument of
        `save_model`. When omitted, the file is looked up relative to the
        current working directory (the original behaviour).

    Returns
    -------
    object or None
        The unpickled object, or None when the file does not exist.
    """
    filename = f"{model_name}_model.pkl"
    if folderPath:
        filename = os.path.join(folderPath, filename)

    if not os.path.exists(filename):
        return None

    # NOTE: unpickling can execute arbitrary code -- only load files that
    # were produced by a trusted source.
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Function to evaluate a model
def evaluate_model(model, model_name, X_test, y_test):
    """Score `model` on held-out data, print the accuracy and return it.

    `model` only needs a ``predict`` method; `model_name` is used solely as
    the label in the printed line.
    """
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{model_name} - Test Accuracy: {accuracy:.4f}")
    return accuracy

In [220]:
# Read the raw CSV and pass it straight through the project's process_data step
df = process_pipeline.process_data(pd.read_csv(TEST_DATA_PATH))
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance,CallDuration
0,1,32,management,single,tertiary,0,1218,1,0,1,28,jan,2,-1,0,0,1900-01-01 13:45:20,1900-01-01 13:46:30,0,70.0
1,2,32,blue-collar,married,primary,0,1156,1,0,0,26,may,5,-1,0,0,1900-01-01 14:49:03,1900-01-01 14:52:08,0,185.0
2,3,29,management,single,tertiary,0,637,1,0,1,3,jun,1,119,1,0,1900-01-01 16:30:24,1900-01-01 16:36:04,1,340.0
3,4,25,student,single,primary,0,373,1,0,1,11,may,2,-1,0,0,1900-01-01 12:06:43,1900-01-01 12:20:22,1,819.0
4,5,30,management,married,tertiary,0,2694,0,0,1,3,jun,1,-1,0,0,1900-01-01 14:35:44,1900-01-01 14:38:56,0,192.0


In [221]:
# Get the lists of categorical & continuous column names
categ = get_content(CATEG_PATH)
conti = get_content(CONTI_PATH)

In [222]:
# Guard against target leakage: the target column must not be listed as a
# feature in either column list (checked explicitly instead of only by eye).
assert 'CarInsurance' not in categ, "target column listed as a categorical feature"
assert 'CarInsurance' not in conti, "target column listed as a continuous feature"
categ

['Age',
 'Job',
 'Marital',
 'Education',
 'Default',
 'HHInsurance',
 'CarLoan',
 'Communication',
 'LastContactDay',
 'LastContactMonth',
 'NoOfContacts',
 'PrevAttempts',
 'Outcome']

In [223]:
# features: every column except the target
X = df.drop(columns=['CarInsurance'])

# target
y = df['CarInsurance']

# Split the data: 20% held out for testing, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [224]:
from sklearn.preprocessing import FunctionTransformer

# Function to select non-datetime columns
def select_non_datetime(X):
    """Return the columns of DataFrame `X` whose dtype is not datetime."""
    excluded_dtypes = ['datetime']
    return X.select_dtypes(exclude=excluded_dtypes)

def time_to_numeric(X):
    """Return a copy of DataFrame `X` with non-int columns passed through `pd.to_numeric`.

    Parameters
    ----------
    X : pandas.DataFrame
        Input columns. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; every column whose dtype is not
        'int' has been converted with `pd.to_numeric`.
    """
    # Work on a copy: a transformer must not modify its input, and here the
    # input is a slice of the caller's training/test frame.
    converted = X.copy()
    for col in converted:
        if converted[col].dtype != 'int':
            converted[col] = pd.to_numeric(converted[col])
    return converted

def data_CNN(X):
    """Append a trailing length-1 axis to `X`, e.g. shape (n, k) -> (n, k, 1)."""
    trailing_axis = -1
    return np.expand_dims(X, trailing_axis)

In [225]:
# Numeric feature processing: run the `conti` columns through time_to_numeric,
# which converts every non-int column with pd.to_numeric
numeric_transformer = make_pipeline(
    FunctionTransformer(time_to_numeric, validate=False)
)

# Categorical feature processing: encode each category as an integer code;
# categories not seen during fit are encoded as -1 instead of raising
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
categorical_transformer = make_pipeline(
    ordinal_encoder
)

# Combine both branches. No `remainder` is given, so columns that are in
# neither `conti` nor `categ` are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, conti),
        ("cat", categorical_transformer, categ)
    ]
)

In [226]:
# Fit the preprocessor once up front; the result is only used to read off the
# transformed feature shape (the full pipeline fits the preprocessor again when trained)
X_temp = preprocessor.fit_transform(X_train)

In [227]:
# Input and output sizes for the network
# Per-sample shape: everything after the first (sample) axis of the transformed data
input_shape = np.shape(X_temp)[1:]
# One output unit, matching the sigmoid / binary_crossentropy head built in create_model
model_output = 1

In [228]:
# Display the per-sample feature shape that the network will receive
input_shape

(18,)

In [229]:
def create_model(input_shape, model_output):
    """Build and compile the 1-D convolutional classifier.

    Parameters
    ----------
    input_shape : tuple
        Per-sample input shape; its first entry is used as the sequence
        length when reshaping for Conv1D.
    model_output : int
        Number of units in the sigmoid output layer.

    Returns
    -------
    A compiled Sequential model (adam optimizer, binary cross-entropy loss,
    accuracy metric).
    """
    n_features = input_shape[0]

    layer_stack = [
        # input layer
        Input(shape=input_shape),
        # Conv1D expects a channel axis: (features,) -> (features, 1)
        Reshape((n_features, 1)),
        # convolutional block
        Conv1D(64, kernel_size=2, activation='relu'),
        Conv1D(128, kernel_size=2, activation='relu'),
        Flatten(),
        Dropout(0.5),
        # fully connected block
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.5),
        # output layer
        Dense(model_output, activation='sigmoid'),
    ]

    model = Sequential(layer_stack)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

In [230]:
# call backs
def callbacks(LOGS_PATH, patience):
    """Create the training callbacks.

    Parameters
    ----------
    LOGS_PATH : str
        Directory handed to TensorBoard as `log_dir`.
    patience : int
        Patience handed to EarlyStopping, which monitors 'accuracy'.

    Returns
    -------
    list
        ``[early_stopping_callback, tensorboard_callback]`` in that order.
    """
    board = TensorBoard(log_dir=LOGS_PATH, histogram_freq=1)
    stopper = EarlyStopping(monitor='accuracy', patience=patience)
    return [stopper, board]

In [231]:
# callbacks: log into the timestamped LOG_CURRENT folder so TensorBoard events
# from different runs are not all written into the shared LOGS_PATH root.
# NOTE(review): patience=100 is larger than the epochs=10 used for training,
# so early stopping can never trigger -- confirm this is intended.
early_callback, tensorboard_callbacks = callbacks(LOG_CURRENT, patience=100)

In [232]:
# Wrap the Keras model so it can act as the final step of a scikit-learn pipeline.
# The lambda closes over input_shape / model_output so the wrapper can build
# the model without being passed any arguments.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is a deprecated wrapper
# (the output below looks like its deprecation warning) -- plan a migration.
clf = KerasClassifier(
    build_fn=lambda : create_model(input_shape, model_output),
    epochs=10,
    batch_size=32,
    callbacks=[early_callback, tensorboard_callbacks],
    verbose=0
)

  clf = KerasClassifier(


In [233]:
# Build a throwaway model through build_fn just to print its architecture
clf.build_fn().summary()

Model: "sequential_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape_22 (Reshape)        (None, 18, 1)             0         
                                                                 
 conv1d_48 (Conv1D)          (None, 17, 64)            192       
                                                                 
 conv1d_49 (Conv1D)          (None, 16, 128)           16512     
                                                                 
 flatten_22 (Flatten)        (None, 2048)              0         
                                                                 
 dropout_44 (Dropout)        (None, 2048)              0         
                                                                 
 dense_42 (Dense)            (None, 128)               262272    
                                                                 
 batch_normalization_16 (Bat  (None, 128)            

In [234]:
# End-to-end pipeline: column preprocessing -> standardisation ->
# add a trailing channel axis (data_CNN) -> Keras classifier
pipeline = make_pipeline(
    preprocessor,
    StandardScaler(),
    FunctionTransformer(data_CNN, validate=False),
    clf
)

In [235]:
# model training: fits every preprocessing step and the classifier on the training split
pipeline.fit(X_train, y_train)

In [236]:
# Accuracy of the fitted pipeline on the held-out test split
evaluate_model(pipeline, 'keras', X_test, y_test)

keras - Test Accuracy: 0.8225


0.8225