In [1]:
# Libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_classif, chi2

In [2]:
# Directory layout: the project root is the parent of the current working
# directory, and every other location is derived from it.
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)

# Trained artefacts: models, encoders and fitted pipelines.
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(PROJECT_DIR, 'models', 'encoders')
PIPELINES_PATH = os.path.join(PROJECT_DIR, 'models', 'pipelines')

# Raw input CSV. Note that the constant is named TEST_* but the file it
# points at is 'carInsurance_train.csv'.
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 'data', 'raw', 'carInsurance_train.csv')

# Reference text files, read line by line with get_content().
CATEG_PATH, CONTI_PATH = (
    os.path.join(PROJECT_DIR, 'references', fname)
    for fname in ('categorical_columns.txt', 'continous_columns.txt')
)

# Name used for the saved model, and the path stem (no extension) of the
# previously saved data pipeline.
MODEL_NAME = 'LinearSVC-v1.0'
DATA_PIPELINE_V1 = os.path.join(PROJECT_DIR, 'models', 'pipelines', 'data-pipeline-v1.0')

In [3]:
# Put the project root at the front of the import path so that the `src`
# package can be imported from this notebook. The membership check keeps the
# cell idempotent: re-running it does not keep prepending duplicate entries.
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

In [4]:
# import internal function
from src.data import process_pipeline, encoder_pipeline, feature_selection_pipeline

In [5]:
# %% Helper Function
def get_content(txt_file):
    """Read a text file and return its lines as a list of stripped strings.

    Parameters
    ----------
    txt_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        One entry per line, in file order, with leading and trailing
        whitespace removed. Blank lines are kept as empty strings.
    """
    with open(txt_file) as handle:
        return [row.strip() for row in handle]
    
# Function to save a trained model
def save_model(model, model_name, folderPath):
    """Pickle ``model`` to ``<folderPath>/<model_name>.pkl``.

    Parameters
    ----------
    model : object
        Any picklable object (e.g. a fitted estimator or search object).
    model_name : str
        File stem; the ``.pkl`` extension is appended automatically.
    folderPath : str
        Destination folder. It is created (including parents) if it does
        not exist yet.

    Returns
    -------
    None
    """
    # Create the destination first so that a fresh checkout without the
    # folder does not fail on open(). An empty folderPath means "current
    # directory" for os.path.join, so there is nothing to create then.
    if folderPath:
        os.makedirs(folderPath, exist_ok=True)

    filename = os.path.join(folderPath, f"{model_name}.pkl")
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Function to load a saved model
def load_model(model_name, folderPath=None):
    """Load a pickled object from ``<model_name>.pkl``.

    Parameters
    ----------
    model_name : str
        File stem, without the ``.pkl`` extension. It may itself contain
        directory components (e.g. a full path stem).
    folderPath : str, optional
        Folder to look in, mirroring ``save_model``. When omitted,
        ``model_name`` alone determines the location.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when the file does not exist.
        Callers must check for ``None`` before using the result.
    """
    filename = f"{model_name}.pkl"
    if folderPath is not None:
        filename = os.path.join(folderPath, filename)

    if not os.path.exists(filename):
        return None

    # NOTE: unpickling can execute arbitrary code; only load files that were
    # produced by this project / a trusted source.
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Function to evaluate a model
def evaluate_model(model, model_name, X_test, y_test):
    """Print and return the accuracy of ``model`` on the given test data.

    Parameters
    ----------
    model : object
        Fitted object exposing ``predict``.
    model_name : str
        Label used in the printed message.
    X_test, y_test
        Test features and the matching true labels.

    Returns
    -------
    The value returned by ``accuracy_score(y_test, predictions)``.
    """
    accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{model_name} - Test Accuracy: {accuracy:.4f}")
    return accuracy

In [6]:
# Read the raw CSV and pass it straight through the project's processing step.
df = process_pipeline.process_data(pd.read_csv(TEST_DATA_PATH))
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,...,AgeGroup,negativeBalance,BalanceFlag,HasCommuncation,SinLastContactMonth,CosLastContactMonth,CallDuration,CallCategory,CallFlag,Outcome_Simplify
0,1,32,management,single,tertiary,0,1218,1,0,telephone,...,30-39,0,0,1,0.5,0.866025,70.0,Afternoon,0,0
1,2,32,blue-collar,married,primary,0,1156,1,0,No Communication,...,30-39,0,0,0,0.5,-0.866025,185.0,Afternoon,1,0
2,3,29,management,single,tertiary,0,637,1,0,cellular,...,18-29,0,0,1,1.224647e-16,-1.0,340.0,Afternoon,1,0
3,4,25,student,single,primary,0,373,1,0,cellular,...,18-29,0,0,1,0.5,-0.866025,819.0,Afternoon,1,0
4,5,30,management,married,tertiary,0,2694,0,0,cellular,...,30-39,0,1,1,1.224647e-16,-1.0,192.0,Afternoon,1,0


In [7]:
# Show column dtypes and non-null counts of the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Id                   4000 non-null   int64   
 1   Age                  4000 non-null   int64   
 2   Job                  4000 non-null   category
 3   Marital              4000 non-null   category
 4   Education            4000 non-null   category
 5   Default              4000 non-null   int64   
 6   Balance              4000 non-null   int64   
 7   HHInsurance          4000 non-null   int64   
 8   CarLoan              4000 non-null   int64   
 9   Communication        4000 non-null   category
 10  LastContactDay       4000 non-null   int64   
 11  LastContactMonth     4000 non-null   int64   
 12  NoOfContacts         4000 non-null   int64   
 13  DaysPassed           4000 non-null   int64   
 14  PrevAttempts         4000 non-null   int64   
 15  Outcome              

In [8]:
# Read the lists of categorical and continuous variables from the reference files
categ, conti = (get_content(path) for path in (CATEG_PATH, CONTI_PATH))

In [9]:
# Features: every column except the target
X = df.drop(columns=['CarInsurance'])

# Load the previously saved data pipeline. load_model returns None when the
# pickle file is missing, so fail here with a clear message instead of an
# AttributeError on `None.transform` further down.
data_pipeline = load_model(DATA_PIPELINE_V1)
if data_pipeline is None:
    raise FileNotFoundError(
        f"Data pipeline not found: {DATA_PIPELINE_V1}.pkl"
    )

# Re-wrap the transformed values in a DataFrame with the original column names.
# NOTE(review): assumes transform() keeps the number and order of columns —
# confirm against the saved pipeline.
X = pd.DataFrame(data_pipeline.transform(X), columns=X.columns.tolist())

# Target
y = df['CarInsurance']

# Split the data (80/20, fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
# Initialize a dictionary to store model performances
model_performances = {}

# Model 1: SVC
# random_state is pinned because LinearSVC's dual solver shuffles the data
# with a pseudo-random generator; without a seed, results can vary between
# runs whenever the dual formulation is selected.
clf = LinearSVC(
    dual='auto',
    random_state=0
)

# Univariate feature selection (library defaults) followed by the classifier.
pipeline_clf = make_pipeline(
    SelectKBest(),
    clf
)

pipeline_clf.fit(X_train, y_train)

# Names of the features kept by the selector (displayed as the cell output).
pipeline_clf.named_steps['selectkbest'].get_feature_names_out()

array(['Education', 'Communication', 'LastContactDay', 'Outcome',
       'CallEnd', 'AgeGroup', 'negativeBalance', 'BalanceFlag',
       'CallDuration', 'CallCategory'], dtype=object)

In [11]:
# Hyperparameter grid for GridSearchCV
# C uses np.linspace rather than np.arange with a float step: with a
# non-integer step the number of elements produced by arange is not reliable,
# whereas linspace guarantees exactly 8 points from 0.1 to 0.8.
param_grid = {
    'linearsvc__C': np.linspace(0.1, 0.8, num=8),
    'linearsvc__loss': ['hinge', 'squared_hinge'],
    'linearsvc__intercept_scaling': np.arange(1, 9, 1),
}

# Perform GridSearchCV for SVC
# error_score='raise' surfaces failing parameter combinations immediately
# instead of silently scoring them as NaN.
grid_search = GridSearchCV(
    pipeline_clf, 
    param_grid, 
    cv=5, 
    scoring='accuracy',
    error_score='raise',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Save the trained model (the whole fitted GridSearchCV object is pickled)
save_model(grid_search, MODEL_NAME, MODELS_PATH)

# Evaluate the model on the test set
accuracy = evaluate_model(grid_search, 'LinearSVC', X_test, y_test)

# Track the performance in the dictionary
# 'best_accuracy' is the cross-validation score of the best parameter set;
# 'test_accuracy' is the accuracy on the held-out split.
model_performances['LinearSVC'] = {
    'best_params': grid_search.best_params_,
    'best_accuracy': grid_search.best_score_,
    'test_accuracy': accuracy
}

LinearSVC - Test Accuracy: 0.8075
