In [14]:
# Libraries
import os
import sys
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import SelectPercentile, f_classif, chi2

In [15]:
# Project layout, resolved relative to the directory the notebook runs from
CURRENT_DIR = os.getcwd()
PROJECT_DIR = os.path.dirname(CURRENT_DIR)

# Where trained models and fitted encoders are stored
MODELS_PATH = os.path.join(PROJECT_DIR, 'models')
ENCODER_PATH = os.path.join(PROJECT_DIR, 'models', 'encoders')

# NOTE(review): despite the TEST_ prefix, this points at the *training* CSV
TEST_DATA_PATH = os.path.join(PROJECT_DIR, 'data', 'raw', 'carInsurance_train.csv')

# Text files read line by line to get the categorical / continuous column names
CATEG_PATH = os.path.join(PROJECT_DIR, 'references', 'categorical_columns.txt')
CONTI_PATH = os.path.join(PROJECT_DIR, 'references', 'continous_columns.txt')

# Identifiers for this experiment and the model artefact it produces
PROJECT_NAME = '2.1-ie-Linear-SVC-model'
MODEL_NAME = 'LinearSVC-v1.0'

In [16]:
# Put the project root first on the import path so the `src` package resolves.
# Guarded so that re-running this cell does not stack duplicate entries.
if sys.path[:1] != [PROJECT_DIR]:
    sys.path.insert(0, PROJECT_DIR)

In [17]:
# import internal function
from src.data import process_pipeline, encoder_pipeline, feature_selection_pipeline

In [18]:
# %% Helper Function
def get_content(txt_file):
    """Read a text file and return its non-empty lines.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping (for example a trailing blank line) are skipped, so they
    never end up as an empty-string entry in the result.

    Parameters
    ----------
    txt_file : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The stripped, non-empty lines in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    with open(txt_file) as file:
        stripped = (line.strip() for line in file)
        # Drop blanks: an empty string is not a usable entry downstream
        return [entry for entry in stripped if entry]

In [19]:
# Read the CSV and pass it straight through the project's processing step
df = process_pipeline.process_data(pd.read_csv(TEST_DATA_PATH))
df.head()

Unnamed: 0,Id,Age,Job,Marital,Education,Default,Balance,HHInsurance,CarLoan,Communication,LastContactDay,LastContactMonth,NoOfContacts,DaysPassed,PrevAttempts,Outcome,CallStart,CallEnd,CarInsurance,CallDuration
0,1,32,management,single,tertiary,0,1218,1,0,1,28,jan,2,-1,0,0,1900-01-01 13:45:20,1900-01-01 13:46:30,0,70.0
1,2,32,blue-collar,married,primary,0,1156,1,0,0,26,may,5,-1,0,0,1900-01-01 14:49:03,1900-01-01 14:52:08,0,185.0
2,3,29,management,single,tertiary,0,637,1,0,1,3,jun,1,119,1,0,1900-01-01 16:30:24,1900-01-01 16:36:04,1,340.0
3,4,25,student,single,primary,0,373,1,0,1,11,may,2,-1,0,0,1900-01-01 12:06:43,1900-01-01 12:20:22,1,819.0
4,5,30,management,married,tertiary,0,2694,0,0,1,3,jun,1,-1,0,0,1900-01-01 14:35:44,1900-01-01 14:38:56,0,192.0


In [20]:
# Column-name lists for the categorical and continuous variables
categ, conti = (get_content(path) for path in (CATEG_PATH, CONTI_PATH))

In [21]:
# Target: the column to predict
y = df['CarInsurance']

# Features: every remaining column
X = df.drop(columns=['CarInsurance'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
from sklearn.preprocessing import FunctionTransformer

# Column filter used at the start of the numeric pipeline
def select_non_datetime(X):
    """Return ``X`` without its datetime-typed columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Input frame.

    Returns
    -------
    pandas.DataFrame
        The columns of ``X`` whose dtype is not a datetime dtype.
    """
    excluded_dtypes = ['datetime']
    return X.select_dtypes(exclude=excluded_dtypes)

In [23]:
# --- Continuous features ---
# Drop datetime columns, then keep the top 50% of features ranked by f_classif
numeric_filter = SelectPercentile(score_func=f_classif, percentile=50)
numeric_transformer = make_pipeline(
    FunctionTransformer(func=select_non_datetime, validate=False),
    numeric_filter,
)

# --- Categorical features ---
# Encode categories as ordinals (categories unseen at fit time become -1),
# then keep the top 50% of features ranked by chi2
ordinal_encoder = OrdinalEncoder(
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
categorical_filter = SelectPercentile(score_func=chi2, percentile=50)
categorical_transformer = make_pipeline(ordinal_encoder, categorical_filter)

# --- Combined preprocessing ---
# Route each column list to its own sub-pipeline
column_routes = [
    ("num", numeric_transformer, conti),
    ("cat", categorical_transformer, categ),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [24]:
# Function to save a trained model
def save_model(model, model_name, folderPath):
    """Pickle ``model`` to ``<folderPath>/<model_name>_model.pkl``.

    The target folder is created (including missing parents) when it does
    not exist yet, so saving into a fresh checkout does not fail.

    Parameters
    ----------
    model : object
        Any picklable object, e.g. a fitted estimator.
    model_name : str
        Base name of the file; ``_model.pkl`` is appended.
    folderPath : str
        Directory the pickle file is written to.

    Raises
    ------
    OSError
        If the folder cannot be created or the file cannot be written.
    """
    # An empty folderPath means "current directory"; os.makedirs('') would raise
    if folderPath:
        os.makedirs(folderPath, exist_ok=True)

    filename = os.path.join(folderPath, f"{model_name}_model.pkl")
    with open(filename, 'wb') as file:
        pickle.dump(model, file)

# Function to load a saved model
def load_model(model_name, folderPath=None):
    """Load a model pickled as ``<model_name>_model.pkl``.

    Parameters
    ----------
    model_name : str
        Base name of the file; ``_model.pkl`` is appended.
    folderPath : str, optional
        Directory containing the pickle file. When omitted, the file is
        looked up relative to the current working directory.

    Returns
    -------
    object or None
        The unpickled object, or ``None`` when no such file exists.
    """
    filename = f"{model_name}_model.pkl"
    if folderPath:
        filename = os.path.join(folderPath, filename)

    if not os.path.isfile(filename):
        return None

    # NOTE: unpickling can execute arbitrary code; only load files you trust
    with open(filename, 'rb') as file:
        return pickle.load(file)

# Function to evaluate a model
def evaluate_model(model, model_name, X_test, y_test):
    """Score ``model`` on held-out data, print the accuracy and return it.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(X_test)``.
    model_name : str
        Label used in the printed message.
    X_test : array-like
        Features passed to ``model.predict``.
    y_test : array-like
        True labels compared against the predictions.

    Returns
    -------
    float
        The value returned by ``accuracy_score`` for the predictions.
    """
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    print(f"{model_name} - Test Accuracy: {test_accuracy:.4f}")
    return test_accuracy

In [25]:
# Initialize a dictionary to store model performances (keyed by model label)
model_performances = {}

# Model 1: SVC
# random_state is pinned so that solver paths which shuffle the data give the
# same result on every run; 0 matches the seed used for the train/test split.
clf = LinearSVC(
    dual='auto',
    random_state=0
)
pipeline_clf = make_pipeline(preprocessor, clf)

# Hyperparameter grid for GridSearchCV.
# make_pipeline names the step after the lower-cased class name, hence the
# 'linearsvc__' prefix.
param_grid = {
    'linearsvc__C': [0.1, 1, 10],
    'linearsvc__multi_class': ['ovr', 'crammer_singer'],
    'linearsvc__intercept_scaling': [1, 2, 3]
}

# Perform GridSearchCV for SVC: 5-fold CV scored on accuracy, using all cores.
# error_score='raise' makes a failing parameter combination abort the search
# instead of being recorded as a missing score.
grid_search = GridSearchCV(
    pipeline_clf, 
    param_grid, 
    cv=5, 
    scoring='accuracy',
    error_score='raise',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

# Save the trained model (the whole fitted search object is pickled)
save_model(grid_search, MODEL_NAME, MODELS_PATH)

# Evaluate the model on the test set
accuracy = evaluate_model(grid_search, 'LinearSVC', X_test, y_test)

# Track the performance in the dictionary:
# 'best_accuracy' is the cross-validation score of the best parameter set,
# 'test_accuracy' is the score on the held-out test split.
model_performances['LinearSVC'] = {
    'best_params': grid_search.best_params_,
    'best_accuracy': grid_search.best_score_,
    'test_accuracy': accuracy
}

LinearSVC - Test Accuracy: 0.7937
