Case Management System

In [1]:
# load dependencies
import os
# Working directory for the notebook: the relative paths used below (the input
# workbooks and the ./API/... output files) are resolved against it.
# Set the CASE_MGMT_DIR environment variable to run on another machine without
# editing the notebook; the original location is kept as the default.
path = os.environ.get("CASE_MGMT_DIR", "C:/Users/KganoM/Downloads/doc")
os.chdir(path)

import warnings
warnings.filterwarnings("ignore") 

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn import tree
from matplotlib import pyplot as plt
from joblib import dump

In [2]:
# read the two source workbooks (cases first, then clients) from the working directory
cases_table, clients_table = (
    pd.read_excel(workbook)
    for workbook in ("Cases_Table.xlsx", "Clients_Table.xlsx")
)

In [None]:
# 1. Data Preprocessing

# merge and sort data

# inner join: only cases whose client_ID also appears in the clients table are kept
df = pd.merge(cases_table, clients_table, on='client_ID', how='inner')
df = df.drop_duplicates()

df['resolution_time'] = pd.to_datetime(df['resolution_time'], format='%d-%m-%Y %H:%M:%S')
df = df.sort_values(by='resolution_time')   # sort by date in order to perform a time-based train-test split

# A notebook cell only renders its final expression, so the inspection results
# are printed explicitly; as bare expressions the shape and preview were discarded.
print(df.shape)   # view dimensions of dataset
print(df.head())  # preview the dataset
df.info()         # view summary of the dataset (info() prints its own report)

# handle missing values

# prediction input variables: case_type, age, risk_level, previous_cases
input_columns = ['case_type', 'age', 'risk_level', 'previous_cases']

# count missing values per input column and print them; a bare expression in the
# middle of a cell is never displayed, so the check was previously invisible
missing_per_input = df[input_columns].isnull().sum()
print(missing_per_input)
# -- the original analysis reported no missing values in these inputs, so no rows are excluded or imputed;
# -- re-check the printed counts whenever the source data changes

# categorical encoding

# Add one 1/0 indicator column per categorical input: 1 when the value equals the
# reference category ('civil' / 'Y'), 0 otherwise. A single 1/0 column is a good
# fit when a categorical input has only a few levels.
df = df.assign(
    case_type_dummy=np.where(df['case_type'] == 'civil', 1, 0),
    previous_cases_dummy=np.where(df['previous_cases'] == 'Y', 1, 0),
)

# export detail model data
df.to_excel(
    './API/MLDeployment/data/model_data.xlsx',
    sheet_name='Sheet1',
    index=False,
)

# normalize/scale features 
# -- this technique is not required for decision tree models. It is commonly used when training a neural network for better performance

In [None]:
# 2. Model Selection

# Time-based split: df is sorted by resolution_time, so the first 70% of rows
# (earliest cases) form the train set and the remaining 30% form the test set.
# A random train_test_split is intentionally not used here.
# Positional .iloc slicing replaces np.split(df, ...), which relies on the
# deprecated DataFrame.swapaxes (the warning is hidden by the global warning
# filter) and is not guaranteed to keep returning DataFrames.
split_at = int(.70 * len(df))
train_set, test_set = df.iloc[:split_at], df.iloc[split_at:]

# model inputs, listed once so the train and test matrices cannot drift apart
feature_columns = ['case_type_dummy', 'age', 'risk_level', 'previous_cases_dummy']

X_train = train_set.loc[:, feature_columns]
X_test = test_set.loc[:, feature_columns]

y_train = train_set['outcome']
y_test = test_set['outcome']

print(X_train.shape, X_test.shape)  # check shape of the split data (printed: mid-cell expressions are not displayed)

model = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)  # model predictions on the train set
y_test_pred = model.predict(X_test)    # model predictions on the test set

In [None]:
# 3. Model Evaluation

def _classification_metrics(y_true, y_pred):
    """Return accuracy and macro-averaged precision, recall and F1 for one split.

    Macro averaging is used on purpose: with average='micro' on single-label
    predictions, precision, recall and F1 are all identical to accuracy, and
    pos_label is ignored unless average='binary', so the previous settings
    reported the same number four times.

    Parameters
    ----------
    y_true : array-like of true labels
    y_pred : array-like of predicted labels, aligned with y_true

    Returns
    -------
    dict mapping 'accuracy', 'precision', 'recall' and 'f1' to their scores
    """
    # NOTE(review): if 'outcome' is strictly binary with a 'positive' label,
    # average='binary', pos_label='positive' would report that class only.
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='macro'),
        'recall': recall_score(y_true, y_pred, average='macro'),
        'f1': f1_score(y_true, y_pred, average='macro'),
    }


train_metrics = _classification_metrics(y_train, y_train_pred)
test_metrics = _classification_metrics(y_test, y_test_pred)

# keep the individual variables available for any later cell that reads them
accuracy_train = train_metrics['accuracy']
precision_train = train_metrics['precision']
recall_train = train_metrics['recall']
f1_train = train_metrics['f1']

accuracy_test = test_metrics['accuracy']
precision_test = test_metrics['precision']
recall_test = test_metrics['recall']
f1_test = test_metrics['f1']

# label every value with its split and metric so the output is unambiguous
for split_name, split_metrics in (('Train', train_metrics), ('Test', test_metrics)):
    for metric_name, metric_value in split_metrics.items():
        print(f'{split_name} {metric_name}:', metric_value)

# -- similar scores on the train and test sets indicate that the model generalizes well

# save the model for deployment as an API
dump(value=model, filename='./API/MLDeployment/data/model.joblib')

In [None]:
# 4. Feature Importance Analysis

# -- Decision trees, such as Classification and Regression Trees (CART),
# -- calculate feature importance based on the reduction in a criterion (e.g., Gini impurity or entropy) used to select split points

importance = model.feature_importances_
feature_names = list(X_train.columns)  # the columns the model was fitted on, in the same order

# report each score against its column name instead of a positional index
for feature_name, score in zip(feature_names, importance):
    print(f'Feature: {feature_name}, Score: {score:.5f}')

# labelled bar chart so the figure can be read on its own
fig, ax = plt.subplots()
ax.bar(feature_names, importance)
ax.set_xlabel('Feature')
ax.set_ylabel('Importance')
ax.set_title('Decision tree feature importance')
plt.show()

In [None]:
# 5. Deploy the model as an API using Flask or FastAPI. 

# -- open vs code
# -- cd C:\Users\KganoM\Downloads\doc\API
# -- organize app.py file with python method and index.html file for ML model deployment as an API
# -- pip install virtualenv
# -- python -m venv myenv
# -- Set-ExecutionPolicy Unrestricted -Scope Process  (optional)
# -- myenv\Scripts\activate
# -- pip install flask pandas scikit-learn   (package names are separated by spaces, not commas)
# -- cd MLDeployment
# -- Run the project using 'python app.py' and navigate to 127.0.0.1:5000 in your browser.