# Example From Pre-processing to Prediction

In [1]:
### General Packages ###
import pandas as pd
import datetime
import altair as alt

### For Model Exporting ###
from joblib import dump, load

### Metrics for Evaluation ###
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score

### For Board Modelling ###
from sklearn.svm import SVC

### Ticket triage functions ###
import sys
sys.path.append("src/Auxiliary/")
sys.path.append("src/Cleaning/")
sys.path.append("src/Model/")
sys.path.append("src/Tokenizer/")

### For pre-processing ###
import ticket_cleaner
import bert_tokenizer

### For Board Modelling ###
import board

### For Severity Modelling ###
import severity

### For Impact Modelling ###
import impact

### For Prs, Modified Accuracy Score ###
import model_functions
import numpy as np

triage_metric = make_scorer(model_functions.modified_accuracy_score, greater_is_better=True)

Compute engine used:  cuda:0


# Import Data

In [2]:
train_set = pd.read_excel("./Data/Tickets with Classifications.xlsx")

# Pre-processing

In [3]:
data = ticket_cleaner.clean_tickets(ticketNbr = train_set.ticketNbr, contact_name = train_set.contact_name, company_name = train_set.company_name, Summary = train_set.Summary, Initial_Description = train_set.Initial_Description, Impact = train_set.SR_Impact_RecID, Severity = train_set.SR_Severity_RecID, Board = train_set.SR_Board_RecID, Source = train_set.Source, date_entered = train_set.date_entered)

# Board Prediction

In [4]:
# Replace this with roBERTa
#Imports
from bert_tokenizer import BERT_Tokenizer
import transformers as ppb
from transformers import DistilBertModel, DistilBertTokenizer

#Loading pre-trained models
max_length = 100
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

#Tokenizers
train, test = train_test_split(data,
                   shuffle = True,
                   train_size = 0.8,
                   random_state = 1)

X_text_train = BERT_Tokenizer(model = model, tokenizer = tokenizer, text = train.combined_text, max_len = max_length)
X_text_test = BERT_Tokenizer(model = model, tokenizer = tokenizer, text = test.combined_text, max_len = max_length)

Total Time (mins): 0:01:35
Total Time (mins): 0:00:24


In [6]:
#Model Board and Predict
Y_train_board = train.Board
Y_test_board = test.Board

board_svm = SVC(C=1, kernel='linear', class_weight='balanced', gamma=0.1)
board_svm.fit(X_text_train,Y_train_board)

board_train = board_svm.predict(X_text_train)
board_test = board_svm.predict(X_text_test)

print("Training: \n",confusion_matrix(Y_train_board,board_train))
print("Testing: \n",confusion_matrix(Y_test_board,board_test))

Training: 
 [[344   0   1]
 [  1 607  33]
 [  9  30 572]]
Testing: 
 [[ 78   4   6]
 [  5 139  28]
 [  8  15 117]]


# Severity and Impact Share these steps

In [7]:
###Typically you would pass Raw text dataframe in but since we are using BERT + SVM as a placehold - we can skip re-tokenizing
X_text_train = pd.DataFrame(X_text_train).assign(brd36 = [0]*len(X_text_train),brd41 = [0]*len(X_text_train),brd43 = [0]*len(X_text_train))
X_text_test = pd.DataFrame(X_text_test).assign(brd36 = [0]*len(X_text_test),brd41 = [0]*len(X_text_test),brd43 = [0]*len(X_text_test))


# Combine Text and OHE Source with Board Predictions 
X_features_train = severity.add_board_predictions(X_text_train, board_predict=board_train)
X_features_test = severity.add_board_predictions(X_text_test, board_predict=board_test)

# Encode Text with BERT
### Normally would need to do this but same reason as above 
#X_text_src_board_train = severity.format_inputs(X_features_train, max_len = 100) 
#X_text_src_board_test = severity.format_inputs(X_features_test, max_len = 100) 

X_text_src_board_train = X_features_train
X_text_src_board_test = X_features_test

X_text_src_board_test = X_text_src_board_test.set_index(test.index)
X_text_src_board_train = X_text_src_board_train.set_index(train.index)

source_board = ["email_connector","deskdirector","email","renewal","escalation"]
for i in source_board:
    X_text_src_board_test[i] = test[i]
    X_text_src_board_train[i] = train[i]


# Severity Prediction

In [8]:
# Get Y Labels
Y_severity_train = train.Severity
Y_severity_test = test.Severity

In [9]:
#Train Model
severity_svm_model, severity_train = severity.train_svm(X_text_src_board_train,Y_severity_train, model_name = "model1", save_model = "N", export_path = "./", verbose=2)

#Use Model to Predict (This is the command tyyo load from joblib file)
#pred_probs = severity.predict_svm(X_predict = X_text_src_board_test, import_path = "./model1.joblib", verbose = 2)

#For the Dashboard
predictions_severity = severity_svm_model.predict(X_text_src_board_test)

print("--Training--")
print(confusion_matrix(Y_severity_train,severity_train.Predict))
print(model_functions.modified_accuracy_score(y_true = Y_severity_train, y_predict = severity_train.Predict))

print("--Testing")
print(confusion_matrix(Y_severity_test,predictions_severity))
print(model_functions.modified_accuracy_score(y_true = Y_severity_test, y_predict = predictions_severity))

--Fitting Model--
--Done--
--Training--
[[647 125  96]
 [ 69 346 128]
 [ 10  21 155]]
0.8772698810269255
--Testing
[[154  45  25]
 [ 20  63  46]
 [  5  13  29]]
0.8425


# Impact Prediction

In [10]:
# Get Y Labels
Y_impact_train = train.Impact
Y_impact_test = test.Impact

In [12]:
#Train Model
impact_svm_model, impact_train = impact.train_svm(X_text_src_board_train,Y_impact_train, model_name = "model1", save_model = "N", export_path = "./", verbose=2)

#Use Model to Predict (This is the command tyyo load from joblib file)
#pred_probs = impact.predict_svm(X_predict = X_text_src_board_test, import_path = "./model1.joblib", verbose = 2)

#For the Dashboard
predictions_impact = impact_svm_model.predict(X_text_src_board_test)

print("--Training--")
print(confusion_matrix(Y_impact_train,impact_train.Predict))
print(model_functions.modified_accuracy_score(y_true = Y_impact_train, y_predict = impact_train.Predict))

print("--Testing")
print(confusion_matrix(Y_impact_test,predictions_impact))
print(model_functions.modified_accuracy_score(y_true = Y_impact_test, y_predict = predictions_impact))

--Fitting Model--
--Done--
--Training--
[[905 173 101]
 [ 26 195  44]
 [ 11  21 121]]
0.9004383218534753
--Testing
[[227  43  27]
 [ 13  41  11]
 [  9  10  19]]
0.8525


# Final Results

In [13]:
train = train.loc[:,["Impact","Severity","Board"]].assign(Board_Predictions = board_train, Severity_Predictions = severity_train.Predict, Impact_Predictions = impact_train.Predict)
train["Subset"] = "Train"
test = test.loc[:,["Impact","Severity","Board"]].assign(Board_Predictions = board_test, Severity_Predictions = predictions_severity, Impact_Predictions = predictions_impact)
test["Subset"] = "Test"
output_df = train.append(test)

In [14]:
train

Unnamed: 0,Impact,Severity,Board,Board_Predictions,Severity_Predictions,Impact_Predictions,Subset
0,0,2,41,41,2,0,Train
813,0,0,43,43,0,0,Train
1152,0,0,43,43,0,0,Train
361,0,0,36,36,0,0,Train
1726,1,1,41,41,2,1,Train
...,...,...,...,...,...,...,...
1791,1,2,41,41,1,2,Train
1096,1,1,41,41,2,1,Train
1932,0,1,41,41,2,1,Train
235,0,2,41,41,2,0,Train


In [15]:
impact_custom_metric = np.mean(cross_val_score(impact_svm_model, X_text_src_board_test, Y_impact_test, scoring=triage_metric, cv=10))
impact_f1_micro = np.mean(cross_val_score(impact_svm_model, X_text_src_board_test, Y_impact_test, scoring="f1_micro", cv=10))
impact_f1_weighted = np.mean(cross_val_score(impact_svm_model, X_text_src_board_test, Y_impact_test, scoring="f1_weighted", cv=10))

severity_custom_metric = np.mean(cross_val_score(severity_svm_model, X_text_src_board_test, Y_severity_test, scoring=triage_metric, cv=10))
severity_f1_micro = np.mean(cross_val_score(severity_svm_model, X_text_src_board_test, Y_severity_test, scoring="f1_micro", cv=10))
severity_f1_weighted = np.mean(cross_val_score(severity_svm_model, X_text_src_board_test, Y_severity_test, scoring="f1_weighted", cv=10))

board_accuracy = accuracy_score(Y_test_board, board_test)
board_f1_score = f1_score(Y_test_board, board_test, average = "weighted")

In [16]:
metrics = [impact_custom_metric
,impact_f1_micro
,impact_f1_weighted
,severity_custom_metric
,severity_f1_micro
,severity_f1_weighted
,board_accuracy
,board_f1_score]

In [17]:
output_metrics = pd.DataFrame(columns = ["impact_custom_metric","impact_f1_micro","impact_f1_weighted","severity_custom_metric","severity_f1_micro","severity_f1_weighted","board_accuracy","board_f1_score"])
output_metrics.loc[0] = [impact_custom_metric ,impact_f1_micro ,impact_f1_weighted ,severity_custom_metric ,severity_f1_micro ,severity_f1_weighted ,board_accuracy ,board_f1_score]
output_metrics

Unnamed: 0,impact_custom_metric,impact_f1_micro,impact_f1_weighted,severity_custom_metric,severity_f1_micro,severity_f1_weighted,board_accuracy,board_f1_score
0,0.81,0.6525,0.686454,0.8175,0.5775,0.590862,0.835,0.835418


In [21]:
list(output_metrics.loc[0])

[0.8100000000000002,
 0.6525000000000001,
 0.6864540441924161,
 0.8175000000000001,
 0.5775,
 0.590862493756491,
 0.835,
 0.8354175667869154]