# Example From Pre-processing to Prediction

In [2]:
### General Packages ###
import pandas as pd
import datetime
import altair as alt

### For Model Exporting ###
from joblib import dump, load

### Metrics for Evaluation ###
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### For Board Modelling ###
from sklearn.svm import SVC

### Ticket triage functions ###
import sys
sys.path.append("src/Auxiliary/")
sys.path.append("src/Cleaning/")
sys.path.append("src/Model/")
sys.path.append("src/Tokenizer/")

### For pre-processing ###
import ticket_cleaner
import bert_tokenizer

### For Board Modelling ###
import board

### For Severity Modelling ###
import severity

### For Impact Modelling ###
import impact

Compute engine used:  cpu


# Import Data

In [3]:
train_set = pd.read_excel("Data/Tickets with Classifications.xlsx")

# Pre-processing

In [3]:
data = ticket_cleaner.clean_tickets(ticketNbr = train_set.ticketNbr, contact_name = train_set.contact_name, company_name = train_set.company_name, Summary = train_set.Summary, Initial_Description = train_set.Initial_Description, Impact = train_set.SR_Impact_RecID, Severity = train_set.SR_Severity_RecID, Board = train_set.SR_Board_RecID, Source = train_set.Source, date_entered = train_set.date_entered)
train, test = train_test_split(data,
                   shuffle = True,
                   train_size = 0.8,
                   random_state = 1)

# Board Prediction

In [4]:
# Roberta processing/tokenization of input data
train_iter, valid_iter = board.format_inputs(train, split = 0.1)

In [5]:
# Train the Model
# Input data are training and validation iterators
# Results stored internally as .pkl file
# Training the model on 2K entries older dataset
### What worked best:
# pretrain_epoch = 6
# train_epoch = 12
board.train_roberta(train_iter, valid_iter,model_path = "./Saved_Models/Board",pretrain_epoch = 1, train_epoch = 2)

Downloading: 100%|██████████| 481/481 [00:00<00:00, 498kB/s]
Downloading: 100%|██████████| 501M/501M [00:14<00:00, 35.3MB/s]
Epoch [1/1], global step [90/90], PT Loss: 1.0891, Val Loss: 1.2683
Pre-training done!
Epoch [1/2], global step [90/180], Train Loss: 1.1436, Valid Loss: 1.2608


FileNotFoundError: [Errno 2] No such file or directory: './Saved_Models/Board/model.pkl'

In [29]:
# Predict the board classification for new data.
# Input data is cleaned text as pandas dataframe
# Testing the model on 4K entries newer dataset.
board_results = board.predict_roberta(df = test,path="./Saved_Models/Board/model.pkl")


=====Prediction Metrics=====
Classification Report:
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        88
           1     0.4650    0.9651    0.6276       172
           2     0.6977    0.2143    0.3279       140

    accuracy                         0.4900       400
   macro avg     0.3876    0.3931    0.3185       400
weighted avg     0.4441    0.4900    0.3846       400

Confusion Matrix:
[[  0  81   7]
 [  0 166   6]
 [  0 110  30]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Severity Prediction

In [None]:
# Combine Text and OHE Source with Board Predictions 
import severity

X_features_train = severity.add_board_predictions(X_train, board_predict = board_train)
X_features_test = severity.add_board_predictions(X_test, board_predict = board_test)

# Get Y Labels
Y_severity_train = X_train.Severity
Y_severity_test = X_test.Severity

# Encode Text with BERT
X_text_src_board_train = severity.format_inputs(X_features_train, max_len = 100)
X_text_src_board_test = severity.format_inputs(X_features_test, max_len = 100)

In [None]:
#Train Model
severity.train_combined(X_text_src_board_train,Y_severity_train, save_model = "Y", export_path = "./", verbose=2)

#Use Model to Predict
pred_probs = severity.predict_combined(X_predict = X_text_src_board_test, import_path = "./svm_severity_combined.joblib", verbose = 2)

print(confusion_matrix(Y_severity_test,pred_probs.Predict))

In [None]:
test_results = pred_probs.copy()
test_results = test_results.assign(text = X_test.combined_text, Actual = X_test.Severity)
test_results

# Impact Prediction

In [None]:
# Combine Text and OHE Source with Board Predictions 
import impact 

X_features_train = severity.add_board_predictions(X_train, board_predict = board_train)
X_features_test = severity.add_board_predictions(X_test, board_predict = board_test)

# Get Y Labels
Y_impact_train = X_train.Impact
Y_impact_test = X_test.Impact

# Encode Text with BERT
X_text_src_board_train = severity.format_inputs(X_features_train, max_len = 100) 
X_text_src_board_test = severity.format_inputs(X_features_test, max_len = 100) 

In [None]:
#Train Model
impact.train_combined(X_text_src_board_train,Y_impact_train, export_path = "./", verbose=2)

#Use Model to Predict
pred_probs = impact.predict_combined(X_predict = X_text_src_board_test, import_path = "./svm_severity_combined.joblib", verbose = 2)

print(confusion_matrix(Y_impact_test,pred_probs.Predict))

In [None]:
test_results = pred_probs.copy()
test_results = test_results.assign(text = X_test.combined_text, Actual = X_test.Impact)
test_results