In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
import re
import os
from tqdm import tqdm
import torch
from src.config import RAW_DATA_DIR, INTERIM_DATA_DIR

categories = {'order-modifications': ['item-additions',
  'item-removals',
  'quantity-adjustments',
  'late-modification-requests'],
 'order-cancellations': ['standard-cancellations',
  'urgent-cancellations',
  'rescheduled-orders'],
 'delivery-issues': ['late-deliveries',
  'missed-deliveries',
  'incomplete-deliveries',
  'damaged-goods',
  'delivery-confirmation-issues'],
 'pickup-scheduling-&-rescheduling': ['new-pickup-requests',
  'rescheduling-pickup',
  'missed-pickups',
  'pickup-policy-clarifications'],
 'product-availability-&-substitutions': ['stock-availability-inquiries',
  'out-of-stock-notifications',
  'product-substitution-requests',
  'special-item-requests'],
 'grant-&-billing-issues': ['grant-fund-usage',
  'incorrect-grant-deduction',
  'billing-discrepancies',
  'payment-&-credit-issues'],
 'training-&-account-access': ['training-signups',
  'missed-training-sessions',
  'login-issues',
  'new-user-account-requests'],
 'emergency-situations': ['weather-related-disruptions',
  'personal/organization-emergencies',
  'food-safety-concerns',
  'unexpected-facility-closures'],
 'special-requests': ['educational-materials',
  'large-event-orders',
  'holiday-&-seasonal-adjustments'],
 'technical-support': ['website-&-ordering-system-errors',
  'email-&-communication-issues',
  'data-entry-mistakes',
  'general-it-assistance'],
 'other': ['miscellaneous']}

def read_ticket_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        content = file.read()
    
    # Split the content by the ticket separator
    tickets = content.split("END OF TICKET")
    
    # Remove empty tickets and strip whitespace
    tickets = [ticket.strip() for ticket in tickets if ticket.strip()]
    
    return tickets

def preprocess_ticket(ticket):
    if ticket.lower() == "nan" or ticket.strip() == "":
        return "other/miscellaneous"
    
    # Remove special characters and formatting
    ticket = re.sub(r'\{color:[^}]*\}|\{color\}|\{quote\}|\{adf\}.*?\{adf\}', ' ', ticket)
    ticket = re.sub(r'<[^>]*>', ' ', ticket)
    ticket = re.sub(r'!https?://[^\s!]*!', ' ', ticket)
    ticket = re.sub(r'\[[^\]]*\]', ' ', ticket)
    ticket = re.sub(r'https?://\S+', ' ', ticket)
    ticket = re.sub(r'\s+', ' ', ticket)
    
    return ticket.strip()

def generate_class_labels():
    labels = []
    for category, subcategories in categories.items():
        for subcategory in subcategories:
            labels.append(f"{category}/{subcategory}")
    return labels

def classify_ticket(ticket, classifier, class_labels):
    if ticket.lower() == "empty ticket":
        return "other/miscellaneous"
    
    # Use the classifier to predict the category
    result = classifier(ticket, class_labels, multi_label=False)
    return result['labels'][0] 

In [2]:
tickets = read_ticket_data(RAW_DATA_DIR / "combined_ticket_data.txt")
preprocessed_tickets = [preprocess_ticket(ticket) for ticket in tickets]
class_labels = generate_class_labels()
    
print("Loading zero-shot classification model...")
classifier = pipeline("zero-shot-classification", 
                        model="facebook/bart-large-mnli",
                        device="cuda")

Loading zero-shot classification model...


Device set to use cuda


In [3]:
result = classifier(preprocessed_tickets, class_labels, multi_label=False)

In [4]:
output = [o['labels'][0] for o in result]

In [6]:
pd.DataFrame(result).to_csv('raw_with_cat.csv', index=False)

In [7]:
raw_xl = pd.read_excel(RAW_DATA_DIR / 'cafb_data_case2.xlsx')
raw_xl

Unnamed: 0,Summary,Issue key,Issue id,Issue Type,Status,Project key,Project name,Priority,Resolution,Assignee,...,Comment.11,Comment.12,Comment.13,Comment.14,Comment.15,Comment.16,Comment.17,Comment.18,Comment.19,Comment.20
0,SO-XXXXXX - TEST,PART1-5272,28196,[System] Service request,Closed,PART1,Partner Support,Medium,Done,BM,...,,,,,,,,,,
1,test 10,PART1-3843,28206,[System] Service request,Closed,PART1,Partner Support,Medium,Done,DH,...,,,,,,,,,,
2,Test 5 - PART,PART1-6049,28107,[System] Service request,Closed,PART1,Partner Support,Medium,Done,DH,...,,,,,,,,,,
3,Re: CAFB Pick Up TOMORROW,PART1-7176,65688,[System] Service request,Closed,PART1,Partner Support,Medium,Done,CH,...,,,,,,,,,,
4,addition,PART1-6512,55866,[System] Service request,Closed,PART1,Partner Support,Medium,Done,CH,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Produce Request SO192376,PART1-8375,80442,[System] Service request,Closed,PART1,Partner Support,Medium,Done,CR,...,,,,,,,,,,
996,Got this vmail,PART1-7890,73968,[System] Service request,Closed,PART1,Partner Support,Medium,Done,DH,...,,,,,,,,,,
997,Prince George's Community College,PART1-5081,24725,[System] Service request,Closed,PART1,Partner Support,Medium,Done,DH,...,,,,,,,,,,
998,New Shopper Account,PART1-7182,65731,[System] Service request,Closed,PART1,Partner Support,Medium,Done,CR,...,,,,,,,,,,


In [19]:
description = pd.Series(read_ticket_data(RAW_DATA_DIR / 'raw_description_processed.txt'), name='Desc')

In [None]:
main_cat = pd.Series([out.split('/')[0] for out in output], name='Main Category')
fine_cat = pd.Series([out.split('/')[1] for out in output], name='Sub-Category')
title = pd.Series(output, name='Title')
df = pd.concat((title, description, main_cat, fine_cat, raw_xl), axis=1)
df['Comments'] = df.filter(like='Comment').apply(lambda x: '\n'.join(x.dropna()), axis=1)
#df.to_csv(INTERIM_DATA_DIR / 'ticket_data.csv', index=False)

In [3]:
df = pd.read_csv(INTERIM_DATA_DIR / 'ticket_data.csv')

In [4]:
df.columns = df.columns.str.replace(r'^Custom field \((.*?)\)$', r'\1', regex=True)
df

Unnamed: 0,Title,Desc,Main Category,Sub-Category,Summary,Issue key,Issue id,Issue Type,Status,Project key,...,Comment.12,Comment.13,Comment.14,Comment.15,Comment.16,Comment.17,Comment.18,Comment.19,Comment.20,Comments
0,Other/Miscellaneous,Empty ticket,Other,Miscellaneous,SO-XXXXXX - TEST,PART1-5272,28196,[System] Service request,Closed,PART1,...,,,,,,,,,,07/10/2023 01:07;5fb17b020dd553006f17ff0a;Hi D...
1,Other/Miscellaneous,Empty ticket,Other,Miscellaneous,test 10,PART1-3843,28206,[System] Service request,Closed,PART1,...,,,,,,,,,,04/26/2024 07:53;557058:f58131cb-b67d-43c7-b30...
2,Other/Miscellaneous,This is a description,Other,Miscellaneous,Test 5 - PART,PART1-6049,28107,[System] Service request,Closed,PART1,...,,,,,,,,,,07/07/2023 06:59;5fb17b020dd553006f17ff0a;Work...
3,Order Cancellations/Rescheduled Orders,Sent email Friday to cancel order Yvonne Brown...,Order Cancellations,Rescheduled Orders,Re: CAFB Pick Up TOMORROW,PART1-7176,65688,[System] Service request,Closed,PART1,...,,,,,,,,,,08/14/2024 09:57;557058:f58131cb-b67d-43c7-b30...
4,Order Modifications/Item Additions,Can you please add 25 cases of the following t...,Order Modifications,Item Additions,addition,PART1-6512,55866,[System] Service request,Closed,PART1,...,,,,,,,,,,05/07/2024 12:17;557058:f58131cb-b67d-43c7-b30...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Order Modifications/Quantity Adjustments,Thank you!,Order Modifications,Quantity Adjustments,Produce Request SO192376,PART1-8375,80442,[System] Service request,Closed,PART1,...,,,,,,,,,,01/27/2025 11:47;712020:38c82492-84fe-40dc-890...
996,Other/Miscellaneous,Number: 301.648.4884 Hello! This is Christiana...,Other,Miscellaneous,Got this vmail,PART1-7890,73968,[System] Service request,Closed,PART1,...,,,,,,,,,,11/20/2024 11:12;712020:8bb74f6d-e197-4782-a07...
997,Other/Miscellaneous,Empty ticket,Other,Miscellaneous,Prince George's Community College,PART1-5081,24725,[System] Service request,Closed,PART1,...,,,,,,,,,,04/26/2023 10:20;61536ea272f6970069fc1dbd;Good...
998,Pickup Scheduling & Rescheduling/New Pickup Re...,Empty ticket,Pickup Scheduling & Rescheduling,New Pickup Requests,New Shopper Account,PART1-7182,65731,[System] Service request,Closed,PART1,...,,,,,,,,,,08/14/2024 12:58;qm:65d20c5a-0c84-458e-8ce1-ad...


In [None]:
df['Main Category'] = df['Main Category'].str.lower().str.replace(r'\s+', '-', regex=True)
df['Sub-Category'] = df['Sub-Category'].str.lower().str.replace(r'\s+', '-', regex=True)
df

Unnamed: 0,Title,Desc,Main Category,Sub-Category,Summary,Issue key,Issue id,Issue Type,Status,Project key,...,Comment.12,Comment.13,Comment.14,Comment.15,Comment.16,Comment.17,Comment.18,Comment.19,Comment.20,Comments
0,Other/Miscellaneous,Empty ticket,other,miscellaneous,SO-XXXXXX - TEST,PART1-5272,28196,[System] Service request,Closed,PART1,...,,,,,,,,,,07/10/2023 01:07;5fb17b020dd553006f17ff0a;Hi D...
1,Other/Miscellaneous,Empty ticket,other,miscellaneous,test 10,PART1-3843,28206,[System] Service request,Closed,PART1,...,,,,,,,,,,04/26/2024 07:53;557058:f58131cb-b67d-43c7-b30...
2,Other/Miscellaneous,This is a description,other,miscellaneous,Test 5 - PART,PART1-6049,28107,[System] Service request,Closed,PART1,...,,,,,,,,,,07/07/2023 06:59;5fb17b020dd553006f17ff0a;Work...
3,Order Cancellations/Rescheduled Orders,Sent email Friday to cancel order Yvonne Brown...,order-cancellations,rescheduled-orders,Re: CAFB Pick Up TOMORROW,PART1-7176,65688,[System] Service request,Closed,PART1,...,,,,,,,,,,08/14/2024 09:57;557058:f58131cb-b67d-43c7-b30...
4,Order Modifications/Item Additions,Can you please add 25 cases of the following t...,order-modifications,item-additions,addition,PART1-6512,55866,[System] Service request,Closed,PART1,...,,,,,,,,,,05/07/2024 12:17;557058:f58131cb-b67d-43c7-b30...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Order Modifications/Quantity Adjustments,Thank you!,order-modifications,quantity-adjustments,Produce Request SO192376,PART1-8375,80442,[System] Service request,Closed,PART1,...,,,,,,,,,,01/27/2025 11:47;712020:38c82492-84fe-40dc-890...
996,Other/Miscellaneous,Number: 301.648.4884 Hello! This is Christiana...,other,miscellaneous,Got this vmail,PART1-7890,73968,[System] Service request,Closed,PART1,...,,,,,,,,,,11/20/2024 11:12;712020:8bb74f6d-e197-4782-a07...
997,Other/Miscellaneous,Empty ticket,other,miscellaneous,Prince George's Community College,PART1-5081,24725,[System] Service request,Closed,PART1,...,,,,,,,,,,04/26/2023 10:20;61536ea272f6970069fc1dbd;Good...
998,Pickup Scheduling & Rescheduling/New Pickup Re...,Empty ticket,pickup-scheduling-&-rescheduling,new-pickup-requests,New Shopper Account,PART1-7182,65731,[System] Service request,Closed,PART1,...,,,,,,,,,,08/14/2024 12:58;qm:65d20c5a-0c84-458e-8ce1-ad...


In [None]:
#df.to_csv(INTERIM_DATA_DIR / 'ticket_data.csv', index=False)

In [4]:
df.columns

Index(['Title', 'Desc', 'Main Category', 'Sub-Category', 'Summary',
       'Issue key', 'Issue id', 'Issue Type', 'Status', 'Project key',
       'Project name', 'Priority', 'Resolution', 'Assignee',
       'Reporter (Email)', 'Creator (Email)', 'Created', 'Updated',
       'Last Viewed', 'Resolved', 'Due date', 'Description', 'Partner Names',
       'Cause of issue', 'Record/Transaction ID', 'Region',
       'Relevant Departments', 'Custom field (Relevant Departments).1',
       'Request Category', 'Request Type', 'Request language',
       'Resolution Action', 'Satisfaction rating', 'Satisfaction date',
       'Source', 'Time to first response', 'Time to resolution',
       'Work category', 'Status Category', 'Status Category Changed',
       '[CHART] Date of First Response', 'Comment', 'Comment.1', 'Comment.2',
       'Comment.3', 'Comment.4', 'Comment.5', 'Comment.6', 'Comment.7',
       'Comment.8', 'Comment.9', 'Comment.10', 'Comment.11', 'Comment.12',
       'Comment.13', 'Comme