In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

#Dataset import
df = pd.read_csv('data/raw/customer_support_tickets.csv')

In [2]:
#Handle Duplicate Entries
df.drop_duplicates(inplace=True)
df.reset_index(drop=True, inplace=True)
# Handle Missing Values
df.fillna(method='ffill', inplace=True)
df.fillna(method='bfill', inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
#Display the first few rows to confirm
print("\nSample of the initial data:")
display(df.head())


Sample of the initial data:


  df.fillna(method='ffill', inplace=True)
  df.fillna(method='bfill', inplace=True)


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,Case maybe show recently my computer follow.,Critical,Social media,2023-06-01 12:15:36,2023-06-01 18:05:38,3.0
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,Case maybe show recently my computer follow.,Critical,Chat,2023-06-01 16:45:38,2023-06-01 18:05:38,3.0
2,3,Christopher Robbins,gonzalestracy@example.com,48,Other,Dell XPS,2020-07-14,Technical issue,Network problem,I'm facing a problem with my {product_purchase...,Closed,Case maybe show recently my computer follow.,Low,Social media,2023-06-01 11:14:38,2023-06-01 18:05:38,3.0
3,4,Christina Dillon,bradleyolson@example.org,27,Female,Microsoft Office,2020-11-13,Billing inquiry,Account access,I'm having an issue with the {product_purchase...,Closed,Try capital clearly never color toward story.,Low,Social media,2023-06-01 07:29:40,2023-06-01 01:57:40,3.0
4,5,Alexander Carroll,bradleymark@example.com,67,Female,Autodesk AutoCAD,2020-02-04,Billing inquiry,Data loss,I'm having an issue with the {product_purchase...,Closed,West decision evidence bit.,Low,Email,2023-06-01 00:12:42,2023-06-01 19:53:42,1.0


In [4]:
import re

def preprocess_text(text):
    """
    Cleans and preprocesses a single text document.
    - Converts to lowercase
    - Removes punctuation and special characters
    - Removes numbers
    - Removes extra whitespace
    """
    if not isinstance(text, str):
        return ""

    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation and special characters (keeping only letters and spaces)
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Remove numbers - this is technically covered by the regex above,
    # but explicitly showing it is good for clarity.
    # text = re.sub(r'\d+', '', text)
    
    # Remove extra whitespace
    text = ' '.join(text.split())
    
    return text

# --- Apply the preprocessing function to the 'Ticket Description' column ---
print("Applying text preprocessing...")
df['Processed Description'] = df['Ticket Description'].apply(preprocess_text)
print("Preprocessing complete.")

# --- Let's see the results: Compare original vs. processed text ---
print("\n--- Before vs. After Preprocessing ---")
pd.set_option('display.max_colwidth', None) # To see the full text
display(df[['Ticket Description', 'Processed Description']].head())

Applying text preprocessing...
Preprocessing complete.

--- Before vs. After Preprocessing ---


Unnamed: 0,Ticket Description,Processed Description
0,"I'm having an issue with the {product_purchased}. Please assist.\n\nYour billing zip code is: 71701.\n\nWe appreciate that you have requested a website address.\n\nPlease double check your email address. I've tried troubleshooting steps mentioned in the user manual, but the issue persists.",im having an issue with the productpurchased please assist your billing zip code is we appreciate that you have requested a website address please double check your email address ive tried troubleshooting steps mentioned in the user manual but the issue persists
1,"I'm having an issue with the {product_purchased}. Please assist.\n\nIf you need to change an existing product.\n\nI'm having an issue with the {product_purchased}. Please assist.\n\nIf The issue I'm facing is intermittent. Sometimes it works fine, but other times it acts up unexpectedly.",im having an issue with the productpurchased please assist if you need to change an existing product im having an issue with the productpurchased please assist if the issue im facing is intermittent sometimes it works fine but other times it acts up unexpectedly
2,"I'm facing a problem with my {product_purchased}. The {product_purchased} is not turning on. It was working fine until yesterday, but now it doesn't respond.\n\n1.8.3 I really I'm using the original charger that came with my {product_purchased}, but it's not charging properly.",im facing a problem with my productpurchased the productpurchased is not turning on it was working fine until yesterday but now it doesnt respond i really im using the original charger that came with my productpurchased but its not charging properly
3,"I'm having an issue with the {product_purchased}. Please assist.\n\nIf you have a problem you're interested in and I'd love to see this happen, please check out the Feedback. I've already contacted customer support multiple times, but the issue remains unresolved.",im having an issue with the productpurchased please assist if you have a problem youre interested in and id love to see this happen please check out the feedback ive already contacted customer support multiple times but the issue remains unresolved
4,I'm having an issue with the {product_purchased}. Please assist.\n\n\nNote: The seller is not responsible for any damages arising out of the delivery of the battleground game. Please have the game in good condition and shipped to you I've noticed a sudden decrease in battery life on my {product_purchased}. It used to last much longer.,im having an issue with the productpurchased please assist note the seller is not responsible for any damages arising out of the delivery of the battleground game please have the game in good condition and shipped to you ive noticed a sudden decrease in battery life on my productpurchased it used to last much longer


In [5]:
# --- Extract the cleaned text into a list ---
# This list will be the primary input for our BERTopic model.
documents = df['Processed Description'].tolist()

# --- Verify the final output ---
print(f"Successfully created a list of {len(documents)} documents.")
print("\nFirst 5 processed documents for the model:")
for doc in documents[:5]:
    print(f"- {doc}")

# --- Keep the DataFrame for context ---
# It's important to keep the original DataFrame. After we find topics,
# we can map them back to the original ticket details, ticket types, etc.
# For example, we could check if a topic like "password reset" is mostly
# associated with the "Technical support" Ticket Type.

Successfully created a list of 8469 documents.

First 5 processed documents for the model:
- im having an issue with the productpurchased please assist your billing zip code is we appreciate that you have requested a website address please double check your email address ive tried troubleshooting steps mentioned in the user manual but the issue persists
- im having an issue with the productpurchased please assist if you need to change an existing product im having an issue with the productpurchased please assist if the issue im facing is intermittent sometimes it works fine but other times it acts up unexpectedly
- im facing a problem with my productpurchased the productpurchased is not turning on it was working fine until yesterday but now it doesnt respond i really im using the original charger that came with my productpurchased but its not charging properly
- im having an issue with the productpurchased please assist if you have a problem youre interested in and id love to see this 

In [6]:
#Feature Engineering
from sentence_transformers import SentenceTransformer

# --- Step 1: Load a Pre-trained Model ---
# 'all-MiniLM-L6-v2' is a popular and efficient model.
# It's great for tasks like clustering and semantic search.
# The first time you run this, it will download the model (a few hundred MB).
model_name = 'all-MiniLM-L6-v2'
embedding_model = SentenceTransformer(model_name)
print(f"Loaded sentence-transformer model: '{model_name}'")

# --- Step 2: Define Some Example Sentences ---
# Let's use examples relevant to our dataset.
example_tickets = [
    "I can't log in, I need to reset my password.", # Technical/Account
    "My user credentials are not working.",        # Technical/Account
    "I was charged twice for my subscription this month.", # Billing
    "Please send me a copy of my recent invoice.",      # Billing
    "Can you add a feature to export data to CSV?",     # Feature Request
]

# --- Step 3: Generate the Embeddings ---
# The model's .encode() method takes a list of strings and returns a list of vectors.
print("\nGenerating embeddings for example tickets...")
example_embeddings = embedding_model.encode(example_tickets)

# --- Step 4: Inspect the "Features" ---
# Let's look at what we've created.
print(f"\nShape of the embeddings array: {example_embeddings.shape}")
print(f"This means we have {example_embeddings.shape[0]} embeddings, each with {example_embeddings.shape[1]} dimensions.")

print("\nEmbedding for the first ticket:")
print(example_embeddings[0])

  from .autonotebook import tqdm as notebook_tqdm


Loaded sentence-transformer model: 'all-MiniLM-L6-v2'

Generating embeddings for example tickets...

Shape of the embeddings array: (5, 384)
This means we have 5 embeddings, each with 384 dimensions.

Embedding for the first ticket:
[-2.63578556e-02 -8.03123564e-02 -3.61161269e-02 -6.73727319e-02
 -1.47047834e-02  1.09731881e-02 -3.90267745e-03 -1.54606355e-02
  3.24102566e-02  8.89787823e-03 -3.34946439e-02  4.50498089e-02
 -3.35606909e-03  2.30902676e-02  1.12274466e-02 -5.65842073e-03
 -3.70201617e-02  9.08199474e-02 -3.91956046e-02  3.27349752e-02
  1.36003466e-02 -7.03713344e-03 -8.72745067e-02 -3.97684146e-03
 -3.79699934e-03 -3.88849229e-02 -2.63107046e-02  5.00894152e-03
 -1.16991162e-01 -8.19111429e-03 -3.03763263e-02  3.87527123e-02
  4.32870677e-03 -1.14133293e-02  6.42795786e-02 -1.68435983e-02
 -3.70185338e-02  7.80365476e-03  7.68470243e-02 -3.50628309e-02
 -6.97272122e-02  2.80128717e-02 -5.12740687e-02  3.55680473e-02
  6.03536814e-02  3.51154059e-02 -6.96393996e-02  1.

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# --- Compare two similar tickets about account access ---
account_similarity = cosine_similarity(
    [example_embeddings[0]], # "I can't log in..."
    [example_embeddings[1]]  # "My user credentials..."
)
print(f"Similarity between two ACCOUNT tickets: {account_similarity[0][0]:.4f}")

# --- Compare two similar tickets about billing ---
billing_similarity = cosine_similarity(
    [example_embeddings[2]], # "I was charged twice..."
    [example_embeddings[3]]  # "Please send me an invoice..."
)
print(f"Similarity between two BILLING tickets: {billing_similarity[0][0]:.4f}")

# --- Compare two completely different tickets ---
dissimilarity = cosine_similarity(
    [example_embeddings[0]], # "I can't log in..."
    [example_embeddings[4]]  # "Can you add a feature..."
)
print(f"Similarity between an ACCOUNT and FEATURE REQUEST ticket: {dissimilarity[0][0]:.4f}")

Similarity between two ACCOUNT tickets: 0.5958
Similarity between two BILLING tickets: 0.2823
Similarity between an ACCOUNT and FEATURE REQUEST ticket: -0.0297


In [8]:
#proceessed dataset
df.to_csv('data/processed/customer_support_tickets_processed.csv', index=False)
print("Processed dataset saved to data/processed/customer_support_tickets_processed.csv")

Processed dataset saved to data/processed/customer_support_tickets_processed.csv
