In [2]:
import pandas as pd
import json
import joblib
from io import StringIO
import os
import re
import string
import spacy
from tqdm import tqdm
import numpy as np

In [5]:
# Read wildchat_en_cleaned data
wildchat_en_path = "/Users/hoyiwong/Library/CloudStorage/OneDrive-SharedLibraries-UniversityofWaterloo/Jiahe Huang - MSE 641 Project Data/wildchat_en_cleaned.jsonl"
wildchat_en = pd.read_json(wildchat_en_path, lines=True)

# Data Prepocessing

In [6]:
# Basic data inspection for wildchat_en dataset

print("Total rows:", wildchat_en.shape[0]) # Total number of rows

duplicate_count = wildchat_en["conversation_id"].duplicated().sum() # Count duplicate conversation_id entries
print("Duplicate conversation_id count:", duplicate_count)

turn_min, turn_max = wildchat_en["turn"].agg(["min", "max"]) # Show range of 'turn' 
print(f"Turn column range: {turn_min} to {turn_max}")

true_toxic = wildchat_en["toxic"].sum() # Count of rows with redacted == True
print("Total True in 'toxic':", true_toxic)

true_redacted = wildchat_en["redacted"].sum() # Count of rows with redacted == True
print("Total True in 'redacted':", true_redacted)

Total rows: 284168
Duplicate conversation_id count: 0
Turn column range: 1 to 78
Total True in 'toxic': 0
Total True in 'redacted': 2580


In [None]:
# Remove unnecessary columns
wildchat_en_cleaned= wildchat_en.drop(columns=["language", "openai_moderation", "detoxify_moderation", "toxic", "redacted"])

In [8]:
# Split conversation into prompt and response coloumn

# Create empty list for result
rows = []

for row in wildchat_en.itertuples(index=False):
    convo = row.conversation
    prompts = []
    responses = []

    i = 0
    while i < len(convo) - 1:
        user, assistant = convo[i], convo[i + 1]
        if user["role"] == "user" and assistant["role"] == "assistant":
            prompts.append(user["content"])
            responses.append(assistant["content"])
            i += 2
        else:
            i += 1

    # Combine all turns into single prompt/response thread
    rows.append((
        row.conversation_id,
        row.model,
        row.timestamp,
        row.turn,
        "\n\n".join(prompts),
        "\n\n".join(responses),
        row.toxic,
        row.redacted,
    ))

# Convert to DataFrame
flattened_df = pd.DataFrame(rows, columns=[
    "conversation_id", "model", "timestamp", "turn", "prompt", "response", "toxic", "redacted",
])

In [9]:
# Convert all text to lowercase
flattened_df["prompt"] = flattened_df["prompt"].str.lower()
flattened_df["response"] = flattened_df["response"].str.lower()

In [10]:
# Remove all punctuation and special characters
RE_PUNCT = re.compile(f"[{re.escape(string.punctuation)}]")
RE_MOJIBAKE = re.compile(r"[^\w\s]{3,}")

def remove_encoding_garbage(text):
    if not isinstance(text, str):
        return ""

    # ASCII encode-decode strip (mojibake nuke)
    text = text.encode("ascii", "ignore").decode("utf-8", "ignore")

    # Fast regex cleanup
    text = RE_PUNCT.sub("", text)
    text = RE_MOJIBAKE.sub("", text)

    return text.strip()

# Append to prompt and response coloumns
flattened_df["prompt"] = flattened_df["prompt"].apply(remove_encoding_garbage)
flattened_df["response"] = flattened_df["response"].apply(remove_encoding_garbage)

In [4]:
import nltk
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /Users/hoyiwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hoyiwong/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hoyiwong/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

# POS-filtered Lemmatization

In [2]:
# Load spaCy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Define function for POS-filtered lemmatization 
def pos_filtered_lemmatizer(text):
    # Remove non-alphabetic characters (retain space)
    text = re.sub(r"[^a-zA-Z\s]", "", str(text))
    doc = nlp(text)
    return " ".join([
        token.lemma_ for token in doc
        if token.pos_ in {"NOUN", "VERB"}  # Filter out ADJ, ADV, etc.
        and not token.is_stop
        and token.is_alpha
    ])

# Load dataset
flattened_df = pd.read_json("data/wildchat_en_cleaned.jsonl", lines=True)

# Checking progress
tqdm.pandas(desc="Progress")

# Lemmatized 'prompt' and 'response' columns
flattened_df["prompt"] = flattened_df["prompt"].progress_apply(pos_filtered_lemmatizer)

# Save output 
flattened_df.to_json("data/wildchat_en_lemmatized.jsonl", orient="records", lines=True, force_ascii=False)

Progress: 100%|██████████| 284168/284168 [1:12:26<00:00, 65.37it/s] 


In [None]:
# Load spaCy
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Load dataset
flattened_df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)
flattened_df = flattened_df.drop(columns=["toxic", "redacted"])

# Efficient lemmatization function using spaCy pipe
def lemmatize_full(texts, n_process=2, batch_size=300):
    return [
        " ".join(token.lemma_ for token in doc if token.is_alpha)
        for doc in tqdm(nlp.pipe(texts, batch_size=batch_size, n_process=n_process), total=len(texts), desc="Progress")
    ]

# Apply to 'prompt' and 'response'
flattened_df["prompt"] = lemmatize_full(flattened_df["prompt"].fillna(""))
flattened_df["response"] = lemmatize_full(flattened_df["response"].fillna(""))

# Save output
flattened_df.to_json("data/wildchat_en_lemmatized.jsonl", orient="records", lines=True, force_ascii=False)

Progress: 100%|██████████| 284168/284168 [14:38<00:00, 323.43it/s] 
Progress: 100%|██████████| 284168/284168 [1:19:55<00:00, 59.26it/s]  


In [None]:
# Checking sample result
df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)

# Randomly sample 20 rows
sample_df = df.sample(n=20, random_state=42)

# Save to CSV
sample_df.to_csv("data/sample_wildchat_en_lemmatized_sample.csv", index=False)

# Remove Stop Words

In [8]:
from nltk.corpus import stopwords

# Make sure stopwords are available
nltk.download("stopwords")
nltk.download("punkt")

# Load English stopwords
stop_words = set(stopwords.words("english"))
stop_words.update(["prompt", "ar", "hi", "pleas"]) # Update customize stopwords

# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
nlp.Defaults.stop_words |= stop_words

# Define remove stopwords function
def remove_stopwords(texts, n_process=2, batch_size=100):
    cleaned = []
    for doc in tqdm(nlp.pipe(texts, n_process=n_process, batch_size=batch_size), total=len(texts), desc="Removing stopwords"):
        cleaned.append(" ".join(token.text for token in doc if not token.is_stop and token.is_alpha))
    return cleaned

# Load dataset
flattened_df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)

# Apply with progress
flattened_df["prompt"] = remove_stopwords(flattened_df["prompt"].fillna(""))
flattened_df["response"] = remove_stopwords(flattened_df["response"].fillna(""))

# Save cleaned data
flattened_df.to_json("data/wildchat_cleaned_final.jsonl", orient="records", lines=True, force_ascii=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hoyiwong/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/hoyiwong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Removing stopwords: 100%|██████████| 284168/284168 [15:15<00:00, 310.52it/s] 
Removing stopwords: 100%|██████████| 284168/284168 [1:20:09<00:00, 59.08it/s]  


In [9]:
# Save a preview sample to CSV
flattened_df.head(20).to_csv("data/wildchat_cleaned_final_sample.csv", index=False)

# Phase Detection

In [7]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

RuntimeError: Compiled extensions are unavailable. If you've installed from a package, ask the package maintainer to include compiled extensions. If you're building Gensim from source yourself, install Cython and a C compiler, and then run `python setup.py build_ext --inplace` to retry. 

In [None]:
import itertools
from collections import Counter

# Load data
flattened_df = pd.read_json("data/wildchat_cleaned_final.jsonl", lines=True)

# Combine 'prompt' and 'response' into conversation
flattened_df["conversation"] = flattened_df["prompt"].fillna("") + " " + flattened_df["response"].fillna("")

# Tokenize by whitespace
tokenized_texts = [text.split() for text in flattened_df["conversation"]]

# Train Gensim Phrases mo
bigram_model = Phrases(tokenized_texts, min_count=10, threshold=15)
trigram_model = Phrases(bigram_model[tokenized_texts], min_count=10, threshold=10)
bigram_phraser = Phraser(bigram_model)
trigram_phraser = Phraser(trigram_model)

# Apply phrasers
flattened_df["phrase_tokens"] = [
    trigram_phraser[bigram_phraser[tokens]] for tokens in tqdm(tokenized_texts, desc="Applying Phrase Detection")
]

# Join tokens back into string format
flattened_df["phrase_text"] = [" ".join(tokens) for tokens in flattened_df["phrase_tokens"]]

# Save final version
flattened_df.to_json("data/wildchat_phrase_detected.jsonl", orient="records", lines=True, force_ascii=False)

RuntimeError: Compiled extensions are unavailable. If you've installed from a package, ask the package maintainer to include compiled extensions. If you're building Gensim from source yourself, install Cython and a C compiler, and then run `python setup.py build_ext --inplace` to retry. 

# Text Vectorization

In [None]:
# TfidVectorizer (TF-IDF)

from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

# Load dataset
flattened_df = pd.read_json("data/wildchat_cleaned_final.jsonl", lines=True)

# Combine prompt and response into a conversation field
flattened_df["conversation"] = flattened_df["prompt"].fillna("") + " " + flattened_df["response"].fillna("")

# Convert to list for vectorizer
conversation = flattened_df["conversation"].tolist()

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    max_features=5000,        # top 5000 features 
    ngram_range=(1, 4),       # unigrams to 4-grams 
    stop_words=None,          
    lowercase=False,          
    max_df=0.95,              
    min_df=5,                 
    norm='l2',                # L2 normalization of vectors
    use_idf=True,             
    smooth_idf=True           
)

# Fit and transform on full conversation
with tqdm(total=len(conversation), desc="TF-IDF fitting") as pbar:
    X_convo = vectorizer.fit_transform(conversation)
    pbar.update(len(conversation)) 

# Save vectorizer and matrix
joblib.dump(vectorizer, "data/tfidf_vectorizer.pkl")
joblib.dump(X_convo, "data/tfidf_matrix.pkl")

print("Matrix shape:", X_convo.shape)
print("Vocabulary size:", len(vectorizer.get_feature_names_out()))

TF-IDF fitting:   0%|          | 0/284168 [27:46<?, ?it/s]


In [None]:
# Get feature names and average TF-IDF score
feature_names = np.array(vectorizer.get_feature_names_out())
mean_tfidf_scores = X_convo.mean(axis=0).A1

# Check Top 50 
top_indices = mean_tfidf_scores.argsort()[::-1][:50]
top_features = feature_names[top_indices]
top_scores = mean_tfidf_scores[top_indices]

# Print the result
print("Top 20 Most Important TF-IDF Features in conversation:")
for feature, score in zip(top_features, top_scores):
    print(f"{feature:<30} {score:.4f}")

Top 20 Most Important TF-IDF Features:
detail                         0.0281
write                          0.0251
use                            0.0233
descript                       0.0215
thi                            0.0206
detail descript                0.0179
imag                           0.0170
style                          0.0169
gener                          0.0129
natsuki                        0.0125
imagin                         0.0124
captur                         0.0119
creat                          0.0117
like                           0.0115
ai                             0.0111
scene                          0.0111
follow                         0.0105
structur                       0.0099
sayori                         0.0099
pleas                          0.0099


# Dimensionality Reduction

In [None]:
# determine n_components
import matplotlib.pyplot as plt

# Plot the cumulative explained variance 
plt.plot(np.cumsum(svd.explained_variance_ratio_))
plt.xlabel("Number of Components")
plt.ylabel("Cumulative Explained Variance")

# Display the plot
plt.title("Explained Variance by Number of SVD Components")
plt.grid(True)
plt.show()

In [None]:
# TruncatedSVD
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100, random_state=42)
X_reduced = svd.fit_transform(X_convo)

# K-mean Clustering

# Evaluation

In [None]:
# Internal Validation:

# Silhouette Score

# Calinski-Harabasz Index

# Davies-Bouldin Index

# Visual Tools:

# Elbow Plot

# Silhouette Plot

# Cluster Analysis