In [None]:
import pandas as pd
import re
import string
import spacy
from tqdm import tqdm
import numpy as np

In [None]:
# Read wildchat_en_cleaned data
# NOTE(review): the path below is an absolute, user-specific OneDrive location, so this
# cell will not find the file on another machine unless the path is edited.
wildchat_en_path = "/Users/hoyiwong/Library/CloudStorage/OneDrive-SharedLibraries-UniversityofWaterloo/Jiahe Huang - MSE 641 Project Data/wildchat_en_cleaned.jsonl"
# lines=True: the file is parsed as JSON Lines (one JSON record per line)
wildchat_en = pd.read_json(wildchat_en_path, lines=True)

Data preprocessing

In [None]:
# Basic data inspection for wildchat_en dataset
# Prints five summary figures: row count, repeated conversation ids, the range of
# the 'turn' column, and the sums of the 'toxic' and 'redacted' columns.

print("Total rows:", wildchat_en.shape[0]) # Total number of rows

duplicate_count = wildchat_en["conversation_id"].duplicated().sum() # Count duplicate conversation_id entries
print("Duplicate conversation_id count:", duplicate_count)

turn_min, turn_max = wildchat_en["turn"].agg(["min", "max"]) # Show range of 'turn'
print(f"Turn column range: {turn_min} to {turn_max}")

# assumes 'toxic' and 'redacted' are boolean columns, so .sum() counts the True rows — TODO confirm
true_toxic = wildchat_en["toxic"].sum() # Count of rows with toxic == True
print("Total True in 'toxic':", true_toxic)

true_redacted = wildchat_en["redacted"].sum() # Count of rows with redacted == True
print("Total True in 'redacted':", true_redacted)


In [None]:
# Remove unnecessary columns
# NOTE(review): wildchat_en_cleaned is not referenced by any later cell visible here; the
# flattening loop iterates over wildchat_en and still reads its toxic/redacted fields.
wildchat_en_cleaned= wildchat_en.drop(columns=["language", "openai_moderation", "detoxify_moderation", "toxic", "redacted"])

In [None]:
# Flatten each conversation into one prompt string and one response string.
# Only adjacent (user, assistant) message pairs are kept; any message that does
# not start such a pair is skipped one position at a time.

rows = []

for record in wildchat_en.itertuples(index=False):
    messages = record.conversation
    prompts, responses = [], []

    idx = 0
    last_start = len(messages) - 1  # last index that still has a following message
    while idx < last_start:
        first, second = messages[idx], messages[idx + 1]
        is_exchange = first["role"] == "user" and second["role"] == "assistant"
        if is_exchange:
            prompts.append(first["content"])
            responses.append(second["content"])
        # A matched pair consumes two messages, anything else only one
        idx += 2 if is_exchange else 1

    # All turns of a conversation are merged into a single prompt/response thread
    joined_prompt = "\n\n".join(prompts)
    joined_response = "\n\n".join(responses)
    rows.append((
        record.conversation_id,
        record.model,
        record.timestamp,
        record.turn,
        joined_prompt,
        joined_response,
        record.toxic,
        record.redacted,
    ))

# Build the flattened DataFrame from the collected tuples
flattened_df = pd.DataFrame(
    rows,
    columns=["conversation_id", "model", "timestamp", "turn", "prompt", "response", "toxic", "redacted"],
)

In [None]:
# Lowercase both text columns
for text_col in ("prompt", "response"):
    flattened_df[text_col] = flattened_df[text_col].str.lower()

In [None]:
# Patterns used to strip punctuation and leftover non-word runs
RE_PUNCT = re.compile(f"[{re.escape(string.punctuation)}]")
RE_MOJIBAKE = re.compile(r"[^\w\s]{3,}")


def remove_encoding_garbage(text):
    """Return ``text`` reduced to ASCII with punctuation removed.

    Steps, in order:
      1. every character that cannot be encoded as ASCII is dropped;
      2. every character in ``string.punctuation`` is deleted (not replaced
         by a space, so ``"a-b"`` becomes ``"ab"``);
      3. any run of three or more characters that are neither word
         characters nor whitespace is deleted;
      4. leading and trailing whitespace is stripped.

    Anything that is not a ``str`` (None, NaN, numbers, ...) yields ``""``.
    """
    if isinstance(text, str):
        ascii_only = text.encode("ascii", "ignore").decode("utf-8", "ignore")
        without_punct = RE_PUNCT.sub("", ascii_only)
        return RE_MOJIBAKE.sub("", without_punct).strip()
    return ""

# Clean the prompt and response columns in place
for column_name in ("prompt", "response"):
    flattened_df[column_name] = flattened_df[column_name].apply(remove_encoding_garbage)


In [None]:
import nltk
# Fetch the NLTK data packages "punkt", "wordnet" and "averaged_perceptron_tagger".
# NOTE(review): the cells visible here only use nltk.corpus.stopwords (downloaded in a
# later cell) — confirm these three packages are still needed.
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")

POS-filtered Lemmatization

In [None]:
# Load spaCy
# The small English pipeline is loaded with its "ner" and "parser" components disabled;
# the lemmatizer function below only reads token.lemma_, pos_, is_stop and is_alpha.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Define function for POS-filtered lemmatization
def pos_filtered_lemmatizer(text):
    """Return the lemmas of the nouns and verbs in ``text``, space-separated.

    The text is first reduced to ASCII letters and whitespace, then run
    through the module-level spaCy pipeline ``nlp``. A token's lemma is kept
    only when the token is tagged NOUN or VERB, is not a stop word, and is
    purely alphabetic.

    Args:
        text: The text to lemmatize. ``None`` and float NaN are treated as
            missing; any other non-string value is converted with ``str()``.

    Returns:
        str: The kept lemmas joined by single spaces; ``""`` for missing
        input or when no token survives the filter.
    """
    # Missing cells must not be stringified: str(None) is "None" and
    # str(float("nan")) is "nan", both of which would pass the letter filter
    # below and reach the pipeline as if they were real words.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove non-alphabetic characters (retain whitespace)
    letters_only = re.sub(r"[^a-zA-Z\s]", "", str(text))
    doc = nlp(letters_only)
    return " ".join(
        token.lemma_ for token in doc
        if token.pos_ in {"NOUN", "VERB"}  # Filter out ADJ, ADV, etc.
        and not token.is_stop
        and token.is_alpha
    )

# Load dataset
# NOTE(review): this rebinds flattened_df from disk, so the in-memory frame built and
# cleaned by the earlier cells is discarded here. None of the visible cells writes
# data/wildchat_en_cleaned.jsonl — confirm the file holds 'prompt'/'response' columns.
flattened_df = pd.read_json("data/wildchat_en_cleaned.jsonl", lines=True)

# Checking progress
# tqdm.pandas is what makes .progress_apply available on the Series below
tqdm.pandas(desc="Progress")

# Lemmatize the 'prompt' column (nouns and verbs only)
# NOTE(review): 'response' is not passed through pos_filtered_lemmatizer in this cell — confirm this is intended.
flattened_df["prompt"] = flattened_df["prompt"].progress_apply(pos_filtered_lemmatizer)

# Save output
flattened_df.to_json("data/wildchat_en_lemmatized.jsonl", orient="records", lines=True, force_ascii=False)


In [None]:
# Load spaCy
# Only lemma_ and is_alpha are read from the tokens in this cell, so the
# "ner" and "parser" components stay disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

# Load dataset
flattened_df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)
# errors="ignore": this cell later saves back to the same file it reads here,
# so on a second run 'toxic'/'redacted' are already gone. Without the flag the
# drop raises KeyError and the cell cannot be re-run.
flattened_df = flattened_df.drop(columns=["toxic", "redacted"], errors="ignore")

# Batch lemmatization through spaCy's pipe
def lemmatize_full(texts, n_process=2, batch_size=300):
    """Lemmatize every text and return the results as a list of strings.

    The texts are streamed through the module-level pipeline ``nlp`` via
    ``nlp.pipe`` while a tqdm progress bar is shown. For each document, the
    lemmas of its alphabetic tokens are joined with single spaces.

    Args:
        texts: Sized collection of strings (``len(texts)`` sets the bar total).
        n_process: Forwarded to ``nlp.pipe``.
        batch_size: Forwarded to ``nlp.pipe``.

    Returns:
        list[str]: One lemmatized string per input text, in input order.
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    lemmatized = []
    for doc in tqdm(docs, total=len(texts), desc="Progress"):
        alpha_lemmas = [token.lemma_ for token in doc if token.is_alpha]
        lemmatized.append(" ".join(alpha_lemmas))
    return lemmatized

# Apply to 'prompt' and 'response'
# fillna(""): missing cells become empty strings before they are handed to lemmatize_full
flattened_df["prompt"] = lemmatize_full(flattened_df["prompt"].fillna(""))
flattened_df["response"] = lemmatize_full(flattened_df["response"].fillna(""))

# Save output
# NOTE: this is the same path the cell loaded from, so the input file is overwritten in place
flattened_df.to_json("data/wildchat_en_lemmatized.jsonl", orient="records", lines=True, force_ascii=False)


In [None]:
# Checking sample result
# Re-reads the lemmatized file from disk rather than using the in-memory frame
df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)

# Randomly sample 20 rows
# random_state=42 fixes the seed so the same rows are drawn on every run
sample_df = df.sample(n=20, random_state=42)

# Save to CSV
sample_df.to_csv("data/sample_wildchat_en_lemmatized_sample.csv", index=False)


Remove stop words

In [None]:
from nltk.corpus import stopwords

# Make sure stopwords are available
# (the nltk module itself is imported in an earlier cell)
nltk.download("stopwords")
nltk.download("punkt")

# Load English stopwords
stop_words = set(stopwords.words("english"))
# NOTE(review): "ar" and "pleas" look like stemmed forms, while the text at this stage
# has been lemmatized — confirm they match tokens that actually occur in the data.
stop_words.update(["prompt", "ar", "hi", "pleas"]) # Update customize stopwords

# Load spaCy model
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])
# Merge the NLTK list plus the custom words into spaCy's default stop-word set;
# presumably this is what token.is_stop consults in remove_stopwords — verify
# against the installed spaCy version.
nlp.Defaults.stop_words |= stop_words

# Stop-word removal through spaCy's pipe
def remove_stopwords(texts, n_process=2, batch_size=100):
    """Drop stop words and non-alphabetic tokens from every text.

    The texts are streamed through the module-level pipeline ``nlp`` via
    ``nlp.pipe`` while a tqdm progress bar is shown. For each document, the
    original text of every token that is not a stop word and is purely
    alphabetic is kept, joined with single spaces.

    Args:
        texts: Sized collection of strings (``len(texts)`` sets the bar total).
        n_process: Forwarded to ``nlp.pipe``.
        batch_size: Forwarded to ``nlp.pipe``.

    Returns:
        list[str]: One filtered string per input text, in input order.
    """
    docs = nlp.pipe(texts, n_process=n_process, batch_size=batch_size)
    progress = tqdm(docs, total=len(texts), desc="Removing stopwords")
    return [
        " ".join(token.text for token in doc if not token.is_stop and token.is_alpha)
        for doc in progress
    ]

# Load dataset
# Rebinds flattened_df from the lemmatized file on disk
flattened_df = pd.read_json("data/wildchat_en_lemmatized.jsonl", lines=True)

# Apply with progress
# fillna(""): missing cells become empty strings before they are handed to remove_stopwords
flattened_df["prompt"] = remove_stopwords(flattened_df["prompt"].fillna(""))
flattened_df["response"] = remove_stopwords(flattened_df["response"].fillna(""))

# Save cleaned data
flattened_df.to_json("data/wildchat_cleaned_final.jsonl", orient="records", lines=True, force_ascii=False)


In [None]:
# Write the first 20 rows to CSV as a quick preview
preview_path = "data/wildchat_cleaned_final_sample.csv"
flattened_df.head(20).to_csv(preview_path, index=False)


Phrase Detection


In [None]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [None]:
# Load data
flattened_df = pd.read_json("data/wildchat_cleaned_final.jsonl", lines=True)

# Combine 'prompt' and 'response' into conversation
# Missing values are replaced by "" so the concatenation always yields a string
flattened_df["conversation"] = flattened_df["prompt"].fillna("") + " " + flattened_df["response"].fillna("")

# Tokenize by whitespace
tokenized_texts = [text.split() for text in flattened_df["conversation"]]

# Train Gensim Phrases models: a bigram model on the raw tokens, then a second
# model on the bigram-merged tokens so that three-word phrases can form
bigram_model = Phrases(tokenized_texts, min_count=10, threshold=15)
trigram_model = Phrases(bigram_model[tokenized_texts], min_count=10, threshold=10)
bigram_phraser = Phraser(bigram_model)
trigram_phraser = Phraser(trigram_model)

# Apply phrasers
# The bigram phraser runs first and its output feeds the trigram phraser
flattened_df["phrase_tokens"] = [
    trigram_phraser[bigram_phraser[tokens]] for tokens in tqdm(tokenized_texts, desc="Applying Phrase Detection")
]

# Join tokens back into string format
flattened_df["phrase_text"] = [" ".join(tokens) for tokens in flattened_df["phrase_tokens"]]

# Save final version
# Both the token lists ('phrase_tokens') and the joined text ('phrase_text') are written
flattened_df.to_json("data/wildchat_phrase_detected.jsonl", orient="records", lines=True, force_ascii=False)

In [None]:
# Checking
import itertools
from collections import Counter

# Tally the tokens that contain an underscore across all conversations
all_phrases = list(itertools.chain.from_iterable(flattened_df["phrase_tokens"]))
phrase_counts = Counter(filter(lambda tok: "_" in tok, all_phrases))

# Print the 30 most frequent ones
print("\nTop Phrases:")
top_phrases = phrase_counts.most_common(30)
for phrase, count in top_phrases:
    print(f"{phrase}: {count}")

Text Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
from scipy.sparse import vstack

# Load data
# Only the joined 'phrase_text' column is vectorized
flattened_df = pd.read_json("data/wildchat_phrase_detected.jsonl", lines=True)
conversation = flattened_df["phrase_text"].tolist()

# Init vectorizer
vectorizer = TfidfVectorizer(
    max_features=3000,        # fewer dimensions = faster
    ngram_range=(1, 2),       # unigrams + bigrams only
    lowercase=False,
    max_df=0.95,
    min_df=10,
    norm='l2',
    use_idf=True,
    smooth_idf=True
)

# Fit on full data
# The vocabulary and IDF weights are learned once from every document ...
print("Fitting vectorizer...")
vectorizer.fit(conversation)

# Transform in chunks to reduce memory usage
# ... and then applied to slices of chunk_size documents at a time
chunk_size = 10000
X_parts = []

with tqdm(total=len(conversation), desc="TF-IDF transforming") as pbar:
    for i in range(0, len(conversation), chunk_size):
        chunk = conversation[i:i+chunk_size]
        X_chunk = vectorizer.transform(chunk)
        X_parts.append(X_chunk)
        # Advance by the real slice length; the last chunk may be shorter
        pbar.update(len(chunk))

# Stack the per-chunk matrices row-wise, in document order
X_convo = vstack(X_parts)

# Save vectorizer and matrix
joblib.dump(vectorizer, "data/tfidf_vectorizer_fast.pkl")
joblib.dump(X_convo, "data/tfidf_matrix_fast.pkl")

print("Matrix shape:", X_convo.shape)
print("Vocabulary size:", len(vectorizer.get_feature_names_out()))

In [None]:
# Get feature names and average TF-IDF score
feature_names = np.array(vectorizer.get_feature_names_out())
# mean(axis=0) gives one average per feature; np.asarray(...).ravel() flattens
# it to 1-D whether the result comes back as np.matrix or as a plain ndarray
mean_tfidf_scores = np.asarray(X_convo.mean(axis=0)).ravel()

# Check Top 50
# argsort is ascending, so reverse it before taking the first 50 indices
top_indices = mean_tfidf_scores.argsort()[::-1][:50]
top_features = feature_names[top_indices]
top_scores = mean_tfidf_scores[top_indices]

# Print the result
# The header previously said "Top 20" although 50 rows are selected and printed;
# it now reports the number of rows that actually follow.
print(f"Top {len(top_features)} Most Important TF-IDF Features in conversation:")
for feature, score in zip(top_features, top_scores):
    print(f"{feature:<30} {score:.4f}")