<a href="https://colab.research.google.com/github/jpollard44/BBB_Complaints_LLM/blob/main/BBB_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Code for Data scraper**

In [None]:
import time
import pandas as pd
import random
from bs4 import BeautifulSoup
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

ModuleNotFoundError: No module named 'undetected_chromedriver'

In [None]:
# --- Function to scrape random pages of BBB complaints ---
def scrape_bbb_complaints_random(base_url, min_page=1, max_page=200, num_samples=20, pause=1.0, timeout=10):
    """
    Scrape a random sample of complaint pages from BBB
    across a range of pages to diversify by time.
    """

    # 1) Set up headless Chrome browser using undetected-chromedriver
    options = uc.ChromeOptions()
    options.headless = False  # Set to True if you want headless mode
    options.add_argument("--disable-blink-features=AutomationControlled")  # Avoid bot detection
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/116.0.0.0 Safari/537.36"
    )
    driver = uc.Chrome(options=options)

    # Remove webdriver detection flag from JavaScript environment
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {
            "source": """
                Object.defineProperty(navigator, 'webdriver', {
                  get: () => undefined
                });
            """
        }
    )

    # 2) Randomly choose a set of pages to scrape
    pages_to_scrape = sorted(random.sample(range(min_page, max_page + 1), num_samples))
    print(f"🔀 Randomly selected pages: {pages_to_scrape}")

    all_rows = []  # List to collect complaint data

    for page in pages_to_scrape:
        url = f"{base_url}?page={page}"  # Construct page URL
        print(f"[Page {page}] → {url}")
        driver.get(url)

        try:
            # Wait for complaint elements to load on the page
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "li.card.bpr-complaint-grid"))
            )
        except:
            print(f"⚠️ No complaints found, skipping page {page}")
            continue

        time.sleep(pause)  # Pause to ensure page is fully loaded
        soup = BeautifulSoup(driver.page_source, "html.parser")  # Parse HTML with BeautifulSoup
        cards = soup.select("li.card.bpr-complaint-grid")  # Select complaint cards
        print(f"  • Found {len(cards)} complaint cards")

        # 3) Loop through each complaint card and extract relevant fields
        for card in cards:
            date_filed = card.select_one("p.bpr-complaint-date span")
            complaint_type = card.select_one("div.bpr-complaint-type span")
            text_div = card.select_one("div.bpr-complaint-body > div")
            biz_date = card.select_one("p.bpr-complaint-business-response-date")
            biz_body = card.select_one("div.bpr-complaint-business-response-body")
            cust_date = card.select_one("p.bpr-customer-response-date")
            cust_body = card.select_one("div.bpr-customer-response-body")
            status = card.select_one("div.bpr-complaint-status-summary")

            # Append a dictionary of cleaned values to the output list
            all_rows.append({
                "date_filed":             date_filed.get_text(strip=True)            if date_filed else "",
                "complaint_type":         complaint_type.get_text(strip=True)        if complaint_type else "",
                "complaint_text":         text_div.get_text(" ", strip=True)         if text_div else "",
                "business_response_date": biz_date.get_text(strip=True)              if biz_date else "",
                "business_response":      biz_body.get_text(" ", strip=True)         if biz_body else "",
                "customer_response_date": cust_date.get_text(strip=True)            if cust_date else "",
                "customer_response":      cust_body.get_text(" ", strip=True)        if cust_body else "",
                "status":                 status.get_text(strip=True)               if status else "",
                "page":                   page,  # Include source page number
            })

    driver.quit()  # Close browser when done
    return pd.DataFrame(all_rows)  # Return results as a DataFrame


In [None]:
# --- Main Execution ---
if __name__ == "__main__":
    BASE = "https://www.bbb.org/us/ca/san-jose/profile/payment-processing-services/paypal-inc-1216-210387/complaints"

    # Run the scraper across 300 randomly selected pages (out of 2771)
    df = scrape_bbb_complaints_random(BASE, min_page=1, max_page=2771, num_samples=300)

    print(df.head())  # Print first few rows of the collected DataFrame

    # File path for saving the scraped data
    out_path = (
        "C:/Users/jthom/OneDrive/JOSH WORKING/SCHOOL WORK/"
        "MS Business Analytics/GBA 6410 - Social Media Analytics and Text Mining/"
        "Project/Paypal_bbb_complaints_random.csv"
    )

    # Save the results as a CSV file
    df.to_csv(out_path, index=False)
    print(f"✅ Saved {out_path}")

## **Code for Analysis**

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import seaborn as sns

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import plotly.express as px
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [None]:
#Load dataset
venmo_df = pd.read_csv("/content/Venmo_bbb_complaints_random.csv")
chime_df = pd.read_csv("/content/Chime_bbb_complaints_random.csv")
paypal_df = pd.read_csv("/content/Paypal_bbb_complaints_random.csv")
square_df = pd.read_csv("/content/Square_bbb_complaints_random.csv")

#Add company name to df
venmo_df['Company'] = 'Venmo'
chime_df['Company'] = 'Chime'
paypal_df['Company'] = 'Paypal'
square_df['Company'] = 'Square'

#combine datasets from all companies
df_complete = pd.concat([venmo_df, chime_df, paypal_df, square_df], ignore_index=True)
df_complete.to_csv("/content/Complete_BBB_Corpus.csv")


### Pre-Processing

In [None]:
#text data preprocessing
def preprocessing(data):

    #convert all to string and convert to lowercase
    data['complaint_text'] = data['complaint_text'].fillna('').astype(str).str.lower()

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    clean = []

    for message in data['complaint_text']:

        #remove punctuation
        message = re.sub(r'[^\w\s]', '', message)

        #tokenization
        tokens = word_tokenize(message)

        #Remove Stop words
        filtered = [word for word in tokens if word not in stop_words]

        #Remove lemma
        filtered = [lemmatizer.lemmatize(word) for word in filtered]

        clean.append(filtered)

    return clean

### Vectorization

In [None]:
def vectorization(normalized_data):
    # Join tokens into space-separated strings
    joined_data = [' '.join(tokens) for tokens in normalized_data]

    tv = TfidfVectorizer(min_df=0.10, max_df=0.95, norm='l2',
                         use_idf=True, smooth_idf=True)

    tv_matrix = tv.fit_transform(joined_data) # Fit the vectorizer on the text data and transform it into a TF-IDF matrix
    tv_matrix = tv_matrix.toarray() # Convert the resulting sparse matrix to a dense NumPy array
    vocab = tv.get_feature_names_out() # Extract the feature names (i.e., vocabulary terms)

    # Create a DataFrame from the TF-IDF array for easier manipulation
    vectorized = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

    return vectorized # Return the vectorized text data as a DataFrame

normalized_data = preprocessing(df_complete)
joined_texts = [' '.join(tokens) for tokens in normalized_data]

# Call the function to create the TF-IDF feature matrix
vectorized = vectorization(normalized_data)

In [None]:
# Compute the pairwise cosine similarity between all complaint vectors
cosine_sim_matrix = pd.DataFrame(cosine_similarity(vectorized))

# Extract the underlying NumPy array from the TF-IDF DataFrame
tfidf_array = vectorized.values

# Calculate the pairwise (condensed) Euclidean distances between all complaints
condensed_euclidean_dist = pdist(tfidf_array, metric='euclidean')

# Perform hierarchical clustering using Ward's linkage method
Z = linkage(condensed_euclidean_dist, method='ward')

### Cluster and Vizualization

In [None]:
# --- Function to Perform Clustering and Visualize with 2D PCA ---
def cluster_and_visualize_2D(tfidf_array, Z, num_clusters, company_labels):

    # Step 1: Assign cluster labels
    cluster_labels = fcluster(Z, t=num_clusters, criterion='maxclust')

    # Step 2: PCA (2 Components)
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(tfidf_array)

    # Step 3: Build DataFrame for plotting
    cluster_df = pd.DataFrame({
        'PCA1': reduced_data[:, 0],
        'PCA2': reduced_data[:, 1],
        'Cluster': cluster_labels,
        'Company': company_labels
    })

    # Step 4: 2D Plot — color by cluster, shape by company
    plt.figure(figsize=(12, 8))
    sns.scatterplot(
        data=cluster_df,
        x='PCA1', y='PCA2',
        hue='Cluster',
        style='Company',
        palette='tab10',
        s=70,
        alpha=0.8,
        edgecolor='k'
    )

    plt.title(f'2D PCA Projection of Complaint Clusters (k={num_clusters})')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', title="Cluster / Company")
    plt.tight_layout()
    plt.show()

    return cluster_df

# --- Call the function ---
num_clusters = 4
cluster_results_2d = cluster_and_visualize_2D(tfidf_array, Z, num_clusters, df_complete['Company'])

In [None]:
vectorized['Cluster'] = cluster_results_2d['Cluster'].values  # Add cluster labels to TF-IDF matrix

### n grams

In [None]:
def top_ngrams_by_cluster(texts, cluster_labels, n=2, top_k=10):
    df = pd.DataFrame({'text': texts, 'cluster': cluster_labels})
    cluster_ngrams = {}

    for cluster_id in sorted(df['cluster'].unique()):
        print(f"\n🔍 Top {n}-grams for Cluster {cluster_id}")
        cluster_texts = df[df['cluster'] == cluster_id]['text']

        vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
        X = vectorizer.fit_transform(cluster_texts)
        ngram_freq = X.sum(axis=0)

        ngram_counts = [(ngram, int(ngram_freq[0, idx])) for ngram, idx in vectorizer.vocabulary_.items()]
        top_ngrams = sorted(ngram_counts, key=lambda x: x[1], reverse=True)[:top_k]

        cluster_ngrams[cluster_id] = top_ngrams

        for phrase, count in top_ngrams:
            print(f"{phrase}: {count}")

    return cluster_ngrams


In [None]:
def plot_ngrams_histograms_by_cluster(ngram_data):
    clusters = sorted(ngram_data.keys())
    num_clusters = len(clusters)

    # Setup for a 2x2 grid
    rows, cols = 2, 2
    fig, axes = plt.subplots(rows, cols, figsize=(12, 10), constrained_layout=True)
    axes = axes.flatten()  # make it easy to index

    for i, cluster_id in enumerate(clusters):
        top_ngrams = ngram_data[cluster_id]
        phrases, counts = zip(*top_ngrams)

        axes[i].barh(phrases[::-1], counts[::-1], color='skyblue')
        axes[i].set_title(f"Cluster {cluster_id}")
        axes[i].set_xlabel("Frequency")
        axes[i].set_ylabel("Top N-grams")

    # Hide any unused subplots (if < 4 clusters)
    for j in range(len(clusters), len(axes)):
        axes[j].axis('off')

    plt.suptitle("Top N-grams by Cluster (2x2 Layout)", fontsize=16)
    plt.show()

In [None]:
# Compute and return top n-grams
ngram_data = top_ngrams_by_cluster(joined_texts, vectorized['Cluster'], n=2, top_k=10)

# Plot using already extracted data
plot_ngrams_histograms_by_cluster(ngram_data)


# Code for Severity_Score using OpenAI API (1hour to label 1000)

In [None]:
'''


import pandas as pd
import openai
import os
import time
import uuid
import csv
from google.colab import userdata

# Load API key from Colab Secrets
os.environ["OPENAI_API_KEY"] = userdata.get('JP_OPEN_AI_Key')
api_key = os.getenv("OPENAI_API_KEY")

# Check if API key is loaded
if not api_key:
    print("Error: No API key found. Please set JP_OPEN_AI_Key in Colab Secrets.")
    exit(1)

# Initialize Open AI client
client = openai.OpenAI(api_key=api_key)

# Read the dataset with robust parsing
try:
    df = pd.read_csv("Complete_BBB_Corpus.csv", quoting=csv.QUOTE_ALL, engine='python', on_bad_lines='skip')
except FileNotFoundError:
    print("Error: Input file 'Complete_BBB_Corpus.csv' not found in the same directory.")
    exit(1)
except Exception as e:
    print(f"Error reading CSV: {e}")
    exit(1)

# Sample 1000 records or all if fewer than 1000
sample_size = min(1000, len(df))
try:
    sampled_df = df.sample(n=sample_size, random_state=42)
except ValueError as e:
    print(f"Error sampling data: {e}")
    exit(1)

# Function to generate sensitivity score using Open AI API
def get_sensitivity_score(complaint_text):
    if not isinstance(complaint_text, str):
        return None, "Error: Complaint text is missing or invalid."

    prompt = f"""
    You are a fraud and customer complaint analyst.
    Evaluate the following customer complaint and assign a severity score between 0.0 and 1.0, reflecting the overall seriousness and business impact of the issue. This score should guide how urgently a company should respond or escalate the complaint.
    Consider the following six factors:
    Financial impact (e.g., significant losses, refunds, overcharges)
    Emotional sensitivity (e.g., tone, distress, hardship expressed)
    Fraud or security concerns (e.g., unauthorized transactions, identity theft)
    Urgency (e.g., impact on essentials like rent, food, time-sensitive problems)
    Customer service breakdown (e.g., repeated failures, no resolution offered)
    Tone or language (e.g., legal threats, accusations, emotionally charged language)
    Score guidelines:
    0.0–0.2: Minor — low impact, no urgency, easily resolvable
    0.3–0.5: Moderate — needs attention but not urgent or damaging
    0.6–0.8: High — serious issue, business should respond quickly
    0.9–1.0: Critical — urgent, high-risk to customer or company (legal, financial, PR)
    Provide the severity score as a number and a brief rationale (1-3 sentences) explaining your reason for given that score.
    Complaint text: "{complaint_text}"
    Response format:
    Score: <number>
    Rationale: <explanation>
    """
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are a fraud and customer complaint analyst."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=150,
            temperature=0.5
        )
        result = response.choices[0].message.content.strip()
        score_line = result.split("\n")[0]
        rationale_line = result.split("\n")[1]
        score = float(score_line.replace("Score: ", ""))
        rationale = rationale_line.replace("Rationale: ", "")
        return score, rationale
    except Exception as e:
        print(f"Error processing complaint: {e}")
        return None, f"API error: {str(e)}"

# Process sampled complaints and collect results
results = []
for index, row in sampled_df.iterrows():
    complaint_text = row['complaint_text']
    row_id = index
    complaint_id = row.get('complaint_id', str(uuid.uuid4()))

    print(f"Processing row_id: {row_id}, complaint_id: {complaint_id}")
    score, rationale = get_sensitivity_score(complaint_text)

    results.append({
        'row_id': row_id,
        'complaint_id': complaint_id,
        'sensitivity_score': score,
        'rationale': rationale
    })

    time.sleep(1)  # Avoid rate limiting

# Create DataFrame from results
results_df = pd.DataFrame(results)

# Save to CSV
try:
    results_df.to_csv("openai_sensitivity_scores.csv", index=False)
    print(f"Output saved to 'openai_sensitivity_scores.csv' with {len(results_df)} records")
except Exception as e:
    print(f"Error saving output: {e}")


'''

# Modeling

## Random Forrest

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text cleaning function
def clean_text(text):
    if pd.isna(text):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = nltk.word_tokenize(text)  # Tokenize
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
    return ' '.join(lemmatized)

# Load and merge files for training
def load_and_merge_files(file1_path, file2_path, id_column='id', score_column='sensitivity_score'):
    try:
        df1 = pd.read_csv(file1_path, quoting=csv.QUOTE_ALL, engine='python', on_bad_lines='skip')
        df2 = pd.read_csv(file2_path, quoting=csv.QUOTE_ALL, engine='python', on_bad_lines='skip')
    except FileNotFoundError as e:
        print(f"Error: File not found - {e}")
        exit(1)
    except Exception as e:
        print(f"Error reading CSV: {e}")
        exit(1)

    # Merge on the specified ID column
    merged_df = pd.merge(df1, df2, on=id_column, how='inner')

    # Map score to label
    def map_score_to_label(score):
        if 0.0 <= score <= 0.2:
            return 'Minor'
        elif 0.3 <= score <= 0.6:
            return 'Moderate'
        elif 0.7 <= score <= 0.8:
            return 'High'
        elif 0.9 <= score <= 1.0:
            return 'Critical'
        else:
            return 'Invalid'

    merged_df['sensitivity_label'] = merged_df[score_column].apply(map_score_to_label)
    return merged_df, df1  # Return merged training data and full file_1

# Train Random Forest model
def train_random_forest_model(df, text_column='complaint_text', label_column='sensitivity_label'):
    df = df.dropna(subset=[text_column, label_column])
    df[text_column] = df[text_column].apply(clean_text)

    # Remove labels with fewer than 2 samples
    label_counts = df[label_column].value_counts()
    valid_labels = label_counts[label_counts >= 2].index
    df = df[df[label_column].isin(valid_labels)]

    print("Filtered label counts:\n", df[label_column].value_counts())

    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_column], df[label_column],
        test_size=0.2,
        random_state=42,
        stratify=df[label_column]
    )

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Random Forest
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train_vec, y_train)

    # Evaluate
    y_pred = model.predict(X_test_vec)
    print("Classification Report on Training Data:\n")
    print(classification_report(y_test, y_pred))

    return model, vectorizer

# Predict labels on full dataset
def predict_labels(full_df, model, vectorizer, text_column='complaint_text'):
    if text_column not in full_df.columns:
        print(f"Error: Column '{text_column}' not found in the dataset.")
        return None

    full_df['cleaned_text'] = full_df[text_column].apply(clean_text)
    X_full = vectorizer.transform(full_df['cleaned_text'])
    full_df['predicted_sensitivity_label'] = model.predict(X_full)
    return full_df

# File paths
file_1 = "Complete_BBB_Corpus.csv"  # Full dataset
file_2 = "openai_sensitivity_scores2.csv"  # Training data with sensitivity scores

# Load and merge for training, get full file_1
train_df, full_df = load_and_merge_files(file_1, file_2)

# Train the model
model, vectorizer = train_random_forest_model(train_df)

# Save model and vectorizer
joblib.dump(model, 'rf_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
print("Model and vectorizer saved as 'rf_model.pkl' and 'tfidf_vectorizer.pkl'")

# Predict on full file_1 dataset
result_full_df = predict_labels(full_df, model, vectorizer)

# Save or display results
if result_full_df is not None:
    output_path = "predicted_full_complaints.csv"
    result_full_df.to_csv(output_path, index=False)
    print(f"Predictions saved to {output_path}")
    print("\nSample of predictions:")
    print(result_full_df[['complaint_text', 'predicted_sensitivity_label']].head())
else:
    print("Failed to generate predictions.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Filtered label counts:
 sensitivity_label
High        490
Critical    396
Moderate    113
Name: count, dtype: int64
Classification Report on Training Data:

              precision    recall  f1-score   support

    Critical       0.68      0.57      0.62        79
        High       0.59      0.80      0.68        98
    Moderate       1.00      0.04      0.08        23

    accuracy                           0.62       200
   macro avg       0.76      0.47      0.46       200
weighted avg       0.67      0.62      0.59       200

Model and vectorizer saved as 'rf_model.pkl' and 'tfidf_vectorizer.pkl'
Predictions saved to predicted_full_complaints.csv

Sample of predictions:
                                      complaint_text  \
0                                  Suspended account   
1  Subject: Urgent: Fraudulent Activity on My Ven...   
2  I contacted Venmo / Paypal customer support de...   
3  I sold some test strips to Two Mom's Buy Test ...   
4  I just found out my card was tak

## Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

def train_logistic_regression_model(df, text_column='complaint_text', label_column='sensitivity_label'):
    # Drop missing
    df = df.dropna(subset=[text_column, label_column])

    # Clean the text
    df[text_column] = df[text_column].apply(clean_text)

    # Remove labels with fewer than 2 samples
    label_counts = df[label_column].value_counts()
    valid_labels = label_counts[label_counts >= 2].index
    df = df[df[label_column].isin(valid_labels)]

    # Report final label distribution
    print("Filtered label counts:\n", df[label_column].value_counts())

    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_column], df[label_column],
        test_size=0.2,
        random_state=42,
        stratify=df[label_column]
    )

    # TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Logistic Regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(X_train_vec, y_train)

    # Evaluate
    y_pred = model.predict(X_test_vec)
    print(classification_report(y_test, y_pred))

    return model, vectorizer


logreg_model, logreg_vectorizer = train_logistic_regression_model(result_df)


Filtered label counts:
 sensitivity_label
High        490
Critical    396
Moderate    113
Name: count, dtype: int64
              precision    recall  f1-score   support

    Critical       0.59      0.52      0.55        79
        High       0.55      0.73      0.63        98
    Moderate       0.00      0.00      0.00        23

    accuracy                           0.56       200
   macro avg       0.38      0.42      0.39       200
weighted avg       0.50      0.56      0.53       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Bert

In [None]:
# pip install --upgrade transformers
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from datasets import Dataset, DatasetDict

def train_bert_model(df, text_column='complaint_text', label_column='sensitivity_label'):
    # Drop missing values
    df = df.dropna(subset=[text_column, label_column])

    # Filter labels with at least 2 samples
    df = df[df[label_column].map(df[label_column].value_counts()) >= 2]

    # Encode labels
    le = LabelEncoder()
    df['label'] = le.fit_transform(df[label_column])

    # Stratified split
    from sklearn.model_selection import train_test_split
    train_df, test_df = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['label']
    )

    # Convert to Hugging Face Dataset
    train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
    test_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))
    dataset = DatasetDict({"train": train_dataset, "test": test_dataset})

    # Load tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # Tokenization function
    def tokenize_function(example):
        return tokenizer(
            example[text_column],
            padding='max_length',
            truncation=True,
            max_length=256
        )

    # Apply tokenization
    dataset = dataset.map(tokenize_function, batched=True)

    # Rename label column for Hugging Face Trainer
    dataset = dataset.rename_column("label", "labels")
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # Load model
    model = BertForSequenceClassification.from_pretrained(
        'bert-base-uncased',
        num_labels=len(le.classes_)
    )

    # Training arguments
    training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,
    num_train_epochs=4,
    logging_dir="./logs"
    )



    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test']
    )

    # Train
    trainer.train()

    # Evaluate
    predictions = trainer.predict(dataset['test'])
    y_pred = predictions.predictions.argmax(axis=1)
    y_true = predictions.label_ids

    print(classification_report(y_true, y_pred, target_names=le.classes_))

    return model, tokenizer, le



In [None]:
bert_model, bert_tokenizer, label_encoder = train_bert_model(result_df)

## Validation Examples

Critical Example - Complaint #48:

on november 17th 2023 i was scammed using venmo  within a minute i realized i was scammed called my bank to stop payment and called venmo to stop payment   they all have record of my call   venmos policy states that this was a friend to friend exchange and they continued to front the 75000    i contacted venmo and followed their directions to file a police report which i did   since then venmo will not close my account as they say i have a negative balance   when someone accidently venmos me money it is just taken off the debt they say i owe   i am only asking venmo to close my account so this doesnt happen any more to my friends and family that think they can pay me by sending money via venmo    i did not lose any of my money as the bank stopped payment on 111723   however i will continue to lose money if they wont close my account

Low Example - Complaint #9
i have been emailing venmo to unfreeze my account they said theyre going to hold my money for 180 days i just want to close that account and get my money back im really frustrated

# Heat Map of Severity Score

In [None]:
import pandas as pd
import csv
import seaborn as sns
import matplotlib.pyplot as plt

# Load the merged_severity.csv file
try:
    df = pd.read_csv("merged_severity.csv")
except FileNotFoundError:
    print("Error: Input file 'merged_severity.csv' not found.")
    exit(1)
except Exception as e:
    print(f"Error reading CSV: {e}")
    exit(1)


# Define a function to create a pivot table with optional percentage calculation
def create_pivot_table(df, index_col, column_col, value_col='id', as_percentage=False):
    if index_col not in df.columns or column_col not in df.columns:
        print(f"Error: One or both of {index_col}, {column_col} not found in DataFrame.")
        return None

    # Create the pivot table with counts
    pivot = df.groupby([index_col, column_col]).size().unstack(fill_value=0)

    if as_percentage:
        # Calculate percentages across the column_col
        pivot = pivot.div(pivot.sum(axis=0), axis=1) * 100
        pivot = pivot.round(2)  # Round to 2 decimal places
    else:
        print(f"\nCount of {index_col} per {column_col}:")

    return pivot

# Define a function to calculate percentages from a pivoted DataFrame (for already pivoted data)
def calculate_percentages(df, index_col='sensitivity_label'):
    # Handle case where index is unnamed
    index_name = df.index.name if df.index.name is not None else None
    if index_name != index_col and index_col in df.columns:
        df = df.set_index(index_col)
    elif index_name != index_col and index_col not in df.columns:
        print(f"Error: {index_col} not found in DataFrame.")
        return None

    # Calculate percentages across each company
    percentage_table = df.div(df.sum(axis=0), axis=1) * 100
    percentage_table = percentage_table.round(2)  # Round to 2 decimal places

    return percentage_table



# Generate the percentage pivot table for sensitivity_label by Company
pivot_table = create_pivot_table(df, index_col='sensitivity_label', column_col='Company', as_percentage=True)

# Create a heatmap using seaborn
if pivot_table is not None:
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        pivot_table,
        annot=True,  # Show percentage values in cells
        fmt='.2f',   # Format numbers to 2 decimal places
        cmap='YlOrRd',  # Color scheme (Yellow-Orange-Red)
        cbar_kws={'label': 'Percentage (%)'}
    )
    plt.title('Percentage of Sensitivity Labels by Company')
    plt.xlabel('Company')
    plt.ylabel('Sensitivity Label')

    # Save the heatmap to a file
    # plt.savefig('heatmap.png', dpi=300, bbox_inches='tight')
    # print("\nHeatmap saved as 'heatmap.png'")

    # Display the plot (optional, depending on environment)
    plt.show()

# Critical Complaints Analysis

### BERTopic for Identifying Department

In [None]:
# !pip install bertopic
# 1. Install necessary packages if not already installed
# !pip install bertopic sentence-transformers openpyxl nltk

# 2. Import libraries
import pandas as pd
import re
import nltk
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# 3. Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

# 4. Load your Excel file
df = pd.read_csv("merged_severity.csv")

# 5. Filter to only include 'Critical' complaints (sensitivity_score ≥ 0.9)
critical_df = df[df['sensitivity_score'] >= 0.9].copy()

# 6. Drop missing or empty complaints
critical_df = critical_df.dropna(subset=['complaint_text'])
critical_df = critical_df[critical_df['complaint_text'].str.strip() != ""]

# 7. Define preprocessing function with keyword removal
def preprocess_text(text):
    # Lowercase
    text = text.lower()

    # Remove target keywords
    keywords_to_remove = ["paypal", "chime", "venmo", "square", "cashapp"]
    for keyword in keywords_to_remove:
        text = text.replace(keyword, "")

    # Remove punctuation and numbers
    text = re.sub(r"[^a-z\s]", "", text)

    # Tokenize
    words = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]

    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    # Rejoin into string
    return " ".join(words)

# 8. Apply preprocessing
critical_df["cleaned_complaint"] = critical_df["complaint_text"].apply(preprocess_text)

# 9. Convert text column to list
complaints = critical_df["cleaned_complaint"].tolist()

# 10. Load sentence embedding model
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# 11. Initialize BERTopic
topic_model = BERTopic(embedding_model=embedding_model, verbose=True)

# 12. Fit the model to the complaints
topics, probs = topic_model.fit_transform(complaints)

# Replace your model with the reduced version
topic_model = topic_model.reduce_topics(complaints, nr_topics=10)

# 13. Add results back to your DataFrame
critical_df["topic"] = topics
critical_df["probability"] = probs

# 14. Save clustered results (optional)
critical_df.to_excel("critical_complaints_clustered.xlsx", index=False)

# 15. View top topics
print(topic_model.get_topic_info().head())


# 13. Visualize topics (in notebook)
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
# 16. Visualize topics (in notebook)
topic_model.visualize_topics()

In [None]:
# See top keywords for each topic
for topic_id in critical_df['topic'].unique():
    print(f"\nTopic {topic_id}:")
    print(topic_model.get_topic(topic_id))


### Label Mapping

In [None]:
# 1. Define topic label mapping
topic_labels = {
    -1: "General Complaints / Miscellaneous",
     0: "Account Closures & Withheld Funds",
     1: "Disputed Transactions & Denied Claims",
     2: "Held Business Funds & Delayed Releases",
     3: "Hacked Accounts & Stolen Funds",
     4: "Identity Verification & Account Setup Issues",
     5: "Scams Involving Delivery or Sellers",
     6: "Refund Disputes & Fraud Claims"
}

# 2. Apply the mapping to your BERTopic-labeled DataFrame
critical_df["topic_label"] = critical_df["topic"].map(topic_labels)

# 3. Preview the mapping (optional)
print(critical_df[["topic", "topic_label"]].drop_duplicates().sort_values("topic"))

# 4. (Optional) Save updated file
critical_df.to_excel("critical_complaints_labeled.xlsx", index=False)


In [None]:
topic_counts = critical_df["topic_label"].value_counts()
print("\nComplaint counts by topic:")
print(topic_counts)
