In [1]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports
import os
import re
import math
import string
import operator
import pandas as pd
from collections import Counter, defaultdict
import numpy as np

In [4]:
# File paths
DATA_DIR = "/content/drive/MyDrive/266_final_project/data/full_dataset"
EMOTION_FILE = "/content/drive/MyDrive/266_final_project/data/emotions.txt"
OUTPUT_CSV = "/content/drive/MyDrive/266_final_project/emotion_words.csv"

In [5]:
# Text preprocessing
punct_chars = list((set(string.punctuation) | {
    "’", "‘", "–", "—", "~", "|", "“", "”", "…", "'", "`", "_", "“"
}) - set(["#"]))
punct_chars.sort()
punctuation = "".join(punct_chars)
replace = re.compile("[%s]" % re.escape(punctuation))

def CleanText(text):
    if isinstance(text, float):
        return []
    text = text.lower()
    text = re.sub(r"http\S*|\S*\.com\S*|\S*www\S*", " ", text)
    text = re.sub(r"\s@\S+", " ", text)
    text = replace.sub(" ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return [w for w in text.split() if len(w) > 2]

In [6]:
# Agreement and word count helpers
def CheckAgreement(ex, min_agreement, all_emotions, max_agreement=100):
    sum_ratings = ex[all_emotions].sum(axis=0)
    agreement = (sum_ratings >= min_agreement) & (sum_ratings <= max_agreement)
    return ",".join(sum_ratings.index[agreement].tolist())

def GetCounts(df):
    words = []
    for t in df["text"]:
        words.extend(t)
    return Counter(words)

def LogOdds(counts1, counts2, prior, zscore=True):
    sigmasquared = defaultdict(float)
    sigma = defaultdict(float)
    delta = defaultdict(float)
    n1, n2 = sum(counts1.values()), sum(counts2.values())
    nprior = sum(prior.values())

    for word in prior:
        if prior[word] == 0:
            delta[word] = 0
            continue
        l1 = (counts1[word] + prior[word]) / ((n1 + nprior) - (counts1[word] + prior[word]))
        l2 = (counts2[word] + prior[word]) / ((n2 + nprior) - (counts2[word] + prior[word]))
        sigmasquared[word] = 1 / (counts1[word] + prior[word]) + 1 / (counts2[word] + prior[word])
        sigma[word] = math.sqrt(sigmasquared[word])
        delta[word] = math.log(l1) - math.log(l2)
        if zscore:
            delta[word] /= sigma[word]
    return delta

In [7]:
# Load dataset
print("Loading data...")
dfs = [pd.read_csv(os.path.join(DATA_DIR, f)) for f in os.listdir(DATA_DIR) if f.endswith(".csv")]
data = pd.concat(dfs)
print(f"{len(set(data['id']))} unique examples, {len(data)} annotations")

# Load emotions
with open(EMOTION_FILE, "r") as f:
    all_emotions = f.read().splitlines()
print(f"{len(all_emotions)} emotion categories")

# Clean text
print("Processing text...")
data["text"] = data["text"].apply(CleanText)

# Compute agreement labels
agree_dict = data.groupby("id").apply(CheckAgreement, 2, all_emotions).to_dict()
data["agreement"] = data["id"].map(agree_dict)
data = data[~data["agreement"].isnull()]

Loading data...
58011 unique examples, 211225 annotations
28 emotion categories
Processing text...


  agree_dict = data.groupby("id").apply(CheckAgreement, 2, all_emotions).to_dict()


In [8]:
# Extract top words per emotion
dicts = []

for e in all_emotions:
    print(f"Processing emotion: {e}")
    contains = data["agreement"].str.contains(e)
    emotion_words = GetCounts(data[contains])
    other_words = GetCounts(data[~contains])
    prior = Counter(emotion_words) + Counter(other_words)
    emotion_words_total = sum(emotion_words.values())

    delta = LogOdds(emotion_words, other_words, prior, zscore=True)

    c = 0
    for k, v in sorted(delta.items(), key=operator.itemgetter(1), reverse=True):
        if v < 3:
            continue
        dicts.append({
            "emotion": e,
            "word": k,
            "odds": "%.2f" % v,
            "freq": "%.3f" % (emotion_words[k] / emotion_words_total)
        })
        c += 1
        if c < 11:
            print(f"{k} ({v:.2f})")
    print("--------")

Processing emotion: admiration
great (41.80)
awesome (31.18)
amazing (29.65)
good (26.87)
beautiful (22.87)
nice (20.57)
appreciate (17.56)
cute (17.55)
best (17.26)
pretty (13.86)
--------
Processing emotion: amusement
lol (65.47)
haha (32.19)
funny (26.68)
lmao (20.18)
hilarious (18.42)
fun (16.92)
hahaha (15.62)
laugh (14.26)
laughed (12.98)
joke (12.25)
--------
Processing emotion: anger
fuck (23.22)
hate (17.37)
fucking (16.97)
angry (10.82)
dare (10.25)
shut (7.60)
stupid (7.21)
hell (6.67)
idiot (6.37)
asshole (6.14)
--------
Processing emotion: annoyance
annoying (14.32)
stupid (12.72)
fucking (11.77)
shit (9.11)
dumb (8.69)
frustrating (8.06)
idiot (7.88)
annoyed (7.66)
fuck (7.54)
damn (7.38)
--------
Processing emotion: approval
agree (24.39)
not (12.37)
don (11.42)
agreed (11.37)
true (11.16)
yes (11.12)
yeah (8.46)
disagree (8.35)
right (7.73)
doesn (7.13)
--------
Processing emotion: caring
you (11.26)
worry (10.87)
careful (8.91)
stay (8.48)
your (7.99)
bless (7.97)
your

In [9]:
# Save to CSV
emotion_words_df = pd.DataFrame(dicts)
emotion_words_df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")
print(f"\nSaved top emotion words to {OUTPUT_CSV}")


Saved top emotion words to /content/drive/MyDrive/266_final_project/emotion_words.csv
