In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\PMLS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
import spacy
spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
df = pd.read_csv("goemotions.csv")
df = df.rename(columns={"id": "comment_id"})
df = df[df["example_very_unclear"] == False]

# **Add this line to limit training size**
df = df.head(1000)


In [6]:
print(df)

                                                   text comment_id  \
0                                       That game hurt.    eew5j0j   
2        You do right, if you don't care then fuck 'em!    ed2mah1   
3                                    Man I love reddit.    eeibobj   
4     [NAME] was nowhere near them, he was by the Fa...    eda6yn6   
5     Right? Considering it’s such an important docu...    eespn2i   
...                                                 ...        ...   
1009  Probably overdose in a hotel after losing my v...    eeg7349   
1010  You're such a good troll. Good job! Way to rea...    ee3gfo0   
1011                                 Glad you liked it!    eep717n   
1012                                          GG haha 😉    edn2vyv   
1013  I have a boobs but a penis, it’s quite the emb...    eczunpk   

                   author            subreddit    link_id   parent_id  \
0                   Brdd9                  nrl  t3_ajis4z  t1_eew18eq   
2            

In [7]:
# Cell 3: Generate simplified 'labels' as list
emotion_cols = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion",
    "curiosity", "desire", "disappointment", "disapproval", "disgust", "embarrassment",
    "excitement", "fear", "gratitude", "grief", "joy", "love", "nervousness", "optimism",
    "pride", "realization", "relief", "remorse", "sadness", "surprise", "neutral"
]

def make_labels(row):
    return [emo for emo in emotion_cols if row[emo] == 1]

df["labels"] = df.apply(make_labels, axis=1)

# Remove rows where 'labels' contain "neutral"
df = df[~df["labels"].apply(lambda x: "neutral" in x)]

# Optional sentiment column (positive/negative)
pos = {"admiration", "amusement", "approval", "caring", "desire", "excitement", "gratitude", "joy", "love", "optimism", "pride", "relief"}
neg = {"anger", "annoyance", "disappointment", "disapproval", "disgust", "embarrassment", "fear", "grief", "nervousness", "remorse", "sadness"}

def map_sentiment(lbls):
    return "positive" if set(lbls) & pos else "negative"

df["sentiment"] = df["labels"].apply(map_sentiment)

# Drop 'comment_id' and keep only the necessary columns
df = df[["text", "example_very_unclear", "labels", "sentiment"]]


In [8]:
print(df)

                                                   text  example_very_unclear  \
0                                       That game hurt.                 False   
3                                    Man I love reddit.                 False   
5     Right? Considering it’s such an important docu...                 False   
6     He isn't as big, but he's still quite popular....                 False   
7     That's crazy; I went to a super [RELIGION] hig...                 False   
...                                                 ...                   ...   
1009  Probably overdose in a hotel after losing my v...                 False   
1010  You're such a good troll. Good job! Way to rea...                 False   
1011                                 Glad you liked it!                 False   
1012                                          GG haha 😉                 False   
1013  I have a boobs but a penis, it’s quite the emb...                 False   

                   labels s

In [9]:
df = df[["text", "sentiment"]]


In [10]:
print(df)

                                                   text sentiment
0                                       That game hurt.  negative
3                                    Man I love reddit.  positive
5     Right? Considering it’s such an important docu...  positive
6     He isn't as big, but he's still quite popular....  negative
7     That's crazy; I went to a super [RELIGION] hig...  positive
...                                                 ...       ...
1009  Probably overdose in a hotel after losing my v...  positive
1010  You're such a good troll. Good job! Way to rea...  positive
1011                                 Glad you liked it!  positive
1012                                          GG haha 😉  positive
1013  I have a boobs but a penis, it’s quite the emb...  negative

[752 rows x 2 columns]


In [11]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [12]:
def preprocess_text(text):
    """
    Cleans and preprocesses a single text string.
    1. Removes HTML tags
    2. Lowercases text
    3. Removes punctuation and numbers
    4. Tokenizes text
    5. Removes stop words
    6. Lemmatizes words
    """
    text = re.sub(r'<.*?>', '', text)
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization is often better than stemming
    text = ' '.join(tokens)
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    
    return ' '.join(lemmatized_tokens)

In [13]:
import re
from nltk.tokenize import word_tokenize
print("preprocessing all reviews .....")
df['cleaned_text'] = df['text'].apply(preprocess_text)
print("Preprcessing Complete!")
print("\nComparing Orignal vs Cleaned Text:")
display(df[['text','cleaned_text']].head())

preprocessing all reviews .....
Preprcessing Complete!

Comparing Orignal vs Cleaned Text:


Unnamed: 0,text,cleaned_text
0,That game hurt.,game hurt
3,Man I love reddit.,man love reddit
5,Right? Considering it’s such an important docu...,right consider important document know damned ...
6,"He isn't as big, but he's still quite popular....",be not big he s still quite popular I ve hear ...
7,That's crazy; I went to a super [RELIGION] hig...,that s crazy go super religion high school thi...


In [14]:
df = df[["cleaned_text", "sentiment"]]


In [15]:
train, test = train_test_split(df, test_size=0.1, random_state=42, stratify=df["sentiment"])
train.to_csv("goemotions_train.csv", index=False)
test.to_csv("goemotions_test.csv", index=False)

In [16]:
print(df)

                                           cleaned_text sentiment
0                                             game hurt  negative
3                                       man love reddit  positive
5     right consider important document know damned ...  positive
6     be not big he s still quite popular I ve hear ...  negative
7     that s crazy go super religion high school thi...  positive
...                                                 ...       ...
1009  probably overdose hotel lose virginity escort ...  positive
1010  you re good troll good job way really get part...  positive
1011                                          glad like  positive
1012                                            gg haha  positive
1013                    boobs penis quite embarrassment  negative

[752 rows x 2 columns]


In [19]:
# Cell 5: Setup Tokenizer and Model
model_name = "prajjwal1/bert-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_name)
mlb = MultiLabelBinarizer(classes=emotion_cols)

def preprocess(df):
    enc = tokenizer(df["cleaned_text"].tolist(), padding=True, truncation=True, max_length=128)
    sentiment = mlb.fit_transform(df["sentiment"])
    return enc, sentiment

train_enc, train_lbl = preprocess(train)
test_enc, test_lbl = preprocess(test)




In [20]:
# Cell 6: Build Dataset Class
class EmoDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, sentiment):
        self.encodings = encodings
        self.sentiment = sentiment
    def __len__(self): return len(self.sentiment)
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["sentiment"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

train_dataset = EmoDataset(train_enc, train_lbl)
test_dataset = EmoDataset(test_enc, test_lbl)

In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score
import torch

# Updated model initialization for binary classification
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, problem_type="single_label_classification"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./emotions_out",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
    save_total_limit=1,
    logging_steps=100,
    load_best_model_at_end=False,
    dataloader_num_workers=2
)

# Updated compute_metrics for binary classification
def compute_metrics(pred):
    logits, labels = pred
    preds = torch.argmax(torch.tensor(logits), dim=1).numpy()
    true = labels
    micro = f1_score(true, preds, average="micro")
    macro = f1_score(true, preds, average="macro")
    return {"micro_f1": micro, "macro_f1": macro}

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Cell 8: Train and Evaluate
trainer.train()
trainer.evaluate()



In [None]:
# Cell 9: Inference Example
def predict(text):
    enc = tokenizer([text], truncation=True, padding=True, max_length=128, return_tensors="pt")
    outputs = model(**enc)
    probs = torch.sigmoid(outputs.logits)[0].detach().numpy()
    labels_pred = [emotion_cols[i] for i, p in enumerate(probs) if p >= 0.5]
    return labels_pred

print(predict("I really love your help, thanks so much!"))
