<a href="https://colab.research.google.com/github/juliocesaria/emotion-detection-nlp/blob/main/02_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Preprocessing – GoEmotions

This notebook handles text preprocessing for the GoEmotions dataset.
Steps include:
- Text cleaning
- Lowercasing
- Removing punctuation
- Preparing data for modeling


In [16]:
# Emit a fixed marker line so the cell output shows the notebook has started running.
print("Preprocessing notebook initialized")


Preprocessing notebook initialized


In [17]:
!pip -q install datasets pandas

import re
import pandas as pd
from datasets import load_dataset


In [18]:
# Fetch the GoEmotions dataset and materialise each split as a pandas DataFrame.
dataset = load_dataset("go_emotions")
train_df, val_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

# Report (rows, columns) for every split, then preview the training frame.
print(train_df.shape, val_df.shape, test_df.shape)
train_df.head()


(43410, 3) (5426, 3) (5427, 3)


Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj


In [19]:
def clean_text(text: str) -> str:
    """Normalise a raw comment into lowercase, letters-and-apostrophes text.

    The steps run in this order:

    1. lowercase the input;
    2. map typographic apostrophes (U+2018, U+2019) to the ASCII apostrophe;
    3. remove links (``http...`` / ``www....``), ``@mentions`` and ``#hashtags``;
    4. replace every character other than ``a-z``, whitespace and ``'`` with a space;
    5. collapse runs of whitespace and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string. It is empty when nothing survives the filters.
    """
    text = text.lower()
    # Without this step the letters-only filter below blanks curly apostrophes,
    # splitting contractions ("don t") while the ASCII form ("don't") is kept.
    text = text.translate({0x2018: "'", 0x2019: "'"})
    text = re.sub(r"http\S+|www\.\S+", "", text)     # links
    text = re.sub(r"@\w+", "", text)                # mentions
    text = re.sub(r"#\w+", "", text)                # hashtags
    text = re.sub(r"[^a-z\s']", " ", text)          # keep letters + apostrophe
    text = re.sub(r"\s+", " ", text).strip()        # extra spaces
    return text


In [20]:
# Add the cleaned text and its character length to each split, in place.
for df in (train_df, val_df, test_df):
    cleaned = df["text"].apply(clean_text)
    df["text_clean"] = cleaned
    df["len_clean"] = cleaned.apply(len)

# Preview raw vs. cleaned text side by side for the training split.
preview_columns = ["text", "text_clean", "len_clean", "labels"]
train_df[preview_columns].head()


Unnamed: 0,text,text_clean,len_clean,labels
0,My favourite food is anything I didn't have to...,my favourite food is anything i didn't have to...,58,[27]
1,"Now if he does off himself, everyone will thin...",now if he does off himself everyone will think...,111,[27]
2,WHY THE FUCK IS BAYLESS ISOING,why the fuck is bayless isoing,30,[2]
3,To make her feel threatened,to make her feel threatened,27,[14]
4,Dirty Southern Wankers,dirty southern wankers,22,[3]


In [21]:
# Drop training rows whose cleaned text is empty and report how many were removed.
# NOTE(review): only the training split is filtered here; val/test keep any
# empty "text_clean" rows - confirm this is intended.
before = train_df.shape[0]
nonempty_mask = train_df["text_clean"].str.len().gt(0)
train_df = train_df.loc[nonempty_mask].reset_index(drop=True)
after = train_df.shape[0]

print("Removed empty rows:", before - after)


Removed empty rows: 8


In [22]:
import os

# Make sure the output directory exists; an existing directory is not an error.
os.makedirs("data/processed", exist_ok=True)

# Write each split to its own CSV, leaving out the DataFrame index.
for frame, csv_path in (
    (train_df, "data/processed/train_clean.csv"),
    (val_df, "data/processed/val_clean.csv"),
    (test_df, "data/processed/test_clean.csv"),
):
    frame.to_csv(csv_path, index=False)

# List the directory contents as a quick confirmation of what was written.
print("Saved:", os.listdir("data/processed"))


Saved: ['val_clean.csv', 'train_clean.csv', 'test_clean.csv']
