## Data Pre-Processing

### Colab Mount

In [None]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/'Colab_Notebooks'/LLM-Project-DS/
!ls

In [None]:
# !pip install datasets # Colab needs this...

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

### Etc.

In [53]:
# Imports
import os
import pandas as pd
import datasets
from datasets import load_dataset

## YELP Original Dataset

In [54]:
dataset = load_dataset('yelp_review_full')

dataset # View

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [55]:
# Inspect
dataset['train'][0]

{'label': 4,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank."}

In [56]:
# Check Features
dataset['train'].features

{'label': ClassLabel(names=['1 star', '2 star', '3 stars', '4 stars', '5 stars'], id=None),
 'text': Value(dtype='string', id=None)}

In [57]:
train = pd.DataFrame(dataset['train'])
test = pd.DataFrame(dataset['test'])

display(train.head()), display(test.head())

Unnamed: 0,label,text
0,4,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...
4,0,I don't know what Dr. Goldberg was like before...


Unnamed: 0,label,text
0,0,I got 'new' tires from them and within two wee...
1,0,Don't waste your time. We had two different p...
2,0,All I can say is the worst! We were the only 2...
3,0,I have been to this restaurant twice and was d...
4,0,Food was NOT GOOD at all! My husband & I ate h...


(None, None)

### EDA

In [58]:
# Shape
train.shape, test.shape

((650000, 2), (50000, 2))

In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650000 entries, 0 to 649999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   650000 non-null  int64 
 1   text    650000 non-null  object
dtypes: int64(1), object(1)
memory usage: 9.9+ MB


In [60]:
# Check for missing values
print(train.isnull().sum())

label    0
text     0
dtype: int64


In [61]:
# Check class distribution
print(train['label'].value_counts(normalize=True))

label
4    0.2
1    0.2
3    0.2
0    0.2
2    0.2
Name: proportion, dtype: float64


### Text Preprocessing

In [69]:
# Imports
import re
import string
import nltk

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [70]:
# Downloads
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kadm2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\kadm2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [71]:
# Define punctuation to keep
keep_punctuation = {".", ",", "!", "?", "'"}

# Cleaning function
def clean_text(review):
    review = review.lower()  # Lowercase

    # Remove unwanted characters (keep only letters, numbers, and whitelisted punctuation)
    cleaned_text = "".join(char if char.isalnum() or char in keep_punctuation or char.isspace() else " " for char in review)

    # Remove extra spaces (from removed characters)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()

    return cleaned_text

In [72]:
#train.drop(columns=['clean_text'], inplace=True)
#test.drop(columns=['clean_text'], inplace=True)

#train.shape, test.shape

> I'm not removing punctuations or stop words as it may affect word meanings, and therefore, the sentiment.

In [73]:
# Apply to train & test data
train['clean_text'] = train['text'].apply(clean_text)
test['clean_text'] = test['text'].apply(clean_text)

train.head()

Unnamed: 0,label,text,clean_text
0,4,dr. goldberg offers everything i look for in a...,dr. goldberg offers everything i look for in a...
1,1,"Unfortunately, the frustration of being Dr. Go...","unfortunately, the frustration of being dr. go..."
2,3,Been going to Dr. Goldberg for over 10 years. ...,been going to dr. goldberg for over 10 years. ...
3,3,Got a letter in the mail last week that said D...,got a letter in the mail last week that said d...
4,0,I don't know what Dr. Goldberg was like before...,i don't know what dr. goldberg was like before...


#### Saving

In [None]:
# Create the directory if it doesn't exist
output_path = "../data/pre-processed/"
if not os.path.exists(output_path):
    os.makedirs(output_path)

In [75]:
# Save train and test... we're moving to another PC...
train.to_csv(output_path + 'train.csv', index=False)
test.to_csv(output_path + 'test.csv', index=False)

if output_path + 'train.csv':
    print("Train data saved successfully.")
if output_path + 'test.csv':
    print("Test data saved successfully.")

Train data saved successfully.
Test data saved successfully.


## Loading the Model and Tokenizer for Classification

> We'll tokenize the cleaned text to prepare it for input into SamLowe's `roberta-base-go-emotions` model.

In [89]:
# Load Data
train = pd.read_csv(output_path + 'train.csv')
test = pd.read_csv(output_path + 'test.csv')

train.shape, test.shape

((650000, 3), (50000, 3))

#### Cleaning...

In [90]:
train['clean_text'].apply(lambda x: type(x)).value_counts(), test['clean_text'].apply(lambda x: type(x)).value_counts()

(clean_text
 <class 'str'>      649987
 <class 'float'>        13
 Name: count, dtype: int64,
 clean_text
 <class 'str'>    50000
 Name: count, dtype: int64)

In [91]:
weird_ones = train[train['clean_text'].apply(lambda x: type(x) != str)]

weird_ones

Unnamed: 0,label,text,clean_text
59007,0,:),
78867,0,:(,
146853,1,:/,
161719,1,: /,
193761,4,:-),
196630,0,: (,
342769,3,----------------------,
342771,3,------------,
366520,0,$$$,
375693,0,_,


In [92]:
# Drop weird ones
drop_indices = weird_ones.index

train.drop(index=drop_indices, inplace=True)
train['clean_text'].apply(lambda x: type(x)).value_counts()

clean_text
<class 'str'>    649987
Name: count, dtype: int64

#### Initializing the Model

In [93]:
# Imports
import time
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [94]:
# Clear CUDA cache
torch.cuda.empty_cache()

In [95]:
# Load SamLowe's GoEmotions model
model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [96]:
# Move the model to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # I did install cuda for this
model.to(device) 
model.eval()  # Set to evaluation mode

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

#### Tokenize

In [97]:
# Tokenize the text
def tokenize_text(text):
    return tokenizer(text, 
                     padding="max_length", 
                     truncation=True, 
                     max_length=64, return_tensors="pt")

# Example check
example = train['clean_text'].iloc[0]
tokenized_example = tokenize_text(example)

print("Original Text:", example)
print("Tokenized Output:", tokenized_example)

Original Text: dr. goldberg offers everything i look for in a general practitioner. he's nice and easy to talk to without being patronizing he's always on time in seeing his patients he's affiliated with a top notch hospital nyu which my parents have explained to me is very important in case something happens and you need surgery and you can get referrals to see specialists without having to see him first. really, what more do you need? i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
Tokenized Output: {'input_ids': tensor([[    0, 10232,     4,  1637,  2865,  1523,   960,   939,   356,    13,
            11,    10,   937, 29764,     4,    37,    18,  2579,     8,  1365,
             7,  1067,     7,   396,   145, 18528,  2787,    37,    18,   460,
            15,    86,    11,  1782,    39,  1484,    37,    18, 13778,    19,
            10,   299, 16046,  1098,   295, 29159,    61,   127,  1041,    33,
          2002,     7,   162,  

In [98]:
# Print the current GPU being used
current_device = torch.cuda.current_device()
print(f"Using GPU: {current_device} - {torch.cuda.get_device_name(current_device)}")

Using GPU: 0 - NVIDIA GeForce RTX 3060


In [99]:
# Load label names
emotion_labels = ["admiration", "amusement", "anger", "annoyance",
                  "approval", "caring", "confusion", "curiosity", 
                  "desire", "disappointment", "disapproval", "disgust",
                  "embarrassment", "excitement", "fear", "gratitude",
                  "grief", "joy", "love", "nervousness", "optimism",
                  "pride", "realization", "relief", "remorse",
                  "sadness", "surprise", "neutral"]

In [100]:
# Function to get top 3 emotions
def get_emotion_label(text):
    # Tokenize text
    tokens = tokenizer(text,
                       padding=True,
                       truncation=True,
                       max_length=128,
                       return_tensors="pt").to(model.device)

    # Run inference
    with torch.no_grad():
        outputs = model(**tokens)

    # Get top 3 predicted labels
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)  # Convert logits to probabilities
    top3_indices = torch.argsort(probabilities, descending=True)[0][:3]  # Get top 3 indices
    
    # Convert indices to emotion labels
    top3_emotions = [emotion_labels[i] for i in top3_indices]
    
    # Return as string...
    return ", ".join(top3_emotions)

#### Benchmark a Small Batch (Before Processing Everything)

In [101]:
import time
from torch.cuda.amp import autocast

# Select a batch size
BATCH_SIZE = 32

# Sample 1,000 reviews
sample_texts = train["text"][:1000].tolist()

# Tokenize & measure time
start_time = time.time()
inputs = tokenizer(sample_texts, 
                   padding=True, 
                   truncation=True, 
                   max_length=128, 
                   return_tensors="pt").to(device)

with torch.no_grad():
    with autocast():
        outputs = model(**inputs)

end_time = time.time()

# Estimate full processing time
time_per_1000 = end_time - start_time
estimated_full_time = (700_000 / 1000) * time_per_1000

# Estimate full processing time
print(f"Time for 1,000 reviews: {time_per_1000:.2f} sec")
print(f"Estimated full runtime: {estimated_full_time / 3600:.2f} hours")

# Monitor VRAM usage
print(torch.cuda.memory_allocated(device) / 1e9, "GB allocated")
print(torch.cuda.memory_reserved(device) / 1e9, "GB reserved")

Time for 1,000 reviews: 0.54 sec
Estimated full runtime: 0.11 hours
1.01037056 GB allocated
5.425332224 GB reserved


### Running the full inference...

In [102]:
get_emotion_label(sample_texts[0])

'neutral, curiosity, approval'

In [104]:
# Loop through the train data
for i, row in train.iterrows():
  train.at[i, "emotions"] = get_emotion_label(row["clean_text"])

In [105]:
display(train.head())

Unnamed: 0,label,text,clean_text,emotions
0,4,dr. goldberg offers everything i look for in a...,dr. goldberg offers everything i look for in a...,"curiosity, neutral, confusion"
1,1,"Unfortunately, the frustration of being Dr. Go...","unfortunately, the frustration of being dr. go...","disappointment, remorse, sadness"
2,3,Been going to Dr. Goldberg for over 10 years. ...,been going to dr. goldberg for over 10 years. ...,"admiration, neutral, approval"
3,3,Got a letter in the mail last week that said D...,got a letter in the mail last week that said d...,"neutral, disappointment, annoyance"
4,0,I don't know what Dr. Goldberg was like before...,i don't know what dr. goldberg was like before...,"annoyance, disapproval, neutral"


In [106]:
# Loop through the test data
for i, row in test.iterrows():
  test.at[i, "emotions"] = get_emotion_label(row["clean_text"])

In [107]:
display(test.head())

Unnamed: 0,label,text,clean_text,emotions
0,0,I got 'new' tires from them and within two wee...,i got 'new' tires from them and within two wee...,"neutral, confusion, disapproval"
1,0,Don't waste your time. We had two different p...,don't waste your time. we had two different pe...,"neutral, annoyance, disapproval"
2,0,All I can say is the worst! We were the only 2...,all i can say is the worst! we were the only 2...,"disgust, annoyance, neutral"
3,0,I have been to this restaurant twice and was d...,i have been to this restaurant twice and was d...,"disappointment, annoyance, disapproval"
4,0,Food was NOT GOOD at all! My husband & I ate h...,food was not good at all! my husband i ate her...,"disapproval, neutral, disappointment"


In [118]:
train.shape, test.shape

((649987, 4), (50000, 4))

In [119]:
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 649987 entries, 0 to 649999
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   label       649987 non-null  int64 
 1   text        649987 non-null  object
 2   clean_text  649987 non-null  object
 3   emotions    649987 non-null  object
dtypes: int64(1), object(3)
memory usage: 40.9+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       50000 non-null  int64 
 1   text        50000 non-null  object
 2   clean_text  50000 non-null  object
 3   emotions    50000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


(None, None)

#### Saving

In [128]:
# Create the directory if it doesn't exist
output_data = "../data/emotions/"
if not os.path.exists(output_data):
    os.makedirs(output_data)

In [None]:
# Save train and test... we're moving to another PC...
train.to_csv(output_data + 'train_classified.csv', index=False)
test.to_csv(output_data + 'test_classified.csv', index=False)

if output_data + 'train_classified.csv':
    print("Train data with emotions classified saved successfully.")
if output_data + 'test_classified.csv':
    print("Test data with emotions classified saved successfully.")

## Prepping for API Calls

In [132]:
train_e = pd.read_csv(output_data + 'train_classified.csv')
test_e = pd.read_csv(output_data + 'test_classified.csv')

train_e.shape, test_e.shape

((649987, 4), (50000, 4))

### Prep & Inspection

In [184]:
train.head()

Unnamed: 0,label,text,clean_text,emotions
0,4,dr. goldberg offers everything i look for in a...,dr. goldberg offers everything i look for in a...,"curiosity, neutral, confusion"
1,1,"Unfortunately, the frustration of being Dr. Go...","unfortunately, the frustration of being dr. go...","disappointment, remorse, sadness"
2,3,Been going to Dr. Goldberg for over 10 years. ...,been going to dr. goldberg for over 10 years. ...,"admiration, neutral, approval"
3,3,Got a letter in the mail last week that said D...,got a letter in the mail last week that said d...,"neutral, disappointment, annoyance"
4,0,I don't know what Dr. Goldberg was like before...,i don't know what dr. goldberg was like before...,"annoyance, disapproval, neutral"


In [185]:
display(train_e.head()), display(test_e.head())

Unnamed: 0,label,text,clean_text,emotions
0,4,dr. goldberg offers everything i look for in a...,dr. goldberg offers everything i look for in a...,"curiosity, neutral, confusion"
1,1,"Unfortunately, the frustration of being Dr. Go...","unfortunately, the frustration of being dr. go...","disappointment, remorse, sadness"
2,3,Been going to Dr. Goldberg for over 10 years. ...,been going to dr. goldberg for over 10 years. ...,"admiration, neutral, approval"
3,3,Got a letter in the mail last week that said D...,got a letter in the mail last week that said d...,"neutral, disappointment, annoyance"
4,0,I don't know what Dr. Goldberg was like before...,i don't know what dr. goldberg was like before...,"annoyance, disapproval, neutral"


Unnamed: 0,label,text,clean_text,emotions
0,0,I got 'new' tires from them and within two wee...,i got 'new' tires from them and within two wee...,"neutral, confusion, disapproval"
1,0,Don't waste your time. We had two different p...,don't waste your time. we had two different pe...,"neutral, annoyance, disapproval"
2,0,All I can say is the worst! We were the only 2...,all i can say is the worst! we were the only 2...,"disgust, annoyance, neutral"
3,0,I have been to this restaurant twice and was d...,i have been to this restaurant twice and was d...,"disappointment, annoyance, disapproval"
4,0,Food was NOT GOOD at all! My husband & I ate h...,food was not good at all! my husband i ate her...,"disapproval, neutral, disappointment"


(None, None)

In [186]:
train_e.info(), test_e.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 649987 entries, 0 to 649986
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   label       649987 non-null  int64 
 1   text        649987 non-null  object
 2   clean_text  649987 non-null  object
 3   emotions    649987 non-null  object
dtypes: int64(1), object(3)
memory usage: 19.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   label       50000 non-null  int64 
 1   text        50000 non-null  object
 2   clean_text  50000 non-null  object
 3   emotions    50000 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.5+ MB


(None, None)

In [187]:
# Check dtypes...
for col in train_e:
  print('===\n', train_e[col].apply(lambda x: type(x)).value_counts())

print('\n\n')

for col in test_e:
  print('===\n', test_e[col].apply(lambda x: type(x)).value_counts())

===
 label
<class 'int'>    649987
Name: count, dtype: int64
===
 text
<class 'str'>    649987
Name: count, dtype: int64
===
 clean_text
<class 'str'>    649987
Name: count, dtype: int64
===
 emotions
<class 'str'>    649987
Name: count, dtype: int64



===
 label
<class 'int'>    50000
Name: count, dtype: int64
===
 text
<class 'str'>    50000
Name: count, dtype: int64
===
 clean_text
<class 'str'>    50000
Name: count, dtype: int64
===
 emotions
<class 'str'>    50000
Name: count, dtype: int64


In [188]:
train_e_prep = train_e.copy()
test_e_prep = test_e.copy()

In [189]:
# Map labels as 1-5
train_e_prep['label'] = train_e_prep['label'].map({0:1, 1:2, 2:3, 3:4, 4:5})
test_e_prep['label'] = test_e_prep['label'].map({0:1, 1:2, 2:3, 3:4, 4:5})

# Input column
train_e_prep['input'] = train_e['clean_text'] + '\nEmotions conveyed: ' + train_e['emotions'] + '\nUser left a ' + train_e_prep['label'].astype(str) + '/5 star review.'
test_e_prep['input'] = test_e['clean_text'] + '\nEmotions conveyed: ' + test_e['emotions'] + '\nUser left a ' + test_e_prep['label'].astype(str) + '/5 star review.'

In [190]:
train_e_prep['label'].isnull().sum(), test_e_prep['label'].isnull().sum()

(0, 0)

In [191]:
# Check
print(train_e_prep['input'][0])
print(test_e_prep['input'][0])

dr. goldberg offers everything i look for in a general practitioner. he's nice and easy to talk to without being patronizing he's always on time in seeing his patients he's affiliated with a top notch hospital nyu which my parents have explained to me is very important in case something happens and you need surgery and you can get referrals to see specialists without having to see him first. really, what more do you need? i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
Emotions conveyed: curiosity, neutral, confusion
User left a 5/5 star review.
i got 'new' tires from them and within two weeks got a flat. i took my car to a local mechanic to see if i could get the hole patched, but they said the reason i had a flat was because the previous patch had blown wait, what? i just got the tire and never needed to have it patched? this was supposed to be a new tire. ni took the tire over to flynn's and they told me that someone punctured my 

In [192]:
# Drop clean_text, emotions
train_e_prep.drop(columns=['label', 'clean_text', 'emotions'], inplace=True)
test_e_prep.drop(columns=['label', 'clean_text', 'emotions'], inplace=True)

train_e_prep.shape, test_e_prep.shape

((649987, 2), (50000, 2))

### Saving

In [193]:
# Create the directory if it doesn't exist
output_prep = "../data/input/"
if not os.path.exists(output_prep):
    os.makedirs(output_prep)

In [194]:
# Save train and test... we're moving to another PC...
train_e_prep.to_csv(output_prep + 'train_input.csv', index=False)
test_e_prep.to_csv(output_prep + 'test_input.csv', index=False)

if output_prep + 'train_input.csv':
    print("Train data as input saved successfully.")
if output_prep + 'test_input.csv':
    print("Test data as input saved successfully.")

Train data as input saved successfully.
Test data as input saved successfully.
