In [25]:
# Import necessary libraries
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.optim as optim
import string
from imblearn.over_sampling import RandomOverSampler

In [3]:
data=pd.read_csv('D:\projects\Gen_Flask\YoutubeCommentsDataSet.csv')
data.head()

Unnamed: 0,Comment,Sentiment
0,lets not forget that apple pay in 2014 require...,neutral
1,here in nz 50 of retailers don’t even have con...,negative
2,i will forever acknowledge this channel with t...,positive
3,whenever i go to a place that doesn’t take app...,negative
4,apple pay is so convenient secure and easy to ...,positive


In [4]:
print(data.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18408 entries, 0 to 18407
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Comment    18364 non-null  object
 1   Sentiment  18408 non-null  object
dtypes: object(2)
memory usage: 287.8+ KB
None


In [5]:
# Drop rows with missing comments
data = data.dropna(subset=['Comment'])
print("After dropping missing comments:", data.shape)

After dropping missing comments: (18364, 2)


In [6]:
import re

def preprocess_text(text):
    text = text.lower()  # normalize text
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r'\W', ' ', text)  # remove non-word characters
    text = re.sub(r'\s+', ' ', text).strip()  # remove extra spaces
    return text

# Apply preprocessing on the Comment column
data['clean_text'] = data['Comment'].apply(preprocess_text)
print(data[['Comment', 'clean_text']].head())


                                             Comment  \
0  lets not forget that apple pay in 2014 require...   
1  here in nz 50 of retailers don’t even have con...   
2  i will forever acknowledge this channel with t...   
3  whenever i go to a place that doesn’t take app...   
4  apple pay is so convenient secure and easy to ...   

                                          clean_text  
0  lets not forget that apple pay in 2014 require...  
1  here in nz 50 of retailers don t even have con...  
2  i will forever acknowledge this channel with t...  
3  whenever i go to a place that doesn t take app...  
4  apple pay is so convenient secure and easy to ...  


In [7]:
import transformers
import accelerate

print(transformers.__version__)
print(accelerate.__version__)


4.49.0
1.4.0


In [9]:
from transformers import pipeline

# Create the pipeline for emotion detection
emotion_classifier = pipeline("text-classification", model="nateraw/bert-base-uncased-emotion")


pytorch_model.bin:  17%|#6        | 73.4M/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cuda:0


In [10]:
def get_emotion(text):
    try:
        result = emotion_classifier(text)
        # Taking the top prediction's label
        return result[0]['label']
    except Exception as e:
        # If something goes wrong (maybe text is too short or any other issue)
        return "unknown"

# Apply the classifier to our clean text column
data['emotion'] = data['clean_text'].apply(get_emotion)
print(data[['clean_text', 'emotion']].head())

  attn_output = torch.nn.functional.scaled_dot_product_attention(
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Token indices sequence length is longer than the specified maximum sequence length for this model (567 > 512). Running this sequence through the model will result in indexing errors


                                          clean_text emotion
0  lets not forget that apple pay in 2014 require...     joy
1  here in nz 50 of retailers don t even have con...     joy
2  i will forever acknowledge this channel with t...     joy
3  whenever i go to a place that doesn t take app...   anger
4  apple pay is so convenient secure and easy to ...     joy


In [11]:
# Save the updated DataFrame to a new CSV file
data.to_csv("YoutubeCommentsDataSet_with_emotion.csv", index=False)
print("CSV file saved as 'YoutubeCommentsDataSet_with_emotion.csv'")

CSV file saved as 'YoutubeCommentsDataSet_with_emotion.csv'


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
data=pd.read_csv("D:\projects\Gen_Flask\services\YoutubeCommentsDataSet_with_emotion.csv")


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18364 entries, 0 to 18363
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Comment     18364 non-null  object
 1   Sentiment   18364 non-null  object
 2   clean_text  18364 non-null  object
 3   emotion     18364 non-null  object
dtypes: object(4)
memory usage: 574.0+ KB


In [18]:
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['clean_text'])

In [19]:
y = data['emotion']

In [20]:
# 3. Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [26]:
# Use RandomOverSampler to balance the training data
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)


In [27]:
print("Original training set shape:", X_train.shape)
print("Resampled training set shape:", X_train_resampled.shape)

Original training set shape: (14691, 5000)
Resampled training set shape: (75950, 5000)


In [28]:
from sklearn.neural_network import MLPClassifier
# 4. Train the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', 
                    max_iter=1000, random_state=42)
mlp.fit(X_train_resampled, y_train_resampled)



In [29]:
from sklearn.metrics import classification_report, accuracy_score
y_pred = mlp.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.732915872583719
Classification Report:
              precision    recall  f1-score   support

       anger       0.33      0.41      0.36       380
        fear       0.37      0.30      0.33       100
         joy       0.86      0.84      0.85      2766
        love       0.34      0.41      0.37        83
     sadness       0.38      0.37      0.38       228
    surprise       0.60      0.48      0.53       109
     unknown       0.40      0.29      0.33         7

    accuracy                           0.73      3673
   macro avg       0.47      0.44      0.45      3673
weighted avg       0.74      0.73      0.74      3673

