<a href="https://colab.research.google.com/github/kayannet/Yt_Comments_Sentiment_Analysis/blob/main/youtube_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use("ggplot")

import nltk
from nltk.corpus import stopwords
import string

# nltk.download('all')
nltk.download('stopwords')

from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# import torch
!pip install transformers
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
from scipy.special import softmax



In [None]:
import re
import concurrent.futures


In [None]:
class SentimentAnalyzer():
  '''Clean a DataFrame of comments and score each comment with a RoBERTa sentiment model.

  The frame must contain a text column named "Comments". Cleaning adds the
  "Word Count" and "Char Length" columns; scoring adds "Polarity Scores"
  (a dict with the keys roberta_neg / roberta_neu / roberta_pos) and "Emotion"
  (the index of the highest score, so 0, 1 or 2 in that same order).

  Attributes:
    data: the full DataFrame (replaced by a cleaned copy once clean() has run).
    head: first rows of `data`, refreshed whenever `data` changes.
    shape: shape of `data`, refreshed whenever `data` changes.
    sample_data: optional smaller DataFrame; None until a caller assigns one.
  '''

  # Hugging Face checkpoint used for scoring.
  MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

  def __init__(self, data):
    '''Store the DataFrame as-is (no copy is taken here) and snapshot its head and shape.'''
    self.data = data # csv file path turned to df
    self.head = self.data.head()
    self.shape = self.data.shape
    self.sample_data = None
    # Lazily-filled caches so expensive resources are loaded once per instance.
    self._stop_words = None
    self._tokenizer = None
    self._model = None

  def _select_frame(self, sample):
    '''Return `sample_data` when `sample` is truthy, otherwise `data`.

    Raises:
      ValueError: if the sample is requested but `sample_data` was never set.
    '''
    if not sample:
      return self.data
    if self.sample_data is None:
      raise ValueError("sample=True was requested but sample_data has not been set")
    return self.sample_data

  def _refresh_summary(self):
    '''Keep the cached `head` and `shape` in sync with the current `data`.'''
    self.head = self.data.head()
    self.shape = self.data.shape

  def get_head(self):
    '''Return the cached first rows of the full DataFrame.'''
    return self.head

  def get_data(self):
    '''Return the full DataFrame.'''
    return self.data

  def get_sample(self):
    '''Return the sample DataFrame, or None when no sample has been set.'''
    return self.sample_data

  def remove_punctuations(self, text):
    '''Return `text` (coerced to str) with every character of string.punctuation removed.'''
    return str(text).translate(str.maketrans('', '', string.punctuation))

  def remove_stop(self, text):
    '''Return `text` with NLTK English stop words removed, tokens re-joined by single spaces.

    Matching is case-sensitive and works on whitespace-separated tokens.
    '''
    if self._stop_words is None:
      # Load the word list once and keep it as a set: reading it per comment
      # and scanning a list per token is needlessly slow on large frames.
      self._stop_words = frozenset(stopwords.words('english'))
    stop = self._stop_words
    return ' '.join(word for word in text.split() if word not in stop)

  def clean(self, sample = False): # specify if you want to clean whole df, or the sample, by default does whole df
    '''Normalise the "Comments" column and drop unusable rows.

    Steps: drop rows with a missing comment, lowercase, strip punctuation,
    strip stop words, add "Word Count" and "Char Length", then keep only rows
    with at most 250 words and no word of 11 or more characters.

    The result is stored back on `data` (or `sample_data` when `sample` is
    truthy) as a new DataFrame; the frame that was passed in is not modified.

    NOTE(review): the rendered output of this notebook contains tokens such as
    "i39m", which looks like HTML entities (e.g. &#39;) surviving punctuation
    stripping -- consider unescaping the raw text upstream; confirm on raw data.

    Raises:
      ValueError: if `sample` is truthy but no sample data has been set.
    '''
    source = self._select_frame(sample)
    attribute_name = "sample_data" if sample else "data"

    # Copy first so the caller's frame stays untouched, and drop missing
    # comments: str() would otherwise turn them into the literal text "nan".
    df = source.dropna(subset=["Comments"]).copy()
    df["Comments"] = (
      df["Comments"]
      .astype(str)
      .str.lower()
      .apply(self.remove_punctuations)
      .apply(self.remove_stop)
    )
    df['Word Count'] = df['Comments'].str.split().str.len()
    df['Char Length'] = df['Comments'].str.len()

    short_enough = df['Word Count'] <= 250
    has_long_word = df['Comments'].str.contains(r'\b\w{11,}\b')
    # Explicit copy so later column assignments do not hit SettingWithCopyWarning.
    df = df[short_enough & ~has_long_word].copy()

    setattr(self, attribute_name, df)  # Assign the modified DataFrame to the instance attribute
    self._refresh_summary()

  def _load_model(self):
    '''Return (tokenizer, model), loading them from the checkpoint on first use only.'''
    if self._tokenizer is None or self._model is None:
      self._tokenizer = AutoTokenizer.from_pretrained(self.MODEL)
      self._model = AutoModelForSequenceClassification.from_pretrained(self.MODEL)
    return self._tokenizer, self._model

  def polarity_scores_roberta_batch(self, example):
    '''Score a batch of texts.

    Args:
      example: list of strings to score together.

    Returns:
      NumPy array with one row per input text; each row is the softmax of the
      model logits, so it sums to 1.
    '''
    # Local import: the notebook's import cells leave `import torch` commented
    # out, but the "pt" tensors requested below already require torch.
    import torch

    tokenizer, model = self._load_model()
    encoded_texts = tokenizer(example, return_tensors="pt", padding=True, truncation=True, max_length=256)
    with torch.no_grad():  # inference only, so skip building the autograd graph
      outputs = model(**encoded_texts)
    scores = outputs.logits.detach().numpy()
    return softmax(scores, axis=1)

  def run_sentiment_analysis(self, sample = False, batch_size=32):
    '''Score every comment and store the results as new columns.

    Adds "Polarity Scores" and "Emotion" to `data`, or to `sample_data` when
    `sample` is truthy.

    Args:
      sample: score the sample frame instead of the full frame.
      batch_size: number of comments sent to the model per call.

    Raises:
      ValueError: if `batch_size` is not positive, or the sample is requested
        but was never set.
      RuntimeError: if scoring a batch fails; the original error is chained.
        Nothing is written to the frame in that case, so columns never end up
        misaligned with the rows.
    '''
    if batch_size < 1:
      raise ValueError("batch_size must be a positive integer")

    frame = self._select_frame(sample)
    comments = frame['Comments'].tolist()
    scores = []
    emotion = []
    for start in range(0, len(comments), batch_size):
      comments_batch = comments[start:start + batch_size]
      try:
        batch_scores = self.polarity_scores_roberta_batch(comments_batch)
      except Exception as exc:
        stop = start + len(comments_batch) - 1
        raise RuntimeError(
          f"Sentiment scoring failed for rows {start}-{stop}"
        ) from exc
      for score in batch_scores:
        scores.append({
          'roberta_neg': score[0],
          'roberta_neu': score[1],
          'roberta_pos': score[2]
        })
      emotion.extend(np.argmax(batch_scores, axis=1).tolist())

    frame['Polarity Scores'] = scores
    frame['Emotion'] = emotion
    self._refresh_summary()

  def get_sentiment_count(self, sample = False):
    '''Return a DataFrame counting the rows per "Emotion" value.

    Raises:
      ValueError: if `sample` is truthy but no sample data has been set.
    '''
    frame = self._select_frame(sample)
    return pd.DataFrame(frame['Emotion'].value_counts())


In [None]:
# Load the raw comments; SentimentAnalyzer reads the text from the "Comments" column.
df = pd.read_csv('dated_comms_3.csv')
# First rows kept under a separate name; `batch` is not referenced again in the cells shown here.
batch = df.head()
# Preview the raw data (last expression, so the notebook renders it).
df.head()

Unnamed: 0,Youtuber,Comments,Date
0,MrBeast Gaming,Subscribe and you might be able to compete one...,2020-05-20T21:44:48Z
1,MrBeast Gaming,Skeppy: $1000 Minecraft challenges<br>Mr. Beas...,2020-05-20T23:12:40Z
2,MrBeast Gaming,&quot;We need your parents permission before w...,2020-09-04T04:05:50Z
3,MrBeast Gaming,Imagine winning $10k and Mr beast asking for y...,2021-04-01T15:29:04Z
4,MrBeast Gaming,This was probs the closest Karl ever got to wi...,2022-03-27T16:56:24Z


In [None]:
# Wrap the loaded frame; the constructor also snapshots its head and shape.
sentiment = SentimentAnalyzer(df)

In [None]:
# Clean the full frame (sample defaults to False): lowercase, strip punctuation and
# stop words, add "Word Count" / "Char Length", then filter out overly long rows.
sentiment.clean()

In [None]:
# Preview the first rows of the cleaned data.
sentiment.get_head()

Unnamed: 0,Youtuber,Comments,Date,Word Count,Char Length
0,MrBeast Gaming,subscribe might able compete one day,2020-05-20T21:44:48Z,6,36
3,MrBeast Gaming,imagine winning 10k mr beast asking parents pe...,2021-04-01T15:29:04Z,12,79
4,MrBeast Gaming,probs closest karl ever got winning respect,2022-03-27T16:56:24Z,7,43
5,MrBeast Gaming,i’m glad pure videos like still exist youtube ...,2020-07-28T02:41:53Z,19,121
6,MrBeast Gaming,great videos keep great work,2022-02-04T22:06:35Z,5,28


In [None]:
# Score the comments in batches (default batch_size=32); adds the
# "Polarity Scores" and "Emotion" columns to the stored frame.
sentiment.run_sentiment_analysis()

In [None]:
# Display the scored frame; the notebook truncates the rendering of long frames.
sentiment.get_data()

Unnamed: 0,Youtuber,Comments,Date,Word Count,Char Length,Polarity Scores,Emotion
0,MrBeast Gaming,subscribe might able compete one day,2020-05-20T21:44:48Z,6,36,"{'roberta_neg': 0.010299706, 'roberta_neu': 0....",1
3,MrBeast Gaming,imagine winning 10k mr beast asking parents pe...,2021-04-01T15:29:04Z,12,79,"{'roberta_neg': 0.3314133, 'roberta_neu': 0.57...",1
4,MrBeast Gaming,probs closest karl ever got winning respect,2022-03-27T16:56:24Z,7,43,"{'roberta_neg': 0.12939624, 'roberta_neu': 0.7...",1
5,MrBeast Gaming,i’m glad pure videos like still exist youtube ...,2020-07-28T02:41:53Z,19,121,"{'roberta_neg': 0.022105018, 'roberta_neu': 0....",2
6,MrBeast Gaming,great videos keep great work,2022-02-04T22:06:35Z,5,28,"{'roberta_neg': 0.004144532, 'roberta_neu': 0....",2
...,...,...,...,...,...,...,...
9470,KittyKatGaming,suzi keep asking questions answered dialogue y...,2018-05-15T19:07:17Z,8,60,"{'roberta_neg': 0.21682782, 'roberta_neu': 0.7...",1
9471,KittyKatGaming,im sorry really miss old intro,2018-05-15T22:26:46Z,6,30,"{'roberta_neg': 0.6413672, 'roberta_neu': 0.30...",0
9473,KittyKatGaming,omg suzy love grumps i39m soooooooo glad start...,2016-04-07T03:16:53Z,12,73,"{'roberta_neg': 0.0017904623, 'roberta_neu': 0...",2
9474,KittyKatGaming,love suzy she39s adorable,2016-09-29T10:50:44Z,4,25,"{'roberta_neg': 0.0028871982, 'roberta_neu': 0...",2


In [None]:
# Keep a handle to the scored frame for the export cell below.
new_df = sentiment.get_data()
new_df.head()

Unnamed: 0,Youtuber,Comments,Date,Word Count,Char Length,Polarity Scores,Emotion
0,MrBeast Gaming,subscribe might able compete one day,2020-05-20T21:44:48Z,6,36,"{'roberta_neg': 0.010299706, 'roberta_neu': 0....",1
3,MrBeast Gaming,imagine winning 10k mr beast asking parents pe...,2021-04-01T15:29:04Z,12,79,"{'roberta_neg': 0.3314133, 'roberta_neu': 0.57...",1
4,MrBeast Gaming,probs closest karl ever got winning respect,2022-03-27T16:56:24Z,7,43,"{'roberta_neg': 0.12939624, 'roberta_neu': 0.7...",1
5,MrBeast Gaming,i’m glad pure videos like still exist youtube ...,2020-07-28T02:41:53Z,19,121,"{'roberta_neg': 0.022105018, 'roberta_neu': 0....",2
6,MrBeast Gaming,great videos keep great work,2022-02-04T22:06:35Z,5,28,"{'roberta_neg': 0.004144532, 'roberta_neu': 0....",2


In [None]:
from google.colab import files

In [None]:
# Write the scored frame to CSV, then hand the file to google.colab's download helper.
# NOTE(review): to_csv writes the index by default, and the input already carries an
# "Unnamed: 0" column, which suggests an earlier index round-trip -- consider index=False.
new_df.to_csv("yt_emotions_4.csv")
files.download('yt_emotions_4.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>