<a href="https://colab.research.google.com/github/kkrusere/youTube-comments-Analyzer/blob/main/YT_comments_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os

#mounting google drive
drive.mount('/content/drive')

########################################

#changing the working directory
os.chdir("/content/drive/MyDrive/EV NLP Data")

!pwd


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/EV NLP Data


## **Building a Deep Learning Sentiment Analysis Model with YouTube Comments**

we are going to do transfer-learning first and see the perfomance of the model

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

import warnings
warnings.filterwarnings("ignore")

import json

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Function to save comments data to a JSON file
def save_comments_to_json(comments, filename = 'youtube_comments.json'):
    with open(filename, 'w') as json_file:
        json.dump(comments, json_file, indent=4)

def load_comments_from_json(filename = 'youtube_comments.json'):
    with open(filename, 'r') as json_file:
        comments = json.load(json_file)
    return comments

In [4]:
comments_data = load_comments_from_json()

In [5]:
# Load the JSON data into a Pandas DataFrame
comments_data_df = pd.DataFrame(comments_data)

comments_data_df.head()


Unnamed: 0,comment_text,like_count,reply_count
0,A major obstacle to EV adoption that is always...,6K,507 replies
1,A major obstacle to EV adoption that is always...,6K,507 replies
2,"Prices are too high, and dealerships keep addi...",3.9K,216 replies
3,The government isn’t fast enough to patch poth...,89,6 replies
4,We have the coldest winters in many years here...,34,1 reply


In [6]:
comments_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16477 entries, 0 to 16476
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_text  15427 non-null  object
 1   like_count    15427 non-null  object
 2   reply_count   4204 non-null   object
dtypes: object(3)
memory usage: 386.3+ KB


In [7]:
comments_data_df['comment_text_cleaned'] = None
comments_data_df.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned
0,A major obstacle to EV adoption that is always...,6K,507 replies,
1,A major obstacle to EV adoption that is always...,6K,507 replies,
2,"Prices are too high, and dealerships keep addi...",3.9K,216 replies,
3,The government isn’t fast enough to patch poth...,89,6 replies,
4,We have the coldest winters in many years here...,34,1 reply,


In [8]:
# Drop null values in the `comment_text ` column
comments_data_df = comments_data_df.dropna(subset=['comment_text'])
# Drop duplicates if there are any in the `comment_text ` column
comments_data_df = comments_data_df.drop_duplicates(subset='comment_text')


# Lowercase the text
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text'].str.lower()
# Remove punctuation
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text_cleaned'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

# Remove stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text_cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


#reset dataframe index

comments_data_df.reset_index(inplace=True, drop=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
comments_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14340 entries, 0 to 14339
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   comment_text          14340 non-null  object
 1   like_count            14340 non-null  object
 2   reply_count           3906 non-null   object
 3   comment_text_cleaned  14340 non-null  object
dtypes: object(4)
memory usage: 448.2+ KB


In [10]:
comments_data_df.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned
0,A major obstacle to EV adoption that is always...,6K,507 replies,major obstacle ev adoption always overlooked i...
1,"Prices are too high, and dealerships keep addi...",3.9K,216 replies,prices high dealerships keep adding dealership...
2,The government isn’t fast enough to patch poth...,89,6 replies,government isn’t fast enough patch potholes ro...
3,We have the coldest winters in many years here...,34,1 reply,coldest winters many years scandinavia places ...
4,"Im not fond of Tesla, but I do like the fact t...",6K,411 replies,im fond tesla like fact theyve steadily slashi...


In [11]:
random_comments = comments_data_df.sample(n=1000)
random_comments = random_comments.reset_index(drop=True)
random_comments.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned
0,I think one the biggest problems dealership ar...,21.0,5 replies,think one biggest problems dealership problems...
1,EVs like all appliances have their pros and co...,1.0,,evs like appliances pros cons amazing short ti...
2,Love this information. Thank you for posting,,,love information thank posting
3,Hybrids are the way to go for now. Hydrogen is...,,,hybrids way go hydrogen future opinion scrap y...
4,Well better than porsche i dont think so.,,,well better porsche dont think


In [12]:
# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Apply stemming to the comments
random_comments['clean_comment_stemmed'] = random_comments['comment_text'].apply(lambda x: [stemmer.stem(word) for word in x.split()])
random_comments['clean_comment_stemmed_str'] = random_comments['clean_comment_stemmed'].apply(' '.join)

In [13]:
# Initialize the WordNet Lemmatizer
lemmatizer = WordNetLemmatizer()
# Apply lemmatization to the comments
random_comments['clean_comment_lemmatized'] = random_comments['comment_text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x.split()])
random_comments['clean_comment_lemmatized_str'] = random_comments['clean_comment_lemmatized'].apply(' '.join)



In [14]:
random_comments.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned,clean_comment_stemmed,clean_comment_stemmed_str,clean_comment_lemmatized,clean_comment_lemmatized_str
0,I think one the biggest problems dealership ar...,21.0,5 replies,think one biggest problems dealership problems...,"[i, think, one, the, biggest, problem, dealers...",i think one the biggest problem dealership are...,"[I, think, one, the, biggest, problem, dealers...",I think one the biggest problem dealership are...
1,EVs like all appliances have their pros and co...,1.0,,evs like appliances pros cons amazing short ti...,"[ev, like, all, applianc, have, their, pro, an...",ev like all applianc have their pro and cons. ...,"[EVs, like, all, appliance, have, their, pro, ...",EVs like all appliance have their pro and cons...
2,Love this information. Thank you for posting,,,love information thank posting,"[love, thi, information., thank, you, for, post]",love thi information. thank you for post,"[Love, this, information., Thank, you, for, po...",Love this information. Thank you for posting
3,Hybrids are the way to go for now. Hydrogen is...,,,hybrids way go hydrogen future opinion scrap y...,"[hybrid, are, the, way, to, go, for, now., hyd...",hybrid are the way to go for now. hydrogen is ...,"[Hybrids, are, the, way, to, go, for, now., Hy...",Hybrids are the way to go for now. Hydrogen is...
4,Well better than porsche i dont think so.,,,well better porsche dont think,"[well, better, than, porsch, i, dont, think, so.]",well better than porsch i dont think so.,"[Well, better, than, porsche, i, dont, think, ...",Well better than porsche i dont think so.


In [15]:
!pip install kmodes



In [16]:
from kmodes.kmodes import KModes
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
def plot_silhouette_scores(cluster_range, silhouettes):
  # Plot the silhouette scores
  plt.figure(figsize=(12, 4))

  plt.plot(cluster_range, silhouettes, marker='o')
  plt.xlabel('Number of Clusters')
  plt.ylabel('Silhouette Score')
  plt.title('Silhouette Score vs. Number of Clusters')

def plot_WCSS(cluster_range, wcss):
  # Plot the within-cluster sum of squares
  plt.figure(figsize=(12, 4))
  plt.plot(cluster_range, wcss, marker='o')
  plt.xlabel('Number of Clusters')
  plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
  plt.title('WCSS vs. Number of Clusters')

  # Finding the elbow point
  x = list(cluster_range)
  y = wcss
  k = 3
  y_eps = 0.01
  elbow_point = None
  for i in range(1, len(y) - k):
      prev_slope = (y[i] - y[i - 1]) / (x[i] - x[i - 1])
      next_slope = (y[i + k] - y[i + k - 1]) / (x[i + k] - x[i + k - 1])
      if abs(next_slope - prev_slope) / abs(prev_slope) < y_eps:
          elbow_point = (x[i], y[i])
          break

  if elbow_point:
      plt.plot(elbow_point[0], elbow_point[1], marker='o', markersize=8, color='red', label='Elbow Point')
      plt.legend()

  plt.show()


In [18]:
from joblib import Parallel, delayed

def kmodes_clustering(data, cluster_range):
    # Vectorize comments
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    X = X.toarray()

    # Initialize empty lists to store the silhouette scores and kmodes models
    silhouettes = []
    kmodes_models = []
    wcss = []

    def fit_kmodes(n_clusters):
        kmodes = KModes(n_clusters=n_clusters, init='Huang', n_init=5, verbose=1)
        kmodes.fit(X)
        wcss.append(kmodes.cost_)
        kmodes_models.append(kmodes)
        labels = kmodes.labels_
        silhouette_avg = silhouette_score(X, labels)
        silhouettes.append(silhouette_avg)

    Parallel(n_jobs= 5)(delayed(fit_kmodes)(n_clusters) for n_clusters in cluster_range)

    return silhouettes, wcss


In [None]:
cluster_range = range(2, 10)

silhouettes, wcss = kmodes_clustering(random_comments['comment_text_cleaned'], cluster_range)


In [None]:
plot_silhouette_scores(cluster_range, silhouettes)
plot_WCSS(cluster_range, wcss)

In [None]:
cluster_range = range(2, 10)

silhouettes, wcss = kmodes_clustering(comments_data_df['comment_text_stemmed'], cluster_range)

In [None]:
plot_silhouette_scores(cluster_range, silhouettes)
plot_WCSS(cluster_range, wcss)

In [None]:
cluster_range = range(2, 10)

silhouettes, wcss = kmodes_clustering(comments_data_df['comment_text_lemmatized'], cluster_range)

In [None]:
plot_silhouette_scores(cluster_range, silhouettes)
plot_WCSS(cluster_range, wcss)