<a href="https://colab.research.google.com/github/kkrusere/youTube-comments-Analyzer/blob/main/YT_comments_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
import os

#mounting google drive
drive.mount('/content/drive')

########################################

#changing the working directory
os.chdir("/content/drive/MyDrive/EV NLP Data")

!pwd


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/EV NLP Data


## **Building a Deep Learning Sentiment Analysis Model with YouTube Comments**

we are going to do transfer-learning first and see the perfomance of the model

In [2]:
import pandas as pd

import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

import json

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Function to save comments data to a JSON file
def save_comments_to_json(comments, filename = 'youtube_comments.json'):
    with open(filename, 'w') as json_file:
        json.dump(comments, json_file, indent=4)

def load_comments_from_json(filename = 'youtube_comments.json'):
    with open(filename, 'r') as json_file:
        comments = json.load(json_file)
    return comments

In [4]:
comments_data = load_comments_from_json()

In [5]:
# Load the JSON data into a Pandas DataFrame
comments_data_df = pd.DataFrame(comments_data)

comments_data_df.head()


Unnamed: 0,comment_text,like_count,reply_count
0,A major obstacle to EV adoption that is always...,6K,507 replies
1,A major obstacle to EV adoption that is always...,6K,507 replies
2,"Prices are too high, and dealerships keep addi...",3.9K,216 replies
3,The government isn’t fast enough to patch poth...,89,6 replies
4,We have the coldest winters in many years here...,34,1 reply


In [6]:
comments_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16477 entries, 0 to 16476
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   comment_text  15427 non-null  object
 1   like_count    15427 non-null  object
 2   reply_count   4204 non-null   object
dtypes: object(3)
memory usage: 386.3+ KB


In [7]:
comments_data_df['comment_text_cleaned'] = None
comments_data_df.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned
0,A major obstacle to EV adoption that is always...,6K,507 replies,
1,A major obstacle to EV adoption that is always...,6K,507 replies,
2,"Prices are too high, and dealerships keep addi...",3.9K,216 replies,
3,The government isn’t fast enough to patch poth...,89,6 replies,
4,We have the coldest winters in many years here...,34,1 reply,


In [8]:
# Drop null values in the `comment_text ` column
comments_data_df = comments_data_df.dropna(subset=['comment_text'])
# Drop duplicates if there are any in the `comment_text ` column
comments_data_df = comments_data_df.drop_duplicates(subset='comment_text')


# Lowercase the text
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text'].str.lower()
# Remove punctuation
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text_cleaned'].apply(lambda x: ''.join([char for char in x if char not in string.punctuation]))

# Remove stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
comments_data_df['comment_text_cleaned'] = comments_data_df['comment_text_cleaned'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))


#reset dataframe index

comments_data_df.reset_index(inplace=True, drop=True)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
comments_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14340 entries, 0 to 14339
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   comment_text          14340 non-null  object
 1   like_count            14340 non-null  object
 2   reply_count           3906 non-null   object
 3   comment_text_cleaned  14340 non-null  object
dtypes: object(4)
memory usage: 448.2+ KB


In [10]:
comments_data_df.head()

Unnamed: 0,comment_text,like_count,reply_count,comment_text_cleaned
0,A major obstacle to EV adoption that is always...,6K,507 replies,major obstacle ev adoption always overlooked i...
1,"Prices are too high, and dealerships keep addi...",3.9K,216 replies,prices high dealerships keep adding dealership...
2,The government isn’t fast enough to patch poth...,89,6 replies,government isn’t fast enough patch potholes ro...
3,We have the coldest winters in many years here...,34,1 reply,coldest winters many years scandinavia places ...
4,"Im not fond of Tesla, but I do like the fact t...",6K,411 replies,im fond tesla like fact theyve steadily slashi...


In [11]:
!pip install kmodes



In [12]:
from kmodes.kprototypes import KPrototypes
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Vectorize comments
# Convert sparse matrix to DataFrame

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(comments_data_df['comment_text_cleaned'])
comments_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Define the range of cluster numbers to try
# Initialize empty lists to store the silhouette scores and kmodes models
cluster_range = range(2, 10)
silhouettes = []
kproto_models = []
wcss = []  # Within-cluster sum of squares

# Loop through each cluster number
for n_clusters in cluster_range:
    # Create a KPrototypes model with the current cluster number
    kproto = KPrototypes(n_clusters=n_clusters, init='Huang')

    # Fit the model to the data
    kproto.fit(comments_df, categorical=[0])  # Assuming the only categorical feature is 'comment'

    # Assign clusters to the data points
    clusters = kproto.predict(comments_df, categorical=[0])

    # Compute silhouette score
    silhouette_avg = silhouette_score(comments_df, clusters)
    silhouettes.append(silhouette_avg)

    # Compute within-cluster sum of squares (WCSS)
    wcss.append(kproto.cost_)

    # Store the model
    kproto_models.append(kproto)

# Plot the silhouette scores
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(cluster_range, silhouettes, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs. Number of Clusters')

# Plot the within-cluster sum of squares
plt.subplot(1, 2, 2)
plt.plot(cluster_range, wcss, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('WCSS vs. Number of Clusters')

plt.tight_layout()
plt.show()

In [None]:
# Create a dictionary of positive and negative words
positive_words = ["good", "great", "nice", "awesome", "love"]
negative_words = ["bad", "terrible", "awful", "hate", "dislike"]

# Create a function to label the data
def label_data(text):
  for word in positive_words:
    if word in text:
      return "positive"
  for word in negative_words:
    if word in text:
      return "negative"
  return "neutral"

# Apply the function to the comment_text column
df["sentiment"] = df["comment_text"].apply(label_data)

# Print the DataFrame
df
