In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# ---------- Setup ----------
# Make sure VADER lexicon is available
# Build the shared VADER analyzer once. If construction raises LookupError
# (the lexicon resource is not installed), download it and retry, instead of
# building a throw-away probe instance and then a second real one.
try:
    sia = SentimentIntensityAnalyzer()
except LookupError:
    nltk.download('vader_lexicon')
    sia = SentimentIntensityAnalyzer()

In [2]:
# 2. Load CSV data
# Path is relative to the notebook's working directory.
comments_csv = "n-gram_youtube_comments.csv"
# The pure-Python parser is requested explicitly
# (NOTE(review): presumably for rows the default C parser rejects; confirm).
df = pd.read_csv(comments_csv, engine="python")


In [3]:
# 3. Sentiment analysis
def vader_scores(text: str):
    """Score a single comment with the shared VADER analyzer ``sia``.

    Parameters
    ----------
    text : str
        The comment text. Missing values (``None``/NaN, which is how pandas
        represents empty CSV cells) are scored as the empty string; any other
        non-string value is converted with ``str()`` first.

    Returns
    -------
    dict
        Whatever ``sia.polarity_scores`` returns for the text (the rest of
        the notebook relies on its ``"compound"`` entry).
    """
    if not isinstance(text, str):
        # The analyzer expects text; normalise non-text values up front so a
        # single missing comment does not abort the whole ``.apply``.
        text = "" if pd.isna(text) else str(text)
    return sia.polarity_scores(text)

# Expand the per-comment score dicts into one column per score. Building the
# frame from a list of dicts gives the same columns as `.apply(pd.Series)`
# without constructing a Series object for every row.
scores = pd.DataFrame(df['comment'].apply(vader_scores).tolist(), index=df.index)

# Drop score columns left over from a previous run of this cell before
# attaching the new ones. Without this, re-running the cell duplicates the
# columns and `df['compound']` becomes a DataFrame, breaking later cells.
# NOTE: an input column that happens to share a score name is replaced too.
df = pd.concat(
    [df.drop(columns=list(scores.columns), errors="ignore"), scores],
    axis=1,
)

In [4]:
# Map compound score → sentiment label
def label_from_compound(c):
    """Translate a compound score into a sentiment label.

    Returns "positive" for scores of 0.05 or more, "negative" for scores of
    -0.05 or less, and "neutral" for everything else (including values for
    which both comparisons are false, such as NaN).
    """
    is_positive = c >= 0.05
    is_negative = c <= -0.05
    if is_positive:
        return "positive"
    return "negative" if is_negative else "neutral"

# Label every comment from its compound score (element-wise mapping).
df['sentiment_label'] = df['compound'].map(label_from_compound)


In [None]:
 # Aggregate insights
# Share of each sentiment label as a percentage, rounded to two decimals.
label_share = (
    df["sentiment_label"]
    .value_counts(normalize=True)
    .mul(100)
    .round(2)
)
# Mean compound score over all comments.
avg_compound = df["compound"].mean()

print("=== Aggregate ===", "Distribution (%):", label_share, sep="\n")
print("Average compound:", round(avg_compound, 4))

In [None]:
# Top most positive/negative examples
# Five highest- and five lowest-scoring comments, keeping only the text and
# its compound score.
example_cols = ["comment", "compound"]
top_pos = df[example_cols].nlargest(5, "compound")
top_neg = df[example_cols].nsmallest(5, "compound")

print("\nTop positive examples:", top_pos.to_string(index=False), sep="\n")
print("\nTop negative examples:", top_neg.to_string(index=False), sep="\n")

In [None]:
# Word-level analysis (top unigrams through four-grams per sentiment)
def top_ngrams(texts, ngram_range=(1,1), top_k=20, min_df=2):
    """Return the most frequent n-grams in ``texts``.

    Parameters
    ----------
    texts : iterable of str
        Documents to count n-grams in. Missing entries (None/NaN) are
        skipped.
    ngram_range : tuple of (int, int)
        Passed to ``CountVectorizer`` (e.g. ``(2, 2)`` for bigrams only).
    top_k : int
        Maximum number of rows to return.
    min_df : int or float
        Passed to ``CountVectorizer``. Defaults to 2, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Columns ``"ngram"`` and ``"count"``, highest count first. The frame
        is empty when there are no documents or no n-gram survives the
        vectorizer's filtering.
    """
    empty = pd.DataFrame({"ngram": pd.Series(dtype="object"),
                          "count": pd.Series(dtype="int64")})

    # Drop missing documents; the vectorizer only accepts text.
    docs = pd.Series(texts, dtype="object").dropna().astype(str)
    if docs.empty:
        return empty

    vec = CountVectorizer(stop_words="english", lowercase=True,
                          ngram_range=ngram_range, min_df=min_df)
    try:
        X = vec.fit_transform(docs)
    except ValueError as err:
        # scikit-learn reports "nothing left to count" as a ValueError.
        # Treat only that case as an empty result and re-raise anything else
        # (e.g. invalid parameters).
        msg = str(err)
        if "empty vocabulary" in msg or "no terms remain" in msg:
            return empty
        raise

    freqs = np.asarray(X.sum(axis=0)).ravel()   # total count per n-gram
    vocab = np.array(vec.get_feature_names_out())
    order = freqs.argsort()[::-1][:top_k]       # indices of the largest counts
    return pd.DataFrame({"ngram": vocab[order], "count": freqs[order]})

# Comment text split by sentiment label.
pos_texts = df.loc[df["sentiment_label"]=="positive", "comment"]
neg_texts = df.loc[df["sentiment_label"]=="negative", "comment"]

# Top-20 tables for n = 1..4, one per sentiment.
top_pos_unigrams = top_ngrams(pos_texts, (1,1), top_k=20)
top_neg_unigrams = top_ngrams(neg_texts, (1,1), top_k=20)

top_pos_bigrams  = top_ngrams(pos_texts, (2,2), top_k=20)
top_neg_bigrams  = top_ngrams(neg_texts, (2,2), top_k=20)

top_pos_threegrams  = top_ngrams(pos_texts, (3,3), top_k=20)
top_neg_threegrams  = top_ngrams(neg_texts, (3,3), top_k=20)

top_pos_fourgrams  = top_ngrams(pos_texts, (4,4), top_k=20)
top_neg_fourgrams  = top_ngrams(neg_texts, (4,4), top_k=20)

# Print the heading and the table separately: passing both to one print()
# puts the separator space in front of the table's header row and shifts it
# out of line with the data rows.
print("\nTop positive unigrams:")
print(top_pos_unigrams.to_string(index=False))

In [None]:
# Heading and table are printed separately so print()'s separator space does
# not shift the table's header row out of line with the data rows.
print("\nTop negative unigrams:")
print(top_neg_unigrams.to_string(index=False))

In [None]:
# Heading and table are printed separately so print()'s separator space does
# not shift the table's header row out of line with the data rows.
print("\nTop positive bigrams:")
print(top_pos_bigrams.to_string(index=False))

In [None]:
# Heading and table are printed separately so print()'s separator space does
# not shift the table's header row out of line with the data rows.
print("\nTop negative bigrams:")
print(top_neg_bigrams.to_string(index=False))

In [None]:
# Heading and table are printed separately so print()'s separator space does
# not shift the table's header row out of line with the data rows.
print("\nTop positive threegrams:")
print(top_pos_threegrams.to_string(index=False))

In [None]:
# Heading typo fixed ("threerams" -> "threegrams", matching the positive
# cell). Heading and table are printed separately so print()'s separator
# space does not shift the table's header row out of line.
print("\nTop negative threegrams:")
print(top_neg_threegrams.to_string(index=False))

In [None]:
# Heading typo fixed ("fourrams" -> "fourgrams"). Heading and table are
# printed separately so print()'s separator space does not shift the table's
# header row out of line.
print("\nTop positive fourgrams:")
print(top_pos_fourgrams.to_string(index=False))

In [None]:
# Heading typo fixed ("fourrams" -> "fourgrams"). Heading and table are
# printed separately so print()'s separator space does not shift the table's
# header row out of line.
print("\nTop negative fourgrams:")
print(top_neg_fourgrams.to_string(index=False))

In [None]:
# Confidence & intensity
# (Bucket by compound magnitude)
def intensity_bucket(c):
    """Name the strength of a compound score, ignoring its sign.

    Buckets by absolute value: >= 0.7 "very strong", >= 0.4 "strong",
    >= 0.2 "moderate", > 0 "weak"; anything else is "neutral/zero".
    """
    magnitude = abs(c)
    # Checked from the highest floor down; the first floor reached wins.
    floors = ((0.7, "very strong"), (0.4, "strong"), (0.2, "moderate"))
    for floor, name in floors:
        if magnitude >= floor:
            return name
    return "weak" if magnitude > 0.0 else "neutral/zero"

# Bucket every comment by the magnitude of its compound score.
df["intensity"] = df["compound"].map(intensity_bucket)

# Percentage of comments in each bucket, rounded to two decimals.
intensity_share = df["intensity"].value_counts(normalize=True).mul(100).round(2)
print("\nIntensity distribution (%):")
print(intensity_share)

In [30]:
# Advanced modeling (Topics)
# LDA topics -> then average sentiment per topic
# Build a sparse term matrix for topics
n_topics = 8  # adjust as you like

# Bag-of-words matrix for topic modelling: English stop words removed, and
# terms kept only if they appear in at least 5 comments and in at most half
# of them.
topic_vec = CountVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=5,
    max_df=0.5,
)
X = topic_vec.fit_transform(df["comment"])

# Fixed random_state so the fitted topics are repeatable.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    learning_method="batch",
    random_state=42,
)
topic_dist = lda.fit_transform(X)                # shape: (n_docs, n_topics)
df["topic"] = np.argmax(topic_dist, axis=1)      # dominant topic per comment


In [None]:
# Show top words per topic
# Vocabulary of the topic vectorizer, as an array that can be fancy-indexed.
terms = np.array(topic_vec.get_feature_names_out())
def top_words_for_topic(topic_idx, top_k=12):
    """Return the `top_k` terms with the largest weight in one LDA topic.

    Reads the module-level `lda` (fitted model) and `terms` (vocabulary);
    `topic_idx` selects the row of `lda.components_`.
    """
    weights = lda.components_[topic_idx]
    ranked = np.argsort(weights)[::-1]   # term indices, heaviest first
    return terms[ranked[:top_k]]

print("\n=== Topics & average sentiment ===")
# One row per topic: id, number of comments whose dominant topic it is,
# their mean compound score, and the topic's ten top words.
topic_summary = []
for k in range(n_topics):
    in_topic = df["topic"] == k
    cnt = in_topic.sum()
    avg_c = df.loc[in_topic, "compound"].mean()
    words = ", ".join(top_words_for_topic(k, 10))
    topic_summary.append((k, cnt, round(avg_c, 4), words))

summary_columns = ["topic","n","avg_compound","top_words"]
topic_df = pd.DataFrame(topic_summary, columns=summary_columns)
print(topic_df.to_string(index=False))

In [None]:
# Visualization (Matplotlib)
# 7a) Bar: sentiment label distribution
# Fixed left-to-right order for the bars; labels with no comments show as 0.
label_order = ["negative","neutral","positive"]
label_counts = (
    df["sentiment_label"]
    .value_counts()
    .reindex(label_order, fill_value=0)
)

plt.figure()
ax = label_counts.plot.bar()
ax.set_title("Sentiment distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
# 7b) Histogram: compound score
# Histogram of compound scores over all comments, in 50 bins.
plt.figure()
ax = df["compound"].plot.hist(bins=50)
ax.set_title("Compound score distribution")
ax.set_xlabel("Compound")
ax.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

In [None]:
# 7c) Bar: intensity buckets
# The buckets are ordinal, so plot them from weakest to strongest instead of
# in whatever order their frequencies produce (the sentiment bar chart
# likewise uses a fixed order). Buckets with no comments are shown as 0.
intensity_order = ["neutral/zero", "weak", "moderate", "strong", "very strong"]
intensity_counts = (
    df["intensity"]
    .value_counts()
    .reindex(intensity_order, fill_value=0)
)

plt.figure()
intensity_counts.plot(kind="bar")
plt.title("Sentiment intensity buckets")
plt.xlabel("Intensity")
plt.ylabel("Count")
plt.tight_layout()
plt.show()

In [34]:
# Save results
# Write the annotated frame (all columns added above) without the index.
output_csv = "sentiment_youtube_comments.csv"
df.to_csv(output_csv, index=False)

print("\nSaved: sentiment_youtube_comments.csv")
# Final count of comments per sentiment label.
label_totals = df['sentiment_label'].value_counts()
print(label_totals)


Saved: sentiment_youtube_comments.csv
sentiment_label
neutral     39138
positive    36764
negative    14183
Name: count, dtype: int64
