In [46]:
!pip install vaderSentiment

import pandas as pd
import datetime
from ipywidgets import interact, IntSlider,fixed
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt
import seaborn as sns

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[31mERROR: Operation cancelled by user[0m[31m
[0m

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [47]:
def load_data(path: str) -> pd.DataFrame:
  data = pd.read_csv(path)
  data.dropna(subset=["comment_text"], inplace=True)
  print("Timestamp: ", data["timestamp"])
  data["timestamp"] = pd.to_datetime(data["timestamp"], format="mixed", utc=True)
  return data


In [48]:
def clean_text(text):
  text = text.lower()
  text = re.sub(r"http\S+|@\w+|#\w+|[^a-z\s]", "", text)
  tokens = [word for word in text.split() if word not in stop_words]
  return " ".join(tokens)


In [49]:

analyzer = SentimentIntensityAnalyzer()

def get_sentiment(text):
  sentiment = analyzer.polarity_scores(text)
  return sentiment["compound"]

In [50]:
def extract_topics(texts, n_topics=5, n_words=10):
  vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words="english")
  tfidf = vectorizer.fit_transform(texts)

  model = NMF(n_components=n_topics, random_state=42)
  W = model.fit_transform(tfidf)
  H = model.components_

  topics = []
  feature_names = vectorizer.get_feature_names_out()
  for topic_idx, topic in enumerate(H):
    top_features = [feature_names[i] for i in topic.argsort()[:-n_words-1:-1]]
    topics.append(top_features)
  return topics

In [51]:
def plot_sentiment_trend(df, top_n=20):
    sentiment = df.groupby("media_id")["sentiment"].mean()
    comment_count = df.groupby("media_id")["comment_text"].count()

    summary_df = pd.DataFrame({
        "avg_sentiment": sentiment,
        "comment_count": comment_count
    })

    summary_df = summary_df.sort_values("comment_count", ascending=False).head(top_n)

    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(
        summary_df["avg_sentiment"],
        summary_df["comment_count"],
        s=summary_df["comment_count"] * 5,  # bubble size
        c=summary_df["avg_sentiment"],
        cmap="coolwarm",
        alpha=0.7,
        edgecolors="w"
    )

    for media_id, row in summary_df.iterrows():
        plt.text(row["avg_sentiment"], row["comment_count"] + 1, str(media_id)[-4:], fontsize=8)

    plt.title("Post Sentiment vs. Engagement (Top {} Posts)".format(top_n))
    plt.xlabel("Average Sentiment")
    plt.ylabel("Number of Comments")
    plt.colorbar(scatter, label="Sentiment Score")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [52]:

def main():
    df = load_data("engagements.csv")
    df['cleaned_comment'] = df['comment_text'].apply(clean_text)
    df['sentiment'] = df['cleaned_comment'].apply(get_sentiment)

    print("Top 5 Topics:")
    topics = extract_topics(df['cleaned_comment'])
    for i, topic in enumerate(topics):
        print(f"Topic {i+1}: {', '.join(topic)}")
    interact(plot_sentiment_trend, df=fixed(df), top_n=IntSlider(min=5, max=100, step=5, value=30))

if __name__ == "__main__":
    main()

Timestamp:  0        2025-03-01 00:13:57.153000+00:00
1        2025-03-01 00:23:06.879000+00:00
2        2025-03-01 00:04:05.094000+00:00
3        2025-03-01 00:41:59.467000+00:00
4        2025-03-01 02:21:29.715000+00:00
                       ...               
17836    2025-04-02 18:27:20.065000+00:00
17837    2025-04-02 18:28:41.488000+00:00
17838    2025-04-02 18:31:56.153000+00:00
17839    2025-04-02 18:29:08.782000+00:00
17840    2025-04-02 18:29:59.086000+00:00
Name: timestamp, Length: 17812, dtype: object
Top 5 Topics:
Topic 1: treehut, pr, trent, omg, treehur, dream, treehunt, list, amazing, pls
Topic 2: hut, tree, pr, products, trent, hunt, best, body, amazing, favorite
Topic 3: love, scent, products, stuff, try, good, scrub, scrubs, win, omg
Topic 4: need, try, omg, want, real, scent, im, oh, favorite, like
Topic 5: treehutpr, pleaseee, duhhh, brand, favorite, brands, shower, lol, use, products


interactive(children=(IntSlider(value=30, description='top_n', min=5, step=5), Output()), _dom_classes=('widge…