In [1]:
!pip install pymongo[srv] mastodon.py langdetect

Collecting mastodon.py
  Downloading mastodon_py-2.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymongo[srv]
  Downloading pymongo-4.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting python-magic (from mastodon.py)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting blurhash>=1.1.4 (from mastodon.py)
  Downloading blurhash-1.1.4-py2.py3-none-any.whl.metadata (769 bytes)
Downloading mastodon_py-2.0.1-py3-none-any.whl (108 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.5/108.5 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0

# MongoDB Setup

In [2]:
from pymongo import MongoClient
from google.colab import userdata
# Replace these with your credentials
username = userdata.get('mongodb_username')
password = userdata.get('mongodb_pw')
cluster_url = "cluster0.8ad48r1.mongodb.net"

In [3]:
# Full URI
uri = f"mongodb+srv://{username}:{password}@{cluster_url}/?retryWrites=true&w=majority&appName=Cluster0"

In [4]:
# Connect to MongoDB Atlas
client = MongoClient(uri)

In [5]:
# check available databases
client.list_database_names()

['social_media_analytics', 'admin', 'local']

In [6]:
# Create / select a database and collection
db = client["social_media_analytics"]

In [7]:
# check the available collection
db.list_collection_names()

['mastodon_tags_data',
 'youtube_unique_tag',
 'mastodon_unique_tag',
 'youtube_tags_data',
 'youtube_sentiment_collection',
 'mastodon_sentiment_data']

# Get trending tags at Mastodon

In [8]:
# Text preprocessing function
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Clean and preprocess text for sentiment analysis"""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    filtered = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(filtered)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [9]:
# Initialize Mastodon API
from mastodon import Mastodon
mastodon = Mastodon(
    access_token=userdata.get('mastodon_access_token'),
    api_base_url="https://mastodon.social"
)

In [10]:
import re
from collections import Counter
from langdetect import detect
from datetime import datetime

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

def extract_hashtags(text):
    return re.findall(r"#(\w+)", text)

def fetch_mastodon_trending_tags(db, top_n_tags=10, fallback_post_limit=1000):
    print("📡 Fetching Mastodon trending tags...")

    try:
        tag_counter = Counter()
        seen_tags = set()

        # === Step 1: trending_tags ===
        trends = mastodon.trending_tags()
        print(f"📊 Found {len(trends)} trending tags from Mastodon")

        for trend in trends:
            raw_tag = trend["name"]
            if not is_english(raw_tag):
                continue

            cleaned_tag = preprocess_text(raw_tag)
            if not cleaned_tag or cleaned_tag in seen_tags:
                continue

            score = sum(int(day.get("uses", 0)) for day in trend.get('history', []))

            tag_counter[cleaned_tag] += score
            seen_tags.add(cleaned_tag)

            db.mastodon_tags_data.insert_one({
                "tag_raw": raw_tag,
                "tag_clean": cleaned_tag,
                "url": trend["url"],
                "score": score,
                "fetched_at": datetime.utcnow()
            })

        # === Step 2: Supplement with hashtags from public posts ===
        if len(tag_counter) < top_n_tags:
            print("🪄 Supplementing with hashtags from public posts...")
            public_posts = mastodon.timeline_public(limit=fallback_post_limit)

            for toot in public_posts:
                raw_tags = extract_hashtags(toot["content"])
                for raw_tag in raw_tags:
                    cleaned_tag = preprocess_text(raw_tag)
                    if cleaned_tag and is_english(cleaned_tag) and cleaned_tag not in seen_tags:
                        tag_counter[cleaned_tag] += 1
                        seen_tags.add(cleaned_tag)
                        if len(tag_counter) >= top_n_tags:
                            break
                if len(tag_counter) >= top_n_tags:
                    break

        # === Step 3: Final refill if still under top_n_tags ===
        if len(tag_counter) < top_n_tags:
            print(f"🔁 Still under {top_n_tags} — attempting refill...")
            more_posts = mastodon.timeline_public(limit=300)
            for toot in more_posts:
                raw_tags = extract_hashtags(toot["content"])
                for raw_tag in raw_tags:
                    cleaned_tag = preprocess_text(raw_tag)
                    if cleaned_tag and cleaned_tag not in seen_tags:
                        tag_counter[cleaned_tag] += 1
                        seen_tags.add(cleaned_tag)
                        if len(tag_counter) >= top_n_tags:
                            break
                if len(tag_counter) >= top_n_tags:
                    break

        # === Step 4: Insert top N into MongoDB ===
        final_tags = tag_counter.most_common(top_n_tags)
        print(f"🏷 Final unique tag count: {len(final_tags)}")

        for tag, score in final_tags:
            db.mastodon_unique_tag.insert_one({
                "tag": tag,
                "score": score,
                "fetched_at": datetime.utcnow()
            })

        print(f"✅ Inserted {len(final_tags)} unique trending tags into 'mastodon_unique_tag'")

    except Exception as e:
        print(f"❌ Error: {str(e)}")


In [11]:
fetch_mastodon_trending_tags(db, top_n_tags=10)

📡 Fetching Mastodon trending tags...
📊 Found 10 trending tags from Mastodon
🪄 Supplementing with hashtags from public posts...
🔁 Still under 10 — attempting refill...
🏷 Final unique tag count: 7
✅ Inserted 7 unique trending tags into 'mastodon_unique_tag'


# Get post comments at Mastodon

In [12]:
# Load trending first 10 tags
tags = [doc["tag"] for doc in db.mastodon_unique_tag.find().limit(20)]
# tags = [doc["tag"] for doc in trend_collection.find().sort("created_at", 1).limit(10)]
print("Trending Tags:", tags)

Trending Tags: ['complaintsongsorpoems', 'throwbackthursday', 'fantasticfour', 'thursdayfivelist', 'morecowbell', 'seasideexploration', 'seasideadventure', 'childhoodmemories', 'seagullwatch', 'familyfun', 'aliensongpoemorlimerick', 'screenshotsaturday', 'tbs', 'nowplaying', 'fipgroove', 'yelpreviewsbychildren', 'electricsongs', 'thicktrunktuesday', 'thealarm', 'montreal']


In [13]:
from bs4 import BeautifulSoup
from datetime import datetime
import time
import random

def safe_status_context(post_id, retries=3, delay=2):
    for attempt in range(retries):
        try:
            return mastodon.status_context(post_id)
        except MastodonServiceUnavailableError as e:
            print(f"⚠️ Attempt {attempt+1} failed with 503. Retrying in {delay} seconds...")
            time.sleep(delay + random.uniform(0, 1))
    print(f"❌ Failed to fetch context for post {post_id} after {retries} retries.")
    return {"descendants": []}


for tag in tags:
    print(f"📌 Searching posts for #{tag}")
    posts = mastodon.timeline_hashtag(tag, limit=10)

    for post in posts:
        post_id = post['id']
        post_url = post['url']

        # Fetch replies
        context = safe_status_context(post_id)
        comments = context['descendants']

        for comment in comments:
            comment_text = comment['content']
            created_at = comment['created_at']

            # Strip HTML
            plain_text = BeautifulSoup(comment_text, "html.parser").get_text()

            # Preprocess text (you should define this function)
            cleaned_text = preprocess_text(plain_text)
            if not cleaned_text.strip():
                continue  # Skip empty after cleaning

            # Save to DB
            doc = {
                "tag": f"#{tag}",
                "text": cleaned_text,
                "created_at": created_at,
                "post_url": post_url
            }
            db.mastodon_sentiment_data.insert_one(doc)


📌 Searching posts for #complaintsongsorpoems
📌 Searching posts for #throwbackthursday
📌 Searching posts for #fantasticfour
📌 Searching posts for #thursdayfivelist
📌 Searching posts for #morecowbell
📌 Searching posts for #seasideexploration
📌 Searching posts for #seasideadventure
📌 Searching posts for #childhoodmemories
📌 Searching posts for #seagullwatch
📌 Searching posts for #familyfun
📌 Searching posts for #aliensongpoemorlimerick
📌 Searching posts for #screenshotsaturday
📌 Searching posts for #tbs
📌 Searching posts for #nowplaying
📌 Searching posts for #fipgroove
📌 Searching posts for #yelpreviewsbychildren
📌 Searching posts for #electricsongs
📌 Searching posts for #thicktrunktuesday
📌 Searching posts for #thealarm
📌 Searching posts for #montreal
