In [1]:
!pip install --upgrade google-api-python-client pymongo[srv] langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pymongo[srv]
  Downloading pymongo-4.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
[0mCollecting dnspython<3.0.0,>=1.16.0 (from pymongo[srv])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymongo-4.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... 

# MongoDB Setup

In [2]:
from pymongo import MongoClient
from google.colab import userdata
# Replace these with your credentials
username = userdata.get('mongodb_username')
password = userdata.get('mongodb_pw')
cluster_url = "cluster0.8ad48r1.mongodb.net"

In [3]:
# Full URI
uri = f"mongodb+srv://{username}:{password}@{cluster_url}/?retryWrites=true&w=majority&appName=Cluster0"

In [4]:
# Connect to MongoDB Atlas
client = MongoClient(uri)

In [5]:
# check available databases
client.list_database_names()

['social_media_analytics', 'admin', 'local']

In [6]:
# Create / select a database and collection
db = client["social_media_analytics"]

In [7]:
# Two collections
youtube_sentiment_collection = db["youtube_comments"]
youtube_trend_collection = db["youtube_tags_data"]
youtube_unique_tag_collection = db["youtube_unique_tag"]

In [8]:
# check the available collection
db.list_collection_names()

['mastodon_tags_data',
 'youtube_unique_tag',
 'mastodon_unique_tag',
 'youtube_tags_data',
 'youtube_sentiment_collection',
 'mastodon_sentiment_data']

# Get trending tags and comments at Youtube

In [9]:
# import libraries
from googleapiclient.discovery import build
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from datetime import datetime
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
from google.colab import userdata

# Setup API
YOUTUBE_API_KEY = userdata.get('YOUTUBE_API_KEY')
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

In [11]:
# Text Preprocessing Function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^\w\s]', '', text)

    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words and len(word) > 2]

    return ' '.join(filtered_tokens)

##  Fetch Trending Videos

In [12]:
from collections import Counter
from langdetect import detect
from datetime import datetime

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False  # In case of error or very short/non-text input

def fetch_youtube_trending_tags(db, region='US', max_results=25, top_n_tags=10):
    trending_videos = youtube.videos().list(
        part="snippet",
        chart="mostPopular",
        regionCode=region,
        maxResults=max_results
    ).execute()

    tag_counter = Counter()

    for video in trending_videos["items"]:
        video_id = video["id"]
        snippet = video["snippet"]
        tags = snippet.get("tags", [])
        title = snippet["title"]
        published_at = snippet["publishedAt"]

        # Save video and its tags
        db.youtube_tags_data.insert_one({
            "video_id": video_id,
            "title": title,
            "tags": tags,
            "published_at": published_at
        })

        # Count only English tags
        for tag in tags:
            tag = tag.lower()
            if is_english(tag):
                tag_counter[tag] += 1

    # Get top N trending English tags
    top_tags = tag_counter.most_common(top_n_tags)

    for tag, _ in top_tags:
        db.youtube_unique_tag.insert_one({
            "tag": tag,
            "fetched_at": datetime.utcnow()
        })

    print(f"✅ Inserted top {top_n_tags} *English* trending tags into 'youtube_unique_tag'")


In [13]:
fetch_youtube_trending_tags(db, region="US", max_results=25)

✅ Inserted top 10 *English* trending tags into 'youtube_unique_tag'


## Fetch Comments for Trending Videos

In [14]:
def fetch_comments_for_all_tagged_videos(db, min_comments=10, max_videos_per_tag=1):
    tag_cursor = db.youtube_unique_tag.find()

    for tag_doc in tag_cursor:
        tag = tag_doc["tag"]
        print(f"\n🔍 Searching videos for YouTube tag: #{tag}")

        try:
            # Search videos using the tag
            search_results = youtube.search().list(
                q=tag,
                part="snippet",
                type="video",
                maxResults=max_videos_per_tag
            ).execute()

            if not search_results["items"]:
                print("❌ No videos found for this tag.")
                continue

            for video in search_results["items"]:
                video_id = video["id"]["videoId"]
                title = video["snippet"]["title"]
                print(f"🎥 Selected video: {title} (ID: {video_id})")

                # Fetch comments
                comments_collected = 0
                next_page_token = None

                while comments_collected < min_comments:
                    response = youtube.commentThreads().list(
                        part="snippet",
                        videoId=video_id,
                        textFormat="plainText",
                        maxResults=100,
                        pageToken=next_page_token
                    ).execute()

                    items = response.get("items", [])
                    if not items:
                        break

                    for item in items:
                        if comments_collected >= min_comments:
                            break

                        comment_info = item["snippet"]["topLevelComment"]["snippet"]
                        comment_text = comment_info["textDisplay"]
                        author = comment_info["authorDisplayName"]
                        published_at = comment_info["publishedAt"]

                        cleaned_text = preprocess_text(comment_text)

                        db.youtube_sentiment_collection.insert_one({
                            "video_id": video_id,
                            "video_title": title,
                            "tag": tag,
                            "author": author,
                            "text": cleaned_text,
                            "raw_text": comment_text,
                            "published_at": published_at
                        })

                        comments_collected += 1

                    next_page_token = response.get("nextPageToken")
                    if not next_page_token:
                        break

                print(f"✅ Collected {comments_collected} comments for '{title}'")

        except Exception as e:
            print(f"❌ Error fetching comments for tag '{tag}': {str(e)}")


In [15]:
fetch_comments_for_all_tagged_videos(db, min_comments=10, max_videos_per_tag=1)


🔍 Searching videos for YouTube tag: #she will
🎥 Selected video: Lil Wayne - She Will (Lyrics) ft. Drake (ID: QMPQa7_lXOE)
✅ Collected 10 comments for 'Lil Wayne - She Will (Lyrics) ft. Drake'

🔍 Searching videos for YouTube tag: #hololive production
❌ Error fetching comments for tag 'hololive production': 'videoId'

🔍 Searching videos for YouTube tag: #hololive english
🎥 Selected video: 【Cover MV】 Team Tomodachi 【hololive English -Advent-】 (ID: 0LEc7es4_rE)
✅ Collected 10 comments for '【Cover MV】 Team Tomodachi 【hololive English -Advent-】'

🔍 Searching videos for YouTube tag: #inside the nba
🎥 Selected video: Inside the NBA REACTS to Rockets vs Warriors GAME 4 Highlights (ID: 8A1r7kl0ERQ)
✅ Collected 10 comments for 'Inside the NBA REACTS to Rockets vs Warriors GAME 4 Highlights'

🔍 Searching videos for YouTube tag: #ultimate fighting championship
🎥 Selected video: Most Brutal One Round Fights in UFC History 🔥 (ID: 2rY26BxpylE)
✅ Collected 10 comments for 'Most Brutal One Round Fights



🎥 Selected video: The Really Big Show - SHEDEUR SANDERS/BROWNS + CAVS SWEEP - 4/29/2025 (ID: M5O_e9BXZ2c)
❌ Error fetching comments for tag 'cleveland browns': <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=M5O_e9BXZ2c&textFormat=plainText&maxResults=100&key=AIzaSyD5z792tRfM92fWnH7mvLKpVnLdXba3DEg&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">

🔍 Searching videos for YouTube tag: #football
🎥 Selected video: Messi Destroys Ronaldo&#39;s Son&#39;s Watermelon Plane. What Will Happen? #ronaldo #football #messi (ID: E84icqVtwcM)
✅ Coll



🎥 Selected video: How are weather forecasts made? (ID: fdErsR8_NaU)
❌ Error fetching comments for tag 'weather forecasting': <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=fdErsR8_NaU&textFormat=plainText&maxResults=100&key=AIzaSyD5z792tRfM92fWnH7mvLKpVnLdXba3DEg&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">

🔍 Searching videos for YouTube tag: #forecasting
🎥 Selected video: What is Forecasting? | Process &amp; Benefits of Forecasting (ID: M8Kiwv9gDJU)
✅ Collected 10 comments for 'What is Forecasting? | Process &amp; Benefits of 