# Part 1: Extracting YT comments, title and description using YT data API

# Importing the necessary Libraries

In [None]:
import os
import pandas as pd
import logging
from googleapiclient.discovery import build


# API to fetch the news details

In [8]:
!pip install python-dotenv





[notice] A new release of pip is available: 23.0.1 -> 25.0.1
[notice] To update, run: C:\Users\HP\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip


In [None]:
import os
from dotenv import load_dotenv
from googleapiclient.discovery import build

# Load environment variables from .env file
load_dotenv()

# Get API key from environment variables
API_KEY = os.getenv("API_KEY")

# Initialize YouTube API
youtube = build("youtube", "v3", developerKey=API_KEY)

# YouTube category ID mapping (API does not support violence and explicit)
CATEGORY_IDS = {
    "gaming": "20",
    "news": "25",
    "entertainment": "24",
    "sports": "17"
}
SEARCH_CATEGORIES = ["education", "violence", "explicit/18+"]  # Use search for these

# List of explicit/unsafe keywords
VIOLENCE_KEYWORDS = ["violence", "war", "murder", "gun", "blood", "kill", "crime", "assault", "shooting", "attack"]
EXPLICIT_KEYWORDS = ["explicit", "nsfw", "mature", "adult", "nude", "porn", "sex", "drugs", "abuse", "18+"]

# Categorizing the content into different categories

In [None]:
def categorize_content(title, description, tags):
    """Categorize videos as 'violence', 'explicit/18+', or normal."""
    combined_text = f"{title.lower()} {description.lower()} {' '.join(tags).lower()}"
    if any(word in combined_text for word in EXPLICIT_KEYWORDS):
        return "explicit/18+"
    elif any(word in combined_text for word in VIOLENCE_KEYWORDS):
        return "violence"
    else:
        return "normal"

# Fetching the first 10 comments

In [None]:
def get_video_comments(video_id, max_comments=10):
    """Fetch top comments for a given video."""
    comments = []
    try:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=max_comments,
            textFormat="plainText"
        )
        response = request.execute()
        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
            comments.append(comment)
    except Exception:
        return "Comments Disabled"
    return " | ".join(comments)

# Get the top 15 trending videos according to category

In [None]:
def get_trending_videos(category_id, category_name, max_results=15):
    """Fetch trending videos in a given category (Trending API)."""
    try:
        request = youtube.videos().list(
            part="snippet,statistics",
            chart="mostPopular",
            regionCode="US",
            videoCategoryId=category_id,
            maxResults=max_results
        )
        response = request.execute()
        
        video_data = []
        for item in response.get("items", []):
            video_id = item["id"]
            title = item["snippet"]["title"]
            description = item["snippet"]["description"]
            views = item["statistics"]["viewCount"]
            likes = item["statistics"].get("likeCount", "0")
            tags = item["snippet"].get("tags", [])
            
            # Categorization
            category_final = categorize_content(title, description, tags)
            comments = get_video_comments(video_id, max_comments=15)

            video_data.append({
                "video_id": video_id,
                "title": title,
                "description": description,
                "views": views,
                "likes": likes,
                "comments": comments,
                "explicit": "yes" if category_final == "explicit/18+" else "no",
                "violent": "yes" if category_final == "violence" else "no",
                "category": category_final if category_final != "normal" else category_name
            })

        return video_data
    except Exception as e:
        logging.error(f"Error fetching trending videos for category {category_name}: {e}")
        return []

# Search videos for specific categories for which tags aren't available by YT API

In [None]:
def search_videos(query, category_name, max_results=15):
    """Search for videos instead of fetching trending (for Education, Explicit, Violence)."""
    try:
        request = youtube.search().list(
            q=query,
            part="snippet",
            type="video",
            maxResults=max_results
        )
        response = request.execute()
        
        video_data = []
        for item in response.get("items", []):
            video_id = item["id"]["videoId"]
            title = item["snippet"]["title"]
            description = item["snippet"]["description"]
            tags = item["snippet"].get("tags", [])
            
            category_final = categorize_content(title, description, tags)
            comments = get_video_comments(video_id, max_comments=10)

            video_data.append({
                "video_id": video_id,
                "title": title,
                "description": description,
                "views": "N/A",  # Search API doesn't return views
                "likes": "N/A",  # Search API doesn't return likes
                "comments": comments,
                "explicit": "yes" if category_final == "explicit/18+" else "no",
                "violent": "yes" if category_final == "violence" else "no",
                "category": category_final if category_final != "normal" else category_name
            })

        return video_data
    except Exception as e:
        logging.error(f"Error searching videos for category {category_name}: {e}")
        return []

# Fetch videos
all_videos = []

# Get trending videos from supported categories


In [None]:
for category, cat_id in CATEGORY_IDS.items():
    logging.info(f"Fetching trending videos for category: {category}")
    videos = get_trending_videos(cat_id, category, max_results=15)
    all_videos.extend(videos)

# Get searched videos for Education, Violence, Explicit
for category in SEARCH_CATEGORIES:
    logging.info(f"Searching videos for category: {category}")
    videos = search_videos(category, category, max_results=15)
    all_videos.extend(videos)

# Convert to DataFrame

In [None]:

df = pd.DataFrame(all_videos)

# Separate explicit and violence videos
explicit_videos = df[df["explicit"] == "yes"]
violent_videos = df[df["violent"] == "yes"]

# Add explicit and violent videos separately
df = pd.concat([df, explicit_videos.assign(category="explicit/18+")], ignore_index=True)
df = pd.concat([df, violent_videos.assign(category="violence")], ignore_index=True)

# Save to CSV
df.to_csv("trending_yt_vids.csv", index=False)
logging.info("Data saved to trending_yt_vids.csv")

print(df)




        video_id                                              title  \
0    B9ou3pu3xSQ  SIDEMEN AMONG US DRAFT MODE BUT EVERYONE CHOOS...   
1    PRaLwvY4SoM            I Spent $9,713 at a Card Shop in Roblox   
2    e2w2flW5_q4          Making Money Moves (Schedule 1 Episode 2)   
3    RbYEN9vAe5M  【MINECRAFT RP #NIJIEnchanted】TRYING TO DO COOL...   
4    2rifjk558yM            【BLOODBORNE】Kos, or as some say, Koseki   
..           ...                                                ...   
139  E9xsVnDz0uw  Can&#39;t Be Real! Did Crockett PROMOTE VIOLEN...   
140  3pCeKF2qhaE  Man Arrested At Airport After Suspected Domest...   
141  nkPdMQCizIQ                   女王蜂『バイオレンス(VIOLENCE)』Official MV   
142  B8Jbse2CoSU  Nagpur Curfew Lifted After Mob Violence Reduce...   
143  EX_8ZjT2sO4  Grenouer - Alone in the Dark - [UNCENSORED - A...   

                                           description    views   likes  \
0    🍗: Order food NOW at: https://www.eatsides.com...  2286244  111110 

# Part 2: Preprocessing the data stored in the data frame
Import necessary libraries

In [8]:
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from better_profanity import profanity
import pandas as pd
import re
import unicodedata
import contractions

# Preprocessing the text


In [2]:
# Load spaCy model for lemmatization
nlp = spacy.load("en_core_web_sm")

# Load stopwords
stop_words = set(stopwords.words('english'))

# Custom explicit word blacklist since yt doesn't have tags for explicit/18+ videos
custom_blacklist = {"violence", "drugs", "kill", "murder", "terrorist", "sex", "nude", "scam", "gun", "assault","explicit","xxx","porn","18+"}

# Apply various preprocessing techniques

In [3]:
def preprocess_text(text):
    if pd.isnull(text):  # Handle missing values
        return ""
    
    # Expand contractions (e.g., "can't" → "cannot", "I'm" → "I am")
    text = contractions.fix(text)
    
    # Normalize unicode characters
    text = unicodedata.normalize("NFKD", text)
    
    # Remove special characters, punctuation, and extra spaces
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    
    # Tokenization
    tokens = word_tokenize(text.lower())  # Convert to lowercase and tokenize
    
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
    
    # Lemmatization
    lemmatized_tokens = [token.lemma_ for token in nlp(" ".join(filtered_tokens))]
    
    return " ".join(lemmatized_tokens)



# Function to classify sentiment as Safe or Harmful


In [4]:
def classify_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_score = sia.polarity_scores(text)['compound']  # Get compound sentiment score
    
    return "Safe" if sentiment_score >= -0.4 else "Harmful"


# Function to detect explicit content

In [5]:
def detect_explicit_content(text):
    # Check if the text contains explicit words
    if profanity.contains_profanity(text) or any(word in text.lower() for word in custom_blacklist):
        return "Harmful"
    return "Safe"

# Function to classify video as Safe or Harmful based on sentiment & explicit content

In [6]:
def classify_video(row):
    # If any text contains "18+" or "explicit", classify immediately as Harmful
    if any("18+" in row[col].lower() or "explicit" in row[col].lower() or "sex" in row[col].lower() for col in ["title", "description", "comments"]):
        return "Harmful"

    sentiments = [row["title_sentiment"], row["description_sentiment"], row["comments_sentiment"]]
    explicit_flags = [row["title_explicit"], row["description_explicit"], row["comments_explicit"]]

    # If at least 3 elements are Harmful (either sentiment OR explicit content), classify as Harmful
    if sentiments.count("Harmful") + explicit_flags.count("Harmful") >= 3:
        return "Harmful"

    return "Safe"

# Load CSV file (data collected from YouTube API)

In [10]:
df = pd.read_csv(r"C:\Users\HP\trending_yt_vids.csv")

# Preprocess text data
for col in ["title", "description", "comments"]:
    if col in df.columns:
        df[col] = df[col].astype(str).apply(preprocess_text)

# Perform sentiment analysis
df["title_sentiment"] = df["title"].apply(classify_sentiment)
df["description_sentiment"] = df["description"].apply(classify_sentiment)
df["comments_sentiment"] = df["comments"].apply(classify_sentiment)

# Detect explicit content
df["title_explicit"] = df["title"].apply(detect_explicit_content)
df["description_explicit"] = df["description"].apply(detect_explicit_content)
df["comments_explicit"] = df["comments"].apply(detect_explicit_content)

# Classify video as Safe or Harmful
df["video_classification"] = df.apply(classify_video, axis=1)


# Save results
df.to_csv("preprocessed_yt_data.csv", index=False)

print("Processing complete. Saved as 'processed_youtube_data.csv'")


Processing complete. Saved as 'processed_youtube_data.csv'
