In [20]:
import pandas as pd
import numpy as np
import glob
import re
from textblob import TextBlob
import matplotlib.pyplot as plt
import seaborn as sns


In [22]:

def load_and_combine_data(path_pattern):
    files = glob.glob(path_pattern)
    dfs = []
    for file in files:
        df = pd.read_csv(file, encoding='latin1')  # fallback for encoding
        df['region'] = file.split('/')[-1].split('_')[0]  # Extract country code
        dfs.append(df)
    combined_df = pd.concat(dfs, ignore_index=True)
    return combined_df

In [24]:
f = load_and_combine_data(r"C:\Users\HP\Favorites\Downloads\youtube_trending_videos_global.csv")

  df = pd.read_csv(file, encoding='latin1')  # fallback for encoding


In [25]:
# Step 2: Clean and Standardize Columns
def clean_data(df):
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    df['trending_date'] = pd.to_datetime(df['trending_date'], errors='coerce', format='%y.%d.%m')
    df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
    df['title'] = df['title'].astype(str)
    df['tags'] = df['tags'].astype(str).str.lower().str.replace('"', '')
    df.dropna(subset=['video_id', 'title', 'category_id'], inplace=True)
    return df

df = clean_data(df)

NameError: name 'df' is not defined

In [None]:
# Step 3: Sentiment Analysis on Titles and Tags
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

df['title_sentiment'] = df['title'].apply(get_sentiment)
df['tags_sentiment'] = df['tags'].apply(get_sentiment)

In [None]:
# Step 4: Prepare SQL-friendly CSV for Views per Category
df_sql = df[['video_id', 'title', 'category_id', 'views', 'region']]
df_sql.to_csv("youtube_sql_ready.csv", index=False)

In [None]:
# Step 5: Time-Series for Trending Duration
trending_counts = df.groupby(['video_id', 'region'])['trending_date'].nunique().reset_index()
trending_counts.columns = ['video_id', 'region', 'trending_days']


In [None]:
# Visualization Prep: Save region-wise trending duration
trending_counts.to_csv("trending_duration_by_region.csv", index=False)


In [None]:
# Optional Quick Plot for EDA
plt.figure(figsize=(10, 6))
sns.histplot(df['title_sentiment'], bins=30, kde=True)
plt.title("Sentiment Distribution in Video Titles")
plt.xlabel("Sentiment Polarity")
plt.ylabel("Frequency")
plt.tight_layout()
plt.savefig("title_sentiment_distribution.png")
plt.show()