# In [None]:
# ======================================
# YouTube Trending Data Analysis
# Exploratory Data Analysis
# ======================================

# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for the plots that follow.
sns.set_style(style="whitegrid")

# --------------------------------------
# 2. Load Dataset
# --------------------------------------
# The path is relative, so youtube_trending.csv must sit in the
# current working directory.
df = pd.read_csv("youtube_trending.csv")

# Preview: heading line followed by the first five rows.
print("First 5 rows:", df.head(), sep="\n")

# --------------------------------------
# 3. Basic Inspection
# --------------------------------------
# DataFrame.info() writes its report straight to stdout, so it is
# called on its own rather than wrapped in print().
print()
print("Dataset Info:")
df.info()

# Summary statistics as produced by DataFrame.describe().
print("\nStatistical Summary:", df.describe(), sep="\n")

# --------------------------------------
# 4. Data Cleaning & Transformation
# --------------------------------------
# Keep one row per video (the same video can trend on multiple days).
# drop_duplicates keeps the first occurrence in file order.
# NOTE(review): that row is the video's first trending day only if the CSV
# is sorted by trending_date -- confirm against the data source.
df = df.drop_duplicates(subset='video_id')

# Convert date columns
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['trending_date'] = pd.to_datetime(df['trending_date'])

# Drop rows that are missing any of the engagement metrics analysed below
df = df.dropna(subset=['views', 'likes', 'comment_count'])

# Feature engineering: Days to Trend, measured in whole calendar days (UTC).
#
# Both columns are re-read as UTC and truncated to midnight before the
# subtraction, for two reasons:
#   * pandas raises TypeError when a tz-aware column (e.g. timestamps
#     ending in "Z") is subtracted from a tz-naive one; utc=True puts both
#     on the same footing (naive values are interpreted as UTC).
#   * Timedelta.days floors, so a video published at 05:00 and trending on
#     that same date would come out as -1 (-5h == "-1 days +19:00").
#     Comparing dates instead of instants yields 0 for that case.
# The stored publish_time / trending_date columns are left as parsed above.
df['days_to_trend'] = (
    pd.to_datetime(df['trending_date'], utc=True).dt.normalize()
    - pd.to_datetime(df['publish_time'], utc=True).dt.normalize()
).dt.days

print("\nData after cleaning:")
print(df.head())

# --------------------------------------
# 5. Distribution of Engagement Metrics
# --------------------------------------
# Histogram of the views column (50 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['views'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Views")
plt.show()

# --------------------------------------
# 6. Views vs Likes
# --------------------------------------
# One semi-transparent point per row: views on x, likes on y.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='views', y='likes', alpha=0.5, ax=ax)
ax.set_title("Views vs Likes")
plt.show()

# --------------------------------------
# 7. Views vs Comments
# --------------------------------------
# One semi-transparent point per row: views on x, comment_count on y.
fig, ax = plt.subplots(figsize=(6, 5))
sns.scatterplot(data=df, x='views', y='comment_count', alpha=0.5, ax=ax)
ax.set_title("Views vs Comments")
plt.show()

# --------------------------------------
# 8. Category-wise Engagement Analysis
# --------------------------------------
# Mean views, likes and comment_count per category; reset_index() turns
# the group key back into an ordinary 'category' column for plotting.
category_engagement = (
    df.groupby('category')[['views', 'likes', 'comment_count']]
    .agg('mean')
    .reset_index()
)

# Bar chart of the per-category mean views.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='category', y='views', data=category_engagement, ax=ax)
plt.xticks(rotation=45)  # tilt the category labels
ax.set_title("Average Views by Category")
plt.show()

# --------------------------------------
# 9. Correlation Heatmap
# --------------------------------------
# Pairwise correlation matrix of the three engagement metrics
# (DataFrame.corr() with its default settings).
corr = df.loc[:, ['views', 'likes', 'comment_count']].corr()

# Annotated heatmap of that matrix.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(corr, annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Between Engagement Metrics")
plt.show()

# --------------------------------------
# 10. Days to Trend Analysis
# --------------------------------------
# Histogram of the engineered days_to_trend column (30 bins) with a KDE
# curve overlaid.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['days_to_trend'], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Days to Trend")
plt.show()

# --------------------------------------
# 11. Final Observations (Printed)
# --------------------------------------
# NOTE: these lines are fixed text; nothing here is computed from df, so
# they should be re-checked against the plots whenever the data changes.
print(
    "\nKey Observations:",
    "- Views and likes are strongly positively correlated.",
    "- Comment engagement varies more widely than views or likes.",
    "- Engagement patterns differ across content categories.",
    "- Most videos trend shortly after being published.",
    sep="\n",
)
