# YouTube Video Performance Analysis with K-Means Clustering

This notebook analyzes YouTube video performance data to:
1. Identify best and worst performing videos
2. Understand factors contributing to performance
3. Discover trending keywords and topics
4. Use k-means clustering to group similar videos

## Data Sources
- `Chart data.csv` - Time-series view data per video
- `Totals.csv` - Aggregated daily totals
- `table data.csv` - Video-level performance metrics

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from datetime import datetime, timedelta
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn - confirm that blanket suppression is intended.
warnings.simplefilter('ignore')

# Plot appearance: seaborn grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 1. Load and Explore Data

In [None]:
# Read the three CSV exports from the current working directory
chart_data = pd.read_csv('Chart data.csv')
totals = pd.read_csv('Totals.csv')
table_data = pd.read_csv('table data.csv')

# Report (rows, columns) of each frame as a quick sanity check
print("Dataset shapes:")
for label, frame in (("Chart data", chart_data),
                     ("Totals", totals),
                     ("Table data", table_data)):
    print(f"{label}: {frame.shape}")

In [None]:
# Inspect the video-level table (the frame the clustering is built on):
# first its column names, then a ten-row preview as the cell output.
print("\nTable Data Columns:")
print(list(table_data.columns))
print("\nFirst few rows:")
table_data.head(n=10)

In [None]:
# Count missing entries per column
missing_per_column = table_data.isna().sum()
print("Missing values in table data:")
print(missing_per_column)

# Summary statistics of the numeric columns, shown as the cell output
print("\nBasic statistics:")
table_data.describe()

## 2. Data Preprocessing

In [None]:
# Clean the data - remove the 'Total' row if it exists
df = table_data[table_data['Content'] != 'Total'].copy()

# Parse publish timestamps so date arithmetic and comparisons work
df['Video publish time'] = pd.to_datetime(df['Video publish time'])

# Age of each video in whole days, measured from the moment this cell runs.
# NOTE(review): datetime.now() ties every age-based metric (and the later
# "last 6 months" window) to the run date rather than to the export date.
today = datetime.now()
df['days_since_publish'] = (today - df['Video publish time']).dt.days

# Calculate derived metrics
df['avg_view_duration'] = (df['Watch time (hours)'] * 60 / df['Views']).fillna(0)  # minutes
# Videos less than one day old (or dated in the future) are treated as one
# day old, so the rate is neither a division by zero nor negative.
df['views_per_day'] = df['Views'] / df['days_since_publish'].clip(lower=1)
df['revenue_per_view'] = df['Estimated revenue (GBP)'] / df['Views']
df['subscriber_conversion'] = df['Subscribers'] / df['Views'] * 100
df['clicks_per_impression'] = df['Impressions click-through rate (%)']

# Handle infinite and missing values.
# Only numeric columns are zero-filled: filling the whole frame with 0 would
# put an integer into text and datetime columns, which breaks later string
# slicing of titles and date comparisons. Missing titles become ''.
fill_values = {col: 0 for col in df.select_dtypes(include='number').columns}
fill_values['Video title'] = ''
df = df.replace([np.inf, -np.inf], np.nan).fillna(fill_values)

print(f"Cleaned dataset shape: {df.shape}")
print(f"Total unique videos: {len(df)}")

## 3. Extract Keywords from Titles

In [None]:
# Extract keywords from video titles
from collections import Counter
import re

# Words too common to be informative as title keywords
stop_words = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'up', 'about', 'into', 'through', 'during',
    'your', 'you', 'how', 'is', 'it', 'this', 'that', 'are', 'as', 'be',
}

def extract_keywords(title):
    """Return the informative lowercase words of ``title`` in order of appearance.

    A word is a run of ASCII letters delimited by word boundaries in the
    lowercased title. Words listed in ``stop_words`` and words of one or two
    letters are dropped. Repeated words are kept.
    """
    keywords = []
    for match in re.finditer(r'\b[a-z]+\b', title.lower()):
        word = match.group()
        # Skip short tokens and stop words
        if len(word) <= 2 or word in stop_words:
            continue
        keywords.append(word)
    return keywords

# Flatten the keywords of every title into one list (repeats kept)
all_keywords = [
    kw
    for title in df['Video title']
    for kw in extract_keywords(str(title))
]

# Tally occurrences and show the 30 most frequent keywords
keyword_counts = Counter(all_keywords)
print("Top 30 keywords across all videos:")
for kw, occurrences in keyword_counts.most_common(30):
    print(f"{kw}: {occurrences}")

In [None]:
# Per-video keyword list, plus a comma-separated string form of the same list
df['keywords'] = df['Video title'].apply(lambda title: extract_keywords(str(title)))
df['keyword_string'] = df['keywords'].apply(', '.join)

# One 0/1 indicator column per keyword among the 20 most frequent overall
top_keywords = [kw for kw, _ in keyword_counts.most_common(20)]
for kw in top_keywords:
    df[f'has_{kw}'] = df['keywords'].apply(lambda kws, kw=kw: int(kw in kws))

print(f"\nAdded {len(top_keywords)} keyword features")

## 4. Analyze Trending Keywords

In [None]:
# Videos published within the 180 days before `today` count as recent
six_months_ago = today - timedelta(days=180)
is_recent = df['Video publish time'] > six_months_ago
recent_videos = df[is_recent].copy()

print(f"Total videos: {len(df)}")
print(f"Recent videos (last 6 months): {len(recent_videos)}")

# Keyword occurrences restricted to the recent titles
recent_keywords = [
    kw
    for title in recent_videos['Video title']
    for kw in extract_keywords(str(title))
]
recent_keyword_counts = Counter(recent_keywords)

# Trending score = (occurrences per recent video) / (occurrences per video overall).
# Keywords seen fewer than 3 times overall are skipped.
trending_scores = {}
for kw, n_recent_hits in recent_keyword_counts.items():
    n_overall_hits = keyword_counts.get(kw, 0)
    if n_overall_hits < 3:
        continue
    overall_freq = n_overall_hits / len(df)
    recent_freq = n_recent_hits / len(recent_videos)
    trending_scores[kw] = recent_freq / overall_freq if overall_freq > 0 else 0

# Keep the 20 highest-scoring keywords as (keyword, score) pairs
trending_keywords = sorted(
    trending_scores.items(), key=lambda item: item[1], reverse=True
)[:20]

print("\nTrending Keywords (higher score = more popular recently):")
for kw, score in trending_keywords:
    print(f"{kw}: Trending Score={score:.2f} "
          f"(Overall: {keyword_counts[kw]}, Recent: {recent_keyword_counts[kw]})")

In [None]:
# Horizontal bar chart of the 15 highest trending scores
trending_df = pd.DataFrame(
    trending_keywords, columns=['Keyword', 'Trending Score']
).head(15)

fig, ax = plt.subplots(figsize=(14, 6))
ax.barh(trending_df['Keyword'], trending_df['Trending Score'], color='steelblue')
ax.set_xlabel('Trending Score (Recent Popularity / Overall Popularity)')
ax.set_title('Top Trending Keywords (Last 6 Months)')
ax.invert_yaxis()  # first row of trending_df is drawn at the top
fig.tight_layout()
plt.show()

## 5. Performance Analysis - Best and Worst Videos

In [None]:
# Composite performance score: each metric is min-max scaled to [0, 1]
# across videos, then combined as a weighted sum.
from sklearn.preprocessing import MinMaxScaler

# Weight of each metric in the composite score
weights = {
    'Views': 0.25,
    'Watch time (hours)': 0.20,
    'Subscribers': 0.15,
    'Estimated revenue (GBP)': 0.15,
    'Impressions click-through rate (%)': 0.10,
    'views_per_day': 0.10,
    'subscriber_conversion': 0.05
}
performance_metrics = list(weights)

# Scale a copy so the raw metric values in df stay untouched
scaler = MinMaxScaler()
df_scaled = df.copy()
df_scaled[performance_metrics] = scaler.fit_transform(df[performance_metrics])

# Accumulate the weighted sum metric by metric
weighted_total = 0
for metric, weight in weights.items():
    weighted_total = weighted_total + df_scaled[metric] * weight
df['performance_score'] = weighted_total

# Highest score first
df_sorted = df.sort_values('performance_score', ascending=False)

print("Top 10 Best Performing Videos:")
print("="*100)
summary_columns = ['Video title', 'Views', 'Watch time (hours)',
                   'Subscribers', 'performance_score']
top_10 = df_sorted[summary_columns].head(10)
for _, video in top_10.iterrows():
    print(f"\n{video['Video title'][:80]}")
    print(f"  Views: {video['Views']:,} | Watch Time: {video['Watch time (hours)']:.1f}h | "
          f"Subscribers: {video['Subscribers']:,} | Score: {video['performance_score']:.3f}")

In [None]:
# The last ten rows of the score-sorted frame are the lowest-scoring videos;
# they are listed in descending score order, so the weakest video comes last.
print("\n\nBottom 10 Worst Performing Videos:")
print("="*100)
summary_columns = ['Video title', 'Views', 'Watch time (hours)',
                   'Subscribers', 'performance_score']
bottom_10 = df_sorted[summary_columns].tail(10)
for _, video in bottom_10.iterrows():
    print(f"\n{video['Video title'][:80]}")
    print(f"  Views: {video['Views']:,} | Watch Time: {video['Watch time (hours)']:.1f}h | "
          f"Subscribers: {video['Subscribers']:,} | Score: {video['performance_score']:.3f}")

In [None]:
# Four-panel overview of video performance
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
(ax_scores, ax_watch), (ax_ctr, ax_age) = axes

# Top-left: histogram of the composite score with its median marked
scores = df['performance_score']
ax_scores.hist(scores, bins=30, color='steelblue', edgecolor='black')
ax_scores.axvline(scores.median(), color='red', linestyle='--', label='Median')
ax_scores.set(xlabel='Performance Score',
              ylabel='Number of Videos',
              title='Distribution of Video Performance Scores')
ax_scores.legend()

# Top-right: views against total watch time
ax_watch.scatter(df['Views'], df['Watch time (hours)'], alpha=0.5)
ax_watch.set(xlabel='Views',
             ylabel='Watch Time (hours)',
             title='Views vs Watch Time')

# Bottom-left: click-through rate against the composite score
ax_ctr.scatter(df['Impressions click-through rate (%)'], scores,
               alpha=0.5, color='green')
ax_ctr.set(xlabel='Click-Through Rate (%)',
           ylabel='Performance Score',
           title='CTR vs Performance Score')

# Bottom-right: video age against average daily views
ax_age.scatter(df['days_since_publish'], df['views_per_day'],
               alpha=0.5, color='orange')
ax_age.set(xlabel='Days Since Publication',
           ylabel='Views Per Day',
           title='Video Age vs Daily Views')

fig.tight_layout()
plt.show()

## 6. K-Means Clustering Analysis

In [None]:
# Columns that describe each video for the clustering step
clustering_features = [
    'Views',
    'Watch time (hours)',
    'Subscribers',
    'Estimated revenue (GBP)',
    'Impressions click-through rate (%)',
    'avg_view_duration',
    'views_per_day',
    'subscriber_conversion',
    'Duration',
]

# Work on a copy of just those columns
X = df[clustering_features].copy()

# Standardize each feature so no single metric dominates by sheer scale
scaler_cluster = StandardScaler()
X_scaled = scaler_cluster.fit(X).transform(X)

print(f"Clustering data shape: {X_scaled.shape}")
print(f"Features used: {clustering_features}")

In [None]:
# Sweep k = 2..10 and record two diagnostics per k:
# inertia (for the elbow plot) and the silhouette score.
inertias = []
silhouette_scores = []
K_range = range(2, 11)

from sklearn.metrics import silhouette_score

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels_k = kmeans.fit_predict(X_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X_scaled, labels_k))

# Side-by-side diagnostics: elbow curve on the left, silhouette on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
panels = (
    (axes[0], inertias, 'bo-', 'Inertia', 'Elbow Method for Optimal k'),
    (axes[1], silhouette_scores, 'ro-', 'Silhouette Score', 'Silhouette Score for Optimal k'),
)
for panel_ax, values, line_format, y_label, panel_title in panels:
    panel_ax.plot(K_range, values, line_format)
    panel_ax.set_xlabel('Number of Clusters (k)')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.grid(True)

fig.tight_layout()
plt.show()

# The recommendation is the k with the highest silhouette score
best_position = int(np.argmax(silhouette_scores))
best_k = K_range[best_position]
print(f"\nRecommended number of clusters: {best_k}")
print(f"Best silhouette score: {max(silhouette_scores):.3f}")

In [None]:
# Final k-means fit. The cluster count is set by hand here and is not taken
# automatically from the silhouette-based recommendation (`best_k`).
optimal_k = 5  # You can adjust this based on the elbow plot
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['cluster'] = kmeans.fit(X_scaled).labels_

# Cluster sizes, ordered by cluster id
print("Videos per cluster:")
print(df['cluster'].value_counts().sort_index())

In [None]:
# Analyze cluster characteristics: per-cluster mean of every clustering
# feature and of the composite score, plus the number of videos per cluster.
cluster_analysis = df.groupby('cluster')[clustering_features + ['performance_score']].mean()
cluster_counts = df['cluster'].value_counts().sort_index()

print("\nCluster Characteristics (Average Values):")
print("="*120)
print(cluster_analysis.round(2))

# Name clusters based on characteristics.
# The 0.5 / 0.3 thresholds apply to the cluster's mean performance score; the
# medians are taken across the cluster means, not across individual videos.
# Several clusters can receive the same name.
# The emoji prefixes were stored as mis-decoded byte sequences (mojibake);
# they are written as the intended Unicode characters here so the legend and
# the exported CSVs show readable labels.
cluster_names = {}
for cluster_id in range(optimal_k):
    cluster_data = cluster_analysis.loc[cluster_id]

    if cluster_data['performance_score'] > 0.5:
        if cluster_data['Subscribers'] > cluster_analysis['Subscribers'].median():
            cluster_names[cluster_id] = "🌟 Star Performers (High Engagement)"
        else:
            cluster_names[cluster_id] = "📈 Viral Videos (High Views)"
    elif cluster_data['performance_score'] > 0.3:
        cluster_names[cluster_id] = "✅ Solid Performers"
    else:
        if cluster_data['views_per_day'] < cluster_analysis['views_per_day'].median():
            cluster_names[cluster_id] = "📉 Underperformers"
        else:
            cluster_names[cluster_id] = "🔄 Growing Videos"

# Attach the readable name to every video
df['cluster_name'] = df['cluster'].map(cluster_names)

print("\n\nCluster Names and Sizes:")
for cluster_id, name in cluster_names.items():
    count = cluster_counts[cluster_id]
    print(f"Cluster {cluster_id}: {name} ({count} videos)")

In [None]:
# Visualize clusters using PCA: project the standardized feature matrix onto
# its first two principal components and colour the points by cluster.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

df['pca1'] = X_pca[:, 0]
df['pca2'] = X_pca[:, 1]

plt.figure(figsize=(14, 8))
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']

for cluster_id in range(optimal_k):
    cluster_data = df[df['cluster'] == cluster_id]
    # Cycle through the palette so raising optimal_k above the number of
    # listed colours reuses colours instead of raising IndexError.
    plt.scatter(cluster_data['pca1'], cluster_data['pca2'],
                c=colors[cluster_id % len(colors)], label=cluster_names[cluster_id],
                alpha=0.6, s=100, edgecolors='black', linewidth=0.5)

# Axis labels report the share of variance each component explains
plt.xlabel(f'First Principal Component ({pca.explained_variance_ratio_[0]:.1%} variance)')
plt.ylabel(f'Second Principal Component ({pca.explained_variance_ratio_[1]:.1%} variance)')
plt.title('Video Clusters Visualization (PCA Projection)')
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

## 7. Cluster Deep Dive

In [None]:
# Analyze each cluster in detail: headline averages, the most frequent title
# keywords, and the three most-viewed videos of the cluster.
for cluster_id in range(optimal_k):
    print(f"\n{'='*100}")
    print(f"Cluster {cluster_id}: {cluster_names[cluster_id]}")
    print(f"{'='*100}")

    cluster_videos = df[df['cluster'] == cluster_id].copy()

    # Summary statistics
    print(f"\nNumber of videos: {len(cluster_videos)}")
    print(f"Average performance score: {cluster_videos['performance_score'].mean():.3f}")
    print(f"Average views: {cluster_videos['Views'].mean():,.0f}")
    print(f"Average watch time: {cluster_videos['Watch time (hours)'].mean():.1f} hours")
    print(f"Average CTR: {cluster_videos['Impressions click-through rate (%)'].mean():.2f}%")

    # Top keywords in this cluster (keyword lists flattened, repeats kept)
    cluster_keywords = [kw for kws in cluster_videos['keywords'] for kw in kws]
    cluster_keyword_counts = Counter(cluster_keywords)
    print("\nTop keywords in this cluster:")
    for keyword, count in cluster_keyword_counts.most_common(10):
        print(f"  - {keyword}: {count}")

    # Sample videos: the three with the most views.
    # The bullet was stored as a mis-decoded byte sequence (mojibake); it is
    # written as the intended "•" character here.
    print("\nSample videos from this cluster:")
    sample = cluster_videos.nlargest(3, 'Views')[['Video title', 'Views', 'performance_score']]
    for idx, row in sample.iterrows():
        print(f"  • {row['Video title'][:70]}")
        print(f"    Views: {row['Views']:,} | Score: {row['performance_score']:.3f}")

## 8. Success Factor Analysis

In [None]:
# Correlation analysis: pairwise correlations between the raw metrics and the
# composite performance score.
correlation_features = [
    'Views', 'Watch time (hours)', 'Subscribers',
    'Estimated revenue (GBP)', 'Impressions click-through rate (%)',
    'avg_view_duration', 'views_per_day', 'Duration', 'performance_score'
]

correlation_matrix = df[correlation_features].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.show()

print("\nStrongest correlations with performance score:")
# Remove the score's self-correlation by label rather than by position:
# slicing off the first sorted row drops the wrong entry if another feature
# ties at 1.0 or if the self-correlation is NaN (NaN sorts last).
perf_correlations = (
    correlation_matrix['performance_score']
    .drop('performance_score')
    .sort_values(ascending=False)
)
for feature, corr in perf_correlations.items():
    print(f"  {feature}: {corr:.3f}")

In [None]:
# Impact of a keyword = mean score of videos whose title contains it
# minus mean score of videos whose title does not.
keyword_impact = {}
for kw in top_keywords:
    indicator = df[f'has_{kw}']
    mean_with = df.loc[indicator == 1, 'performance_score'].mean()
    mean_without = df.loc[indicator == 0, 'performance_score'].mean()
    keyword_impact[kw] = mean_with - mean_without

keyword_impact_df = pd.DataFrame(
    list(keyword_impact.items()), columns=['Keyword', 'Performance Impact']
).sort_values('Performance Impact', ascending=False)

# Diverging bar chart: green for positive impact, red otherwise
colors = ['green' if impact > 0 else 'red'
          for impact in keyword_impact_df['Performance Impact']]

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(keyword_impact_df['Keyword'], keyword_impact_df['Performance Impact'], color=colors)
ax.axvline(x=0, color='black', linestyle='-', linewidth=0.5)
ax.set_xlabel('Performance Impact (Difference in Average Score)')
ax.set_title('Keyword Impact on Video Performance')
ax.invert_yaxis()  # largest impact at the top
fig.tight_layout()
plt.show()

print("\nKeywords with highest positive impact:")
print(keyword_impact_df.head(10).to_string(index=False))

## 9. Trending Topics Analysis

In [None]:
# Analyze performance of trending keywords: for each of the top 15 trending
# keywords, average the score and views of every video whose title has it.
trending_keyword_performance = {}

for keyword, _ in trending_keywords[:15]:
    # Get videos with this keyword
    videos_with_keyword = df[df['keywords'].apply(lambda x: keyword in x)]

    if len(videos_with_keyword) > 0:
        trending_keyword_performance[keyword] = {
            'avg_performance': videos_with_keyword['performance_score'].mean(),
            'avg_views': videos_with_keyword['Views'].mean(),
            'video_count': len(videos_with_keyword)
        }

# One row per keyword. reindex() guarantees the three columns exist even when
# no keyword qualified (e.g. no videos in the recent window); without it the
# empty frame has no columns and sort_values() raises KeyError.
trending_perf_df = pd.DataFrame(trending_keyword_performance).T.reindex(
    columns=['avg_performance', 'avg_views', 'video_count']
)
trending_perf_df = trending_perf_df.sort_values('avg_performance', ascending=False)

print("\nTrending Keywords Performance Analysis:")
print("="*80)
print(f"{'Keyword':<20} {'Avg Performance':<18} {'Avg Views':<15} {'Video Count'}")
print("="*80)
for keyword, row in trending_perf_df.iterrows():
    print(f"{keyword:<20} {row['avg_performance']:<18.3f} {row['avg_views']:<15,.0f} {row['video_count']:.0f}")

In [None]:
# Bubble chart of the trending keywords: x = average views, y = average score,
# bubble area grows with the number of videos using the keyword.
fig, ax = plt.subplots(figsize=(14, 8))

scatter = ax.scatter(
    trending_perf_df['avg_views'],
    trending_perf_df['avg_performance'],
    s=trending_perf_df['video_count']*50,
    alpha=0.6,
    c=range(len(trending_perf_df)),
    cmap='viridis',
    edgecolors='black',
    linewidth=1,
)

# Write each keyword next to its bubble
for kw, stats in trending_perf_df.iterrows():
    ax.annotate(kw,
                (stats['avg_views'], stats['avg_performance']),
                xytext=(5, 5), textcoords='offset points',
                fontsize=9, alpha=0.8)

ax.set_xlabel('Average Views')
ax.set_ylabel('Average Performance Score')
ax.set_title('Trending Keywords: Performance vs Views (bubble size = video count)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 10. Recommendations and Insights

In [None]:
# Text report that pulls together results computed in the earlier cells
print("\n" + "="*100)
print("KEY INSIGHTS AND RECOMMENDATIONS")
print("="*100)

# 1. Best performing cluster (highest mean performance score)
best_cluster = cluster_analysis['performance_score'].idxmax()
print(f"\n1. BEST PERFORMING CLUSTER: {cluster_names[best_cluster]}")
print(f"   - Average performance score: {cluster_analysis.loc[best_cluster, 'performance_score']:.3f}")
print(f"   - Average views: {cluster_analysis.loc[best_cluster, 'Views']:,.0f}")
print(f"   - Average CTR: {cluster_analysis.loc[best_cluster, 'Impressions click-through rate (%)']:.2f}%")

# 2. Top keywords to focus on
print("\n2. HIGH-IMPACT KEYWORDS TO FOCUS ON:")
top_impact_keywords = keyword_impact_df.head(5)
for _, row in top_impact_keywords.iterrows():
    # The '+' format flag prints the real sign; a hard-coded "+" prefix would
    # render a negative impact as "+-0.012".
    print(f"   - {row['Keyword']}: {row['Performance Impact']:+.3f} impact")

# 3. Trending topics
print("\n3. TRENDING TOPICS (Most Popular Recently):")
for keyword, score in trending_keywords[:5]:
    print(f"   - {keyword}: Trending score {score:.2f}")

# 4. Success factors (perf_correlations is sorted from highest to lowest)
print("\n4. KEY SUCCESS FACTORS (Strongest Correlations with Performance):")
top_correlations = perf_correlations.head(5)
for feature, corr in top_correlations.items():
    print(f"   - {feature}: {corr:.3f} correlation")

# 5. Improvement opportunities (cluster with the lowest mean score)
worst_cluster = cluster_analysis['performance_score'].idxmin()
print(f"\n5. IMPROVEMENT OPPORTUNITIES: {cluster_names[worst_cluster]}")
print(f"   - Number of videos: {cluster_counts[worst_cluster]}")
print(f"   - Average performance: {cluster_analysis.loc[worst_cluster, 'performance_score']:.3f}")
print("   - Consider improving: CTR, watch time, and thumbnail quality")

# 6. Optimal video characteristics
# NOTE(review): the /60 assumes 'Duration' is stored in seconds - confirm against the export.
top_performers = df.nlargest(20, 'performance_score')
print("\n6. OPTIMAL VIDEO CHARACTERISTICS (Based on Top 20 Performers):")
print(f"   - Average duration: {top_performers['Duration'].mean()/60:.1f} minutes")
print(f"   - Average CTR: {top_performers['Impressions click-through rate (%)'].mean():.2f}%")
print(f"   - Average watch time per view: {top_performers['avg_view_duration'].mean():.1f} minutes")
print(f"   - Average subscriber conversion: {top_performers['subscriber_conversion'].mean():.2f}%")

In [None]:
# Per-video results: key metrics, score, cluster assignment and keywords
export_columns = [
    'Video title', 'Views', 'Watch time (hours)', 'Subscribers',
    'Estimated revenue (GBP)', 'performance_score', 'cluster',
    'cluster_name', 'keyword_string',
]
output_df = df[export_columns].copy()
output_df.to_csv('youtube_analysis_results.csv', index=False)
print("\n\nResults exported to 'youtube_analysis_results.csv'")

# Per-cluster averages with the readable name and the cluster size appended;
# the cluster id stays in the file as the index column.
cluster_summary = cluster_analysis.assign(
    cluster_name=cluster_analysis.index.map(cluster_names),
    video_count=cluster_counts,
)
cluster_summary.to_csv('cluster_summary.csv')
print("Cluster summary exported to 'cluster_summary.csv'")

# All retained trending keywords with their scores
trending_df = pd.DataFrame(trending_keywords, columns=['Keyword', 'Trending Score'])
trending_df.to_csv('trending_keywords.csv', index=False)
print("Trending keywords exported to 'trending_keywords.csv'")

## Summary

This notebook has analyzed your YouTube video performance using k-means clustering and provided:

1. **Performance Classification**: Videos grouped into distinct performance clusters
2. **Best/Worst Performers**: Clear identification of top and bottom videos
3. **Success Factors**: Key metrics correlated with high performance
4. **Trending Keywords**: Topics gaining popularity in recent months
5. **Keyword Impact**: Which keywords in titles correlate with better performance
6. **Actionable Recommendations**: Data-driven suggestions for content strategy

The exported CSV files contain detailed results for further analysis.