In [None]:
import requests
import json
import time
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import os
from tqdm import tqdm


API_KEYS = [
    "AIzaSyCD2fRqX0HonwyOs_sXKFEKYAceRtNw2N4",
    "AIzaSyAtRRiS4SlkfAI7lv-1ZXme87B6tTPlMi4",
    "AIzaSyAX_57FY3W20faltxN11SfoqPWo36eOSAk",
    "AIzaSyCelaZMPDr6Ag8rkqAyDWjiY96snfc8wKs",
    "AIzaSyAId-MH2iIq-ngEHKNBtBG7N1tzVbGR0JY",
    "AIzaSyC_EGQ8DTPPqWCQykXX-4CxCObYVGhmScg",
    "AIzaSyBL-5jJ3gvegwUatAuSt4i82SOAy9Z4464"
]

current_key_index = 0
MAX_RETRIES = 3
REQUEST_DELAY = 1
exhausted_keys = set()

def get_next_key():
    """Rotate through API keys, skipping exhausted ones"""
    global current_key_index
    start_index = current_key_index

    while True:
        key = API_KEYS[current_key_index]
        current_key_index = (current_key_index + 1) % len(API_KEYS)

        if key not in exhausted_keys:
            return key


        if current_key_index == start_index:
            return None

def make_api_request(url, params):
    """Make API request with retries and key rotation"""
    for attempt in range(MAX_RETRIES):
        key = get_next_key()
        if key is None:
            print("All API keys exhausted")
            return None

        params['key'] = key
        try:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 403:
                if 'quotaExceeded' in response.text:
                    print(f"Quota exceeded for key {key}, marking as exhausted")
                    exhausted_keys.add(key)
                    continue
                else:
                    print(f"Error 403: {response.text}")
                    return None
            else:
                print(f"Error {response.status_code}: {response.text}")
                return None
        except Exception as e:
            print(f"Request failed: {str(e)}")
            time.sleep(REQUEST_DELAY * (attempt + 1))

    print("Max retries exceeded for this request")
    return None

def get_trending_videos(region_code='US', max_results=50):
    """Fetch currently trending videos"""
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        'part': 'snippet,contentDetails,statistics,status',
        'chart': 'mostPopular',
        'regionCode': region_code,
        'maxResults': min(max_results, 50),
        'hl': 'en'
    }

    return make_api_request(url, params)

def get_video_details(video_ids):
    """Get details for specific video IDs"""
    url = "https://www.googleapis.com/youtube/v3/videos"
    params = {
        'part': 'snippet,contentDetails,statistics,status',
        'id': ','.join(video_ids),
        'hl': 'en'
    }

    return make_api_request(url, params)

def get_channel_details(channel_id):
    """Get channel statistics"""
    url = "https://www.googleapis.com/youtube/v3/channels"
    params = {
        'part': 'statistics',
        'id': channel_id
    }

    return make_api_request(url, params)

def process_video_data(video_items, region):
    """Process raw video data into structured format"""
    processed_data = []

    for item in video_items:
        try:
            # Basic video info
            video_id = item['id']
            snippet = item.get('snippet', {})
            stats = item.get('statistics', {})
            content_details = item.get('contentDetails', {})
            status = item.get('status', {})

            channel_id = snippet.get('channelId')
            channel_title = snippet.get('channelTitle')

            channel_stats = {}
            if channel_id:
                channel_response = get_channel_details(channel_id)
                if channel_response and 'items' in channel_response and channel_response['items']:
                    channel_stats = channel_response['items'][0].get('statistics', {})

            duration = content_details.get('duration', 'PT0S')
            duration_seconds = iso8601_to_seconds(duration)

            published_at = snippet.get('publishedAt')
            days_since_published = 0
            if published_at:
                publish_date = datetime.strptime(published_at, '%Y-%m-%dT%H:%M:%SZ')
                days_since_published = (datetime.utcnow() - publish_date).days

            thumbnails = snippet.get('thumbnails', {})
            thumbnail_resolutions = {
                'default': thumbnails.get('default', {}).get('url', ''),
                'medium': thumbnails.get('medium', {}).get('url', ''),
                'high': thumbnails.get('high', {}).get('url', ''),
                'standard': thumbnails.get('standard', {}).get('url', ''),
                'maxres': thumbnails.get('maxres', {}).get('url', '')
            }

            video_data = {
                'video_id': video_id,
                'title': snippet.get('title'),
                'description': snippet.get('description'),
                'published_at': published_at,
                'days_since_published': days_since_published,
                'channel_id': channel_id,
                'channel_title': channel_title,
                'channel_subscribers': int(channel_stats.get('subscriberCount', 0)),
                'channel_views': int(channel_stats.get('viewCount', 0)),
                'channel_video_count': int(channel_stats.get('videoCount', 0)),
                'category_id': snippet.get('categoryId'),
                'tags': ','.join(snippet.get('tags', [])),
                'duration_seconds': duration_seconds,
                'definition': content_details.get('definition'),
                'caption': content_details.get('caption'),
                'licensed_content': content_details.get('licensedContent', False),
                'view_count': int(stats.get('viewCount', 0)),
                'like_count': int(stats.get('likeCount', 0)),
                'dislike_count': int(stats.get('dislikeCount', 0)),
                'comment_count': int(stats.get('commentCount', 0)),
                'favorite_count': int(stats.get('favoriteCount', 0)),
                'embeddable': status.get('embeddable', False),
                'public_stats_viewable': status.get('publicStatsViewable', False),
                'made_for_kids': status.get('madeForKids', False),
                'region': region,
                **thumbnail_resolutions
            }

            processed_data.append(video_data)

        except Exception as e:
            print(f"Error processing video {item.get('id')}: {str(e)}")
            continue

    return processed_data

def iso8601_to_seconds(duration):
    """Convert ISO 8601 duration to seconds"""
    try:
        duration = duration[2:]
        seconds = 0

        if 'H' in duration:
            hours_part = duration.split('H')[0]
            seconds += int(hours_part) * 3600
            duration = duration[len(hours_part)+1:]

        if 'M' in duration:
            minutes_part = duration.split('M')[0]
            seconds += int(minutes_part) * 60
            duration = duration[len(minutes_part)+1:]

        if 'S' in duration:
            seconds_part = duration.split('S')[0]
            seconds += int(seconds_part)

        return seconds
    except:
        return 0

def collect_data(regions=['US', 'IN', 'GB', 'CA', 'AU', 'DE', 'FR', 'BR', 'JP', 'KR']):
    """Main function to collect data from multiple regions until quotas are exhausted"""
    all_data = pd.DataFrame()
    region_data_counts = {region: 0 for region in regions}

    os.makedirs('youtube_data', exist_ok=True)

    with tqdm(total=len(regions)*1000, desc="Collecting data") as pbar:
        while len(exhausted_keys) < len(API_KEYS):
            for region in regions:
                if region_data_counts[region] >= 1000:
                    continue

                remaining = 1000 - region_data_counts[region]
                max_results = min(50, remaining)

                trending_response = get_trending_videos(region_code=region, max_results=max_results)

                if not trending_response:
                    print(f"Failed to get data for region {region}")
                    continue

                if 'items' not in trending_response:
                    print(f"No items in response for region {region}")
                    continue

                video_items = trending_response['items']
                processed_data = process_video_data(video_items, region)  # Pass region here

                if processed_data:
                    new_data = pd.DataFrame(processed_data)
                    region_data_counts[region] += len(new_data)

                    all_data = pd.concat([all_data, new_data], ignore_index=True)
                    pbar.update(len(new_data))

                if all(count >= 1000 for count in region_data_counts.values()):
                    break

                time.sleep(REQUEST_DELAY)

            if all(count >= 1000 for count in region_data_counts.values()):
                break

    return all_data

def save_to_csv(df, filename):
    """Save collected data to CSV with shuffled records"""
    shuffled_df = df.sample(frac=1).reset_index(drop=True)

    shuffled_df.to_csv(filename, index=False)
    print(f"Data saved to {filename} with {len(shuffled_df)} records")

if __name__ == "__main__":
    print("Starting YouTube data collection...")
    print(f"Total API keys available: {len(API_KEYS)}")

    video_data = collect_data()

    if not video_data.empty:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"youtube_data/youtube_video_data_{timestamp}_shuffled.csv"
        save_to_csv(video_data, filename)

        print("\nFinal Data Collection Summary:")
        print(f"Total videos collected: {len(video_data)}")
        print(f"Average views: {video_data['view_count'].mean():,.0f}")
        print(f"Average likes: {video_data['like_count'].mean():,.0f}")
        print(f"Average days since published: {video_data['days_since_published'].mean():.1f}")
        print("\nRecords per region:")
        print(video_data['region'].value_counts())
        print(f"\nAPI keys exhausted: {len(exhausted_keys)}/{len(API_KEYS)}")
    else:
        print("No data was collected.")

Starting YouTube data collection...
Total API keys available: 7


Collecting data: 100%|██████████| 10000/10000 [14:56<00:00, 11.16it/s]


Data saved to youtube_data/youtube_video_data_20250625_132841_shuffled.csv with 10000 records

Final Data Collection Summary:
Total videos collected: 10000
Average views: 3,948,005
Average likes: 143,416
Average days since published: 1.7

Records per region:
region
US    1000
IN    1000
GB    1000
CA    1000
AU    1000
DE    1000
FR    1000
BR    1000
JP    1000
KR    1000
Name: count, dtype: int64

API keys exhausted: 0/7


In [None]:
!pip install nltk



In [None]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import nltk
print(nltk.data.path)

['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [None]:
import nltk
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root

True

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from textblob import TextBlob
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')

df = pd.read_csv('/content/youtube_video_data_20250625_125145.csv')

print("Basic data cleaning...")
df['title'] = df['title'].str.lower()
df['description'] = df['description'].str.lower()

df['description'] = df['description'].fillna('')
df['tags'] = df['tags'].fillna('')

print("Extracting NLP features...")

sia = SentimentIntensityAnalyzer()
stop_words = set(stopwords.words('english'))

def extract_text_features(text):
    sentiment = sia.polarity_scores(text)

    blob = TextBlob(text)

    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words and w not in string.punctuation]

    return {
        'text_length': len(text),
        'word_count': len(words),
        'unique_words': len(set(words)),
        'avg_word_length': np.mean([len(w) for w in words]) if words else 0,
        'sentiment_pos': sentiment['pos'],
        'sentiment_neg': sentiment['neg'],
        'sentiment_neu': sentiment['neu'],
        'sentiment_compound': sentiment['compound'],
        'textblob_polarity': blob.sentiment.polarity,
        'textblob_subjectivity': blob.sentiment.subjectivity,
        'has_question': 1 if '?' in text else 0,
        'has_exclamation': 1 if '!' in text else 0,
        'has_url': 1 if ('http://' in text or 'https://' in text) else 0
    }

print("Processing titles...")
title_features = pd.DataFrame(df['title'].apply(extract_text_features).tolist())
title_features.columns = ['title_' + col for col in title_features.columns]

print("Processing descriptions...")
desc_features = pd.DataFrame(df['description'].apply(extract_text_features).tolist())
desc_features.columns = ['desc_' + col for col in desc_features.columns]

df = pd.concat([df, title_features, desc_features], axis=1)

print("Analyzing tags...")
df['tag_count'] = df['tags'].apply(lambda x: len(x.split(',')) if x else 0)

print("Calculating correlations...")

numerical_features = [
    'view_count', 'like_count', 'dislike_count', 'comment_count',
    'days_since_published', 'duration_seconds',
    'channel_subscribers', 'channel_views', 'channel_video_count',
    'title_text_length', 'title_word_count', 'title_unique_words',
    'title_avg_word_length', 'title_sentiment_pos', 'title_sentiment_neg',
    'title_sentiment_compound', 'title_textblob_polarity',
    'desc_text_length', 'desc_word_count', 'desc_unique_words',
    'desc_avg_word_length', 'desc_sentiment_pos', 'desc_sentiment_neg',
    'desc_sentiment_compound', 'desc_textblob_polarity',
    'tag_count'
]

numerical_features = [f for f in numerical_features if f in df.columns]

corr_matrix = df[numerical_features].corr()

print("Creating visualizations...")
plt.figure(figsize=(20, 15))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm',
            center=0, vmin=-1, vmax=1, linewidths=0.5)
plt.title("Feature Correlation Matrix")
plt.tight_layout()
plt.savefig('correlation_matrix.png')
plt.close()

if 'view_count' in corr_matrix.columns:
    view_correlations = corr_matrix['view_count'].sort_values(ascending=False)
    print("\nTop Positive Correlations with View Count:")
    print(view_correlations[view_correlations > 0.3].head(10))

    print("\nTop Negative Correlations with View Count:")
    print(view_correlations[view_correlations < -0.1].head(10))

print("Saving enhanced dataset...")
df.to_csv('youtube_data_with_nlp_features.csv', index=False)

print("\nAnalysis complete!")
print(f"Enhanced dataset saved with {len(df.columns)} features")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Basic data cleaning...
Extracting NLP features...
Processing titles...
Processing descriptions...
Analyzing tags...
Calculating correlations...
Creating visualizations...

Top Positive Correlations with View Count:
view_count                 1.000000
like_count                 0.994820
channel_subscribers        0.973231
comment_count              0.873276
channel_views              0.765968
title_textblob_polarity    0.510496
title_sentiment_pos        0.414290
Name: view_count, dtype: float64

Top Negative Correlations with View Count:
desc_avg_word_length   -0.104113
tag_count              -0.109293
title_text_length      -0.145141
Name: view_count, dtype: float64
Saving enhanced dataset...

Analysis complete!
Enhanced dataset saved with 56 features


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Loading dataset...")
df = pd.read_csv('youtube_data_with_nlp_features.csv')

print("\nExploratory Data Analysis...")

df = df.dropna(subset=['view_count'])
df = df[~np.isinf(df['view_count'])]

features = [
    'like_count', 'dislike_count', 'comment_count',
    'days_since_published', 'duration_seconds',
    'channel_subscribers', 'channel_views', 'channel_video_count',
    'title_text_length', 'title_word_count', 'title_unique_words',
    'title_avg_word_length', 'title_sentiment_pos', 'title_sentiment_neg',
    'title_sentiment_compound', 'title_textblob_polarity',
    'desc_text_length', 'desc_word_count', 'desc_unique_words',
    'desc_avg_word_length', 'desc_sentiment_pos', 'desc_sentiment_neg',
    'desc_sentiment_compound', 'desc_textblob_polarity',
    'tag_count'
]

features = [f for f in features if f in df.columns]

print("Missing values in selected features:")
print(df[features].isnull().sum())

for col in features:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(df[col].median())

print("\nPerforming feature selection...")
X = df[features]
y = df['view_count']

selector = SelectKBest(score_func=f_regression, k=15)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

print("Selected features:", selected_features)

X = df[selected_features]

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

print("\nTraining models...")
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

results = []

for name, model in zip(models.keys(), models.values()):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'name': name,
        'pipeline': pipeline,
        'mse': mse,
        'mae': mae,
        'r2': r2,
        'predictions': y_pred
    })

    print(f"\n{name} Results:")
    print(f"MSE: {mse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.2f}")

print("\nPerforming hyperparameter tuning...")
rf_params = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5]
}

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

grid_search = GridSearchCV(
    rf_pipeline,
    rf_params,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

tuned_mse = mean_squared_error(y_test, y_pred_tuned)
tuned_mae = mean_absolute_error(y_test, y_pred_tuned)
tuned_r2 = r2_score(y_test, y_pred_tuned)

print("\nTuned Random Forest Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"MSE: {tuned_mse:.2f}")
print(f"MAE: {tuned_mae:.2f}")
print(f"R2 Score: {tuned_r2:.2f}")

print("\nPerforming cross-validation...")
cv_scores = cross_val_score(
    best_model,
    X,
    y,
    cv=5,
    scoring='r2'
)
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

print("\nAnalyzing feature importance...")
if isinstance(best_model.named_steps['model'], RandomForestRegressor):
    importances = best_model.named_steps['model'].feature_importances_
    feature_importance = pd.DataFrame({
        'Feature': selected_features,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

print("\nPerforming residual analysis...")
residuals = y_test - y_pred_tuned
plt.figure(figsize=(10, 6))
plt.scatter(y_pred_tuned, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.savefig('residual_plot.png')
plt.close()

print("\nFinal Model Evaluation Summary:")
print("Model: Tuned Random Forest")
print(f"MSE: {tuned_mse:.2f}")
print(f"MAE: {tuned_mae:.2f}")
print(f"R2 Score: {tuned_r2:.2f}")
print(f"Cross-validation R2 Mean: {cv_scores.mean():.2f}")
print("\nTop 5 Important Features:")
print(feature_importance.head().to_string())

print("\nSaving model...")
joblib.dump(best_model, 'youtube_views_model.pkl')

print("\nMachine Learning pipeline complete!")
print("Model saved as 'youtube_views_model.pkl'")
print("Visualizations saved: correlation_matrix.png, feature_importance.png, residual_plot.png")

Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'youtube_data_with_nlp_features.csv'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Loading dataset...")
df = pd.read_csv('youtube_data_with_nlp_features.csv')

print("\nExploratory Data Analysis...")
df = df.dropna(subset=['view_count'])
df = df[~np.isinf(df['view_count'])]

y = np.log1p(df['view_count'])

features = [
    'days_since_published', 'duration_seconds',
    'channel_subscribers', 'channel_views', 'channel_video_count',
    'title_text_length', 'title_word_count', 'title_unique_words',
    'title_avg_word_length', 'title_sentiment_pos', 'title_sentiment_neg',
    'title_sentiment_compound', 'title_textblob_polarity',
    'desc_text_length', 'desc_word_count', 'desc_unique_words',
    'desc_avg_word_length', 'desc_sentiment_pos', 'desc_sentiment_neg',
    'desc_sentiment_compound', 'desc_textblob_polarity',
    'tag_count'
]

features = [f for f in features if f in df.columns]

print("Missing values in selected features:")
print(df[features].isnull().sum())

for col in features:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(df[col].median())

print("\nPerforming feature selection...")
X = df[features]

selector = SelectKBest(score_func=f_regression, k=15)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

print("Selected features:", selected_features)

X = df[selected_features]

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

print("\nTraining models...")
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

results = []

for name, model in zip(models.keys(), models.values()):
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    y_test_inv = np.expm1(y_test)
    y_pred_inv = np.expm1(y_pred)

    mse = mean_squared_error(y_test_inv, y_pred_inv)
    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    percentage_error = np.mean(np.abs((y_test_inv - y_pred_inv) / y_test_inv)) * 100

    results.append({
        'name': name,
        'pipeline': pipeline,
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'percentage_error': percentage_error,
        'predictions': y_pred
    })

    print(f"\n{name} Results:")
    print(f"MSE: {mse:,.2f}")
    print(f"MAE: {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2 Score: {r2:.2f}")
    print(f"Percentage Error: {percentage_error:.2f}%")

print("\nPerforming hyperparameter tuning...")
rf_params = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [10, 20, 30, None],
    'model__min_samples_split': [2, 5, 10]
}

rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

grid_search = GridSearchCV(
    rf_pipeline,
    rf_params,
    cv=5,
    scoring='r2',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

y_test_inv_tuned = np.expm1(y_test)
y_pred_inv_tuned = np.expm1(y_pred_tuned)

tuned_mse = mean_squared_error(y_test_inv_tuned, y_pred_inv_tuned)
tuned_mae = mean_absolute_error(y_test_inv_tuned, y_pred_inv_tuned)
tuned_rmse = np.sqrt(tuned_mse)
tuned_r2 = r2_score(y_test, y_pred_tuned)
tuned_percentage_error = np.mean(np.abs((y_test_inv_tuned - y_pred_inv_tuned) / y_test_inv_tuned)) * 100

print("\nTuned Random Forest Results:")
print(f"Best parameters: {grid_search.best_params_}")
print(f"MSE: {tuned_mse:,.2f}")
print(f"MAE: {tuned_mae:,.2f}")
print(f"RMSE: {tuned_rmse:,.2f}")
print(f"R2 Score: {tuned_r2:.2f}")
print(f"Percentage Error: {tuned_percentage_error:.2f}%")

print("\nPerforming cross-validation...")
cv_scores = cross_val_score(
    best_model,
    X,
    y,
    cv=5,
    scoring='r2'
)
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

print("\nAnalyzing feature importance...")
if isinstance(best_model.named_steps['model'], RandomForestRegressor):
    importances = best_model.named_steps['model'].feature_importances_
    feature_importance = pd.DataFrame({
        'Feature': selected_features,
        'Importance': importances
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance)
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    plt.close()

print("\nPerforming residual analysis...")
residuals = y_test - y_pred_tuned
plt.figure(figsize=(10, 6))
plt.scatter(y_pred_tuned, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Log(View Count)')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.savefig('residual_plot.png')
plt.close()

print("\nFinal Model Evaluation Summary:")
print("Model: Tuned Random Forest")
print(f"MSE: {tuned_mse:,.2f}")
print(f"MAE: {tuned_mae:,.2f}")
print(f"RMSE: {tuned_rmse:,.2f}")
print(f"R2 Score: {tuned_r2:.2f}")
print(f"Percentage Error: {tuned_percentage_error:.2f}%")
print(f"Cross-validation R2 Mean: {cv_scores.mean():.2f}")
print("\nTop 5 Important Features:")
print(feature_importance.head().to_string())

print("\nSaving model...")
joblib.dump(best_model, 'youtube_views_model_improved.pkl')

print("\nMachine Learning pipeline complete!")
print("Model saved as 'youtube_views_model_improved.pkl'")
print("Visualizations saved: feature_importance.png, residual_plot.png")

Loading dataset...

Exploratory Data Analysis...
Missing values in selected features:
days_since_published        0
duration_seconds            0
channel_subscribers         0
channel_views               0
channel_video_count         0
title_text_length           0
title_word_count            0
title_unique_words          0
title_avg_word_length       0
title_sentiment_pos         0
title_sentiment_neg         0
title_sentiment_compound    0
title_textblob_polarity     0
desc_text_length            0
desc_word_count             0
desc_unique_words           0
desc_avg_word_length        0
desc_sentiment_pos          0
desc_sentiment_neg          0
desc_sentiment_compound     0
desc_textblob_polarity      0
tag_count                   0
dtype: int64

Performing feature selection...
Selected features: ['days_since_published', 'channel_subscribers', 'channel_views', 'title_avg_word_length', 'title_sentiment_pos', 'title_sentiment_neg', 'title_sentiment_compound', 'title_textblob_polarity'

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Loading dataset...")
df = pd.read_csv('youtube_data_with_nlp_features.csv')

print("\nExploratory Data Analysis...")
df = df.dropna(subset=['view_count'])
df = df[~np.isinf(df['view_count'])]

view_count_99 = np.percentile(df['view_count'], 99)
df['view_count_capped'] = np.where(df['view_count'] > view_count_99, view_count_99, df['view_count'])
y = np.log1p(df['view_count_capped'])

base_features = [
    'days_since_published', 'duration_seconds',
    'channel_subscribers', 'channel_views', 'channel_video_count',
    'title_text_length', 'title_word_count', 'title_unique_words',
    'title_avg_word_length', 'title_sentiment_pos', 'title_sentiment_neg',
    'title_sentiment_compound', 'title_textblob_polarity',
    'desc_text_length', 'desc_word_count', 'desc_unique_words',
    'desc_avg_word_length', 'desc_sentiment_pos', 'desc_sentiment_neg',
    'desc_sentiment_compound', 'desc_textblob_polarity',
    'tag_count'
]
categorical_features = ['category_id', 'region']
text_features = ['title', 'description']

base_features = [f for f in base_features if f in df.columns]
categorical_features = [f for f in categorical_features if f in df.columns]

for col in base_features:
    if df[col].dtype in ['int64', 'float64']:
        df[col] = df[col].fillna(df[col].median())

print("\nEnhancing feature engineering...")

tfidf = TfidfVectorizer(max_features=100, stop_words='english')
title_tfidf = tfidf.fit_transform(df['title'].fillna('')).toarray()
desc_tfidf = tfidf.fit_transform(df['description'].fillna('')).toarray()
tfidf_df = pd.DataFrame(np.hstack((title_tfidf, desc_tfidf)), columns=[f'tfidf_{i}' for i in range(200)])

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
cat_encoded = encoder.fit_transform(df[categorical_features])
cat_df = pd.DataFrame(cat_encoded, columns=encoder.get_feature_names_out(categorical_features))

X = pd.concat([df[base_features], tfidf_df, cat_df], axis=1)

print("\nPerforming feature selection...")
selector = SelectKBest(score_func=f_regression, k=20)
X_selected = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()].tolist()

print("Selected features:", selected_features)
X = X[selected_features]

print("\nSplitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

preprocessor = Pipeline([
    ('scaler', StandardScaler())
])

print("\nTraining models for ensemble...")
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42)
}

pipelines = {}
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    pipeline.fit(X_train, y_train)
    pipelines[name] = pipeline
    y_pred = pipeline.predict(X_test)

    y_test_inv = np.expm1(y_test)
    y_pred_inv = np.expm1(y_pred)

    mse = mean_squared_error(y_test_inv, y_pred_inv)
    mae = mean_absolute_error(y_test_inv, y_pred_inv)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    weights = 1 / y_test_inv
    weighted_mae = np.average(np.abs(y_test_inv - y_pred_inv), weights=weights)

    results[name] = {
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'weighted_mae': weighted_mae
    }

    print(f"\n{name} Results:")
    print(f"MSE: {mse:,.2f}")
    print(f"MAE: {mae:,.2f}")
    print(f"RMSE: {rmse:,.2f}")
    print(f"R2 Score: {r2:.2f}")
    print(f"Weighted MAE: {weighted_mae:,.2f}")

ensemble_pred = (pipelines['Random Forest'].predict(X_test) + pipelines['XGBoost'].predict(X_test)) / 2
y_test_inv = np.expm1(y_test)
y_pred_ensemble_inv = np.expm1(ensemble_pred)

ensemble_mse = mean_squared_error(y_test_inv, y_pred_ensemble_inv)
ensemble_mae = mean_absolute_error(y_test_inv, y_pred_ensemble_inv)
ensemble_rmse = np.sqrt(ensemble_mse)
ensemble_r2 = r2_score(y_test, ensemble_pred)
ensemble_weights = 1 / y_test_inv
ensemble_weighted_mae = np.average(np.abs(y_test_inv - y_pred_ensemble_inv), weights=ensemble_weights)

print(f"\nEnsemble Results:")
print(f"MSE: {ensemble_mse:,.2f}")
print(f"MAE: {ensemble_mae:,.2f}")
print(f"RMSE: {ensemble_rmse:,.2f}")
print(f"R2 Score: {ensemble_r2:.2f}")
print(f"Weighted MAE: {ensemble_weighted_mae:,.2f}")

print("\nPerforming stabilized cross-validation...")
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(pipelines['Random Forest'], X, y, cv=kf, scoring='r2')
print(f"Cross-validation R2 scores: {cv_scores}")
print(f"Mean CV R2 score: {cv_scores.mean():.2f} (+/- {cv_scores.std() * 2:.2f})")

print("\nAnalyzing feature importance...")
importances = pipelines['Random Forest'].named_steps['model'].feature_importances_
feature_importance = pd.DataFrame({
    'Feature': selected_features,
    'Importance': importances
}).sort_values('Importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance.head(10))
plt.title('Top 10 Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance222.png')
plt.close()

print("\nPerforming residual analysis...")
residuals = y_test - ensemble_pred
plt.figure(figsize=(10, 6))
plt.scatter(ensemble_pred, residuals, alpha=0.5)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Log(View Count)')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.tight_layout()
plt.savefig('residual_plot222.png')
plt.close()

print("\nFinal Model Evaluation Summary:")
print("Model: Ensemble (Random Forest + XGBoost)")
print(f"MSE: {ensemble_mse:,.2f}")
print(f"MAE: {ensemble_mae:,.2f}")
print(f"RMSE: {ensemble_rmse:,.2f}")
print(f"R2 Score: {ensemble_r2:.2f}")
print(f"Weighted MAE: {ensemble_weighted_mae:,.2f}")
print(f"Cross-validation R2 Mean: {cv_scores.mean():.2f}")
print("\nTop 5 Important Features:")
print(feature_importance.head().to_string())

print("\nSaving model...")
joblib.dump(pipelines, 'youtube_views_model_enhanced222.pkl')

print("\nMachine Learning pipeline complete!")
print("Model saved as 'youtube_views_model_enhanced222.pkl'")
print("Visualizations saved: feature_importance.png, residual_plot.png")

Loading dataset...

Exploratory Data Analysis...

Enhancing feature engineering...

Performing feature selection...
Selected features: ['days_since_published', 'channel_subscribers', 'channel_views', 'title_sentiment_pos', 'title_textblob_polarity', 'tfidf_0', 'tfidf_2', 'tfidf_4', 'tfidf_49', 'tfidf_53', 'tfidf_98', 'tfidf_115', 'tfidf_149', 'tfidf_150', 'tfidf_161', 'tfidf_163', 'tfidf_169', 'tfidf_175', 'tfidf_181', 'tfidf_193']

Splitting data...

Training models for ensemble...

Random Forest Results:
MSE: 1,281,868,547,382.60
MAE: 402,124.43
RMSE: 1,132,196.34
R2 Score: 0.73
Weighted MAE: 204,898.46

XGBoost Results:
MSE: 1,153,882,086,663.11
MAE: 404,258.58
RMSE: 1,074,189.04
R2 Score: 0.68
Weighted MAE: 208,552.27

Ensemble Results:
MSE: 1,211,096,474,917.07
MAE: 402,031.90
RMSE: 1,100,498.28
R2 Score: 0.72
Weighted MAE: 200,580.86

Performing stabilized cross-validation...
Cross-validation R2 scores: [0.65830453 0.77903349 0.80534478 0.78861235 0.61793222 0.7153122
 0.75948216