<a href="https://colab.research.google.com/github/kagdelwarsejal/youtube-adview-predictor/blob/main/YouTube_Ads_View_Prediction_using_Gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the training CSV stored there can be read.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.cm as cm
import matplotlib.pyplot as plt

# Folder on the mounted Drive that holds the project data. The trailing slash is
# required because the file name is appended by plain string concatenation.
path = "/content/drive/MyDrive/ml internship projects/"
data_train = pd.read_csv(path + "train.csv")

# Only the last expression of a cell is rendered, so the former mid-cell
# `data_train.head()` produced nothing and has been dropped.
data_train.shape

(14999, 9)

In [None]:
# Preview the first rows of the raw training frame to check column names and value formats.
data_train.head()

Unnamed: 0,vidid,adview,views,likes,dislikes,comment,published,duration,category
0,VID_18655,40,1031602,8523,363,1095,2016-09-14,PT7M37S,F
1,VID_14135,2,1707,56,2,6,2016-10-01,PT9M30S,D
2,VID_2187,1,2023,25,0,2,2016-07-02,PT2M16S,C
3,VID_23096,6,620860,777,161,153,2016-07-27,PT4M22S,H
4,VID_10175,1,666,1,0,0,2016-06-29,PT31S,D


In [None]:
!python --version


import os, re, json, math, warnings
warnings.filterwarnings("ignore")


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from joblib import dump, load
from datetime import datetime, timezone


# Single seed shared by NumPy's global RNG, the train/validation split and the regressor.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

Python 3.12.11


In [None]:
# Logical field name -> column name expected in the training CSV (identical by default).
EXPECTED = {
    field: field
    for field in ('views', 'likes', 'dislikes', 'comment', 'published', 'duration', 'category')
}


# If your CSV uses different names, rename them before the check below, e.g.:
# data_train = data_train.rename(columns={'Comments': 'comment', 'Duration': 'duration'})


# Fail fast when the CSV lacks any column the rest of the notebook relies on.
required = list(EXPECTED.values())
missing = [name for name in required if name not in data_train.columns]
if len(missing) > 0:
    raise ValueError(f"Missing required columns: {missing}")


# Working copy: the optional video id (when present) followed by the required columns.
cols = (['vidid'] if 'vidid' in data_train.columns else []) + required
df = data_train[cols].copy()
print(df.shape)

(14999, 8)


In [None]:
# ISO 8601 duration such as "PT7M37S", "PT31S" or "P1DT2H".
# Capture groups, all optional: days, hours, minutes, seconds.
DUR_RE = re.compile(r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?")


def parse_duration_seconds(x: str) -> float:
    """Convert an ISO 8601 duration string into a number of seconds.

    Parameters
    ----------
    x : str
        Duration such as ``"PT7M37S"``; surrounding whitespace is ignored.
        A day component (``"P1DT2H"``) is accepted as well.

    Returns
    -------
    float
        Total seconds as an integer value, or ``np.nan`` when ``x`` is missing,
        does not match the duration format, or contains no component at all
        (a bare ``"P"`` / ``"PT"``).
    """
    if pd.isna(x):
        return np.nan
    m = DUR_RE.fullmatch(str(x).strip())
    if m is None:
        return np.nan
    parts = m.groups()
    if all(p is None for p in parts):
        # "P"/"PT" alone states no length; treat it as invalid rather than 0 seconds.
        return np.nan
    d, h, m_, s = (int(p) if p else 0 for p in parts)
    return d*86400 + h*3600 + m_*60 + s

In [None]:
# Type coercion in one pass: counts -> numbers, publish date -> datetime,
# ISO 8601 duration -> seconds. Anything unparseable becomes NaN / NaT.
count_cols = ['views', 'likes', 'dislikes', 'comment']
df = df.assign(
    **{col: pd.to_numeric(df[col], errors='coerce') for col in count_cols},
    published=pd.to_datetime(df['published'], errors='coerce'),
    duration_seconds=df['duration'].apply(parse_duration_seconds),
)


# Basic clean: discard rows where the target or any key feature is missing
must_have = ['views', 'likes', 'dislikes', 'comment', 'published', 'duration_seconds', 'category']
before = len(df)
df = df.dropna(subset=must_have).copy()
after = len(df)
print(f"Dropped {before - after} invalid rows; remaining: {after}")

Dropped 362 invalid rows; remaining: 14637


In [None]:
# Engagement features: total interactions plus each interaction type's share of them.
likes_filled, dislikes_filled, comments_filled = (
    df[col].fillna(0) for col in ('likes', 'dislikes', 'comment')
)
eng = likes_filled + dislikes_filled + comments_filled
df['engagement'] = eng

# The +1.0 in every denominator keeps the ratios finite when a count is zero.
df['like_dislike_ratio'] = (df['likes'] + 1.0).div(df['dislikes'] + 1.0)
share_denominator = eng + 1.0
for share_col, count_col in (
    ('like_pct', 'likes'),
    ('dislike_pct', 'dislikes'),
    ('comment_pct', 'comment'),
):
    df[share_col] = (df[count_col] + 0.0) / share_denominator


# Temporal features, measured against the newest publish date found in the data.
ref_date = pd.Timestamp(df['published'].max()).tz_localize(None)

published = df['published']
df['recency_days'] = (ref_date - published).dt.days.clip(lower=0)
df['pub_year'] = published.dt.year
df['pub_month'] = published.dt.month
df['pub_dow'] = published.dt.dayofweek


# One indicator column per category value observed in the data.
cat_dummies = pd.get_dummies(df['category'].astype(str), prefix='cat')


FEATURES = [
    'likes', 'dislikes', 'comment', 'duration_seconds', 'engagement',
    'like_dislike_ratio', 'like_pct', 'dislike_pct', 'comment_pct',
    'recency_days', 'pub_year', 'pub_month', 'pub_dow',
]

# Both parts get a fresh 0..n-1 index so they line up row by row when joined.
X = pd.concat(
    [part.reset_index(drop=True) for part in (df[FEATURES], cat_dummies)],
    axis=1,
)
y = df['views'].astype(float).values


print(X.shape, len(y))
X.head(3)

(14637, 21) 14637


Unnamed: 0,likes,dislikes,comment,duration_seconds,engagement,like_dislike_ratio,like_pct,dislike_pct,comment_pct,recency_days,...,pub_month,pub_dow,cat_A,cat_B,cat_C,cat_D,cat_E,cat_F,cat_G,cat_H
0,8523.0,363.0,1095.0,457,9981.0,23.417582,0.853837,0.036365,0.109697,221,...,9,2,False,False,False,False,False,True,False,False
1,56.0,2.0,6.0,570,64.0,19.0,0.861538,0.030769,0.092308,204,...,10,5,False,False,False,True,False,False,False,False
2,25.0,0.0,2.0,136,27.0,26.0,0.892857,0.0,0.071429,295,...,7,5,False,False,True,False,False,False,False,False


In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
X, y, test_size=0.2, random_state=RANDOM_STATE
)
X_train.shape, X_valid.shape

((11709, 21), (2928, 21))

In [None]:
# Histogram-based gradient boosting regressor: 500 boosting iterations at learning rate 0.06,
# no depth limit and no L2 penalty; seeded for repeatable fits.
reg = HistGradientBoostingRegressor(
max_depth=None,
learning_rate=0.06,
max_iter=500,
l2_regularization=0.0,
random_state=RANDOM_STATE
)


# Wrap the regressor so it is trained on log1p(y) and its predictions are mapped back
# with expm1; y holds the raw view counts, so callers still see values on that scale.
model = TransformedTargetRegressor(
regressor=reg,
func=np.log1p,
inverse_func=np.expm1
)


# Fit on the training split only; the validation split is scored in the next cell.
model.fit(X_train, y_train)

In [None]:
# Score the hold-out split. Predictions come back on the original view-count scale,
# so MAE and RMSE are expressed in views.
pred_valid = model.predict(X_valid)


mae = mean_absolute_error(y_valid, pred_valid)
# RMSE is taken as the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
r2 = r2_score(y_valid, pred_valid)


print({"MAE": mae, "RMSE": rmse, "R2": r2})


# Side-by-side look at the first ten validation rows (true vs. predicted views);
# these are simply the first rows of the split, not selected extremes.
pd.DataFrame({
'y_true': y_valid[:10],
'y_pred': pred_valid[:10].round(1)
})

{'MAE': 251131.21235335013, 'RMSE': np.float64(1154445.2729415325), 'R2': 0.720289667379306}


Unnamed: 0,y_true,y_pred
0,630898.0,713243.1
1,384.0,996.9
2,36772.0,36718.8
3,147885.0,453429.3
4,285699.0,239780.9
5,1024850.0,433525.9
6,710080.0,481048.7
7,2097376.0,1464111.3
8,299262.0,372217.8
9,358309.0,424428.2


In [None]:
# Refit on every cleaned row (training + validation) before exporting.
model.fit(X, y)
# Note: this directory is 'artifact' (singular).
ARTIFACT_DIR = 'artifact'
os.makedirs(ARTIFACT_DIR, exist_ok=True)
ARTIFACT_PATH = os.path.join(ARTIFACT_DIR, 'model.joblib')


# Persist the estimator together with what inference needs to rebuild its inputs:
# the training column order and the reference date behind `recency_days`.
dump({
'model': model,
'feature_names': X.columns.tolist(),
'ref_date': str(ref_date), # keep the reference used for recency calc
}, ARTIFACT_PATH)


print('Saved →', ARTIFACT_PATH)

Saved → artifact/model.joblib


In [None]:
# Example: Build features for a single row (mimicking training pipeline)


def build_features(rows: list, feature_names: list, ref_date_str: str):
    """Turn raw video records into the feature matrix the trained model expects.

    Applies the same transformations as the training cells (engagement ratios,
    temporal features, one-hot categories) and aligns the result to the
    training column order.

    Parameters
    ----------
    rows : list of dict
        One dict per video with keys ``likes``, ``dislikes``, ``comment``,
        ``duration`` (ISO 8601, e.g. ``"PT7M29S"``), ``published`` and ``category``.
    feature_names : list of str
        Column names, in order, that the model was fitted on.
    ref_date_str : str
        Reference date used to compute ``recency_days`` (the value stored in the
        saved bundle).

    Returns
    -------
    pandas.DataFrame
        One row per input record with exactly the columns in ``feature_names``.
        One-hot columns absent from the input are filled with 0; category values
        not seen during training are dropped.

    Raises
    ------
    KeyError
        If ``rows`` is empty or any required key is missing from the records.
    """
    required_keys = ['likes', 'dislikes', 'comment', 'duration', 'published', 'category']
    tmp = pd.DataFrame(rows)
    absent = [key for key in required_keys if key not in tmp.columns]
    if absent:
        raise KeyError(f"Missing required input fields: {absent}")

    # Coerce numerics; unparseable counts are treated as 0
    for c in ['likes', 'dislikes', 'comment']:
        tmp[c] = pd.to_numeric(tmp[c], errors='coerce').fillna(0)

    # Duration seconds; unparseable durations are treated as 0
    tmp['duration_seconds'] = tmp['duration'].apply(parse_duration_seconds).fillna(0)

    # Dates and temporal features. Timestamps carrying a UTC offset (e.g. a trailing
    # "Z") are converted to UTC and made tz-naive, so subtracting them from the naive
    # reference date cannot raise; plain dates are unaffected.
    tmp['published'] = (
        pd.to_datetime(tmp['published'], errors='coerce', utc=True).dt.tz_localize(None)
    )
    ref_dt = pd.Timestamp(ref_date_str)
    if ref_dt.tzinfo is not None:
        ref_dt = ref_dt.tz_convert(None)
    tmp['recency_days'] = (ref_dt - tmp['published']).dt.days.clip(lower=0)
    tmp['pub_year'] = tmp['published'].dt.year
    tmp['pub_month'] = tmp['published'].dt.month
    tmp['pub_dow'] = tmp['published'].dt.dayofweek

    # Engagement features (the +1.0 keeps ratios finite when counts are zero)
    eng = tmp['likes'] + tmp['dislikes'] + tmp['comment']
    tmp['engagement'] = eng
    tmp['like_dislike_ratio'] = (tmp['likes'] + 1.0) / (tmp['dislikes'] + 1.0)
    tmp['like_pct'] = tmp['likes'] / (eng + 1.0)
    tmp['dislike_pct'] = tmp['dislikes'] / (eng + 1.0)
    tmp['comment_pct'] = tmp['comment'] / (eng + 1.0)

    # Categories: one-hot encode whatever values are present in this batch
    cats = pd.get_dummies(tmp['category'].astype(str), prefix='cat')

    base = tmp[[
        'likes', 'dislikes', 'comment', 'duration_seconds', 'engagement',
        'like_dislike_ratio', 'like_pct', 'dislike_pct', 'comment_pct',
        'recency_days', 'pub_year', 'pub_month', 'pub_dow']]

    # Align to the training features in one step: missing columns become 0,
    # unknown columns are dropped, and the training order is restored.
    Xnew = pd.concat([base, cats], axis=1).reindex(columns=feature_names, fill_value=0)

    return Xnew


# Load and predict: read the bundle back from disk and score one hand-written record,
# as a round-trip check of the saved artifact and of build_features.
bundle = load(ARTIFACT_PATH)
mdl = bundle['model']
feats = bundle['feature_names']
refd = bundle['ref_date']


# Raw inputs in the same shape as the training CSV columns (no engineered features).
example_inputs = [{
    'likes': 1200,
    'dislikes': 35,
    'comment': 210,
    'duration': 'PT7M29S',
    'published': '2016-09-01',
    'category': 'A'
}]

Xinfer = build_features(example_inputs, feats, refd)
print('Prediction:', float(mdl.predict(Xinfer)[0]))

Prediction: 119446.44545357888


In [None]:
import joblib
import os

# Create the "artifacts" (plural) folder if it does not exist yet.
# NOTE(review): the bundle cell writes to 'artifact/' (singular); this is a second location.
os.makedirs("artifacts", exist_ok=True)

# Save only the fitted estimator. Unlike the bundle saved earlier, this file carries
# no 'feature_names' or 'ref_date' entries.
joblib.dump(model, "artifacts/model.joblib")

print("✅ Model saved at: artifacts/model.joblib")


✅ Model saved at: artifacts/model.joblib


In [None]:
# NOTE(review): this cell performs the same save as the cell before it (same object,
# same path) and only differs in the printed message — likely a leftover duplicate.
import joblib
import os

# Create artifacts folder if it doesn't exist
os.makedirs("artifacts", exist_ok=True)

# Save the fitted estimator only (no feature names or reference date alongside it)
joblib.dump(model, "artifacts/model.joblib")

print("Model saved successfully at artifacts/model.joblib")


Model saved successfully at artifacts/model.joblib


In [None]:
# Confirm the model file exists in the artifacts folder.
!ls artifacts


model.joblib


In [None]:
# Send the saved model file to the browser as a download (Colab-only helper).
from google.colab import files
files.download("artifacts/model.joblib")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
%%writefile predict.py
import joblib
import numpy as np

# Load the trained model
model = joblib.load("artifacts/model.joblib")

# Feature order must match training
FEATURE_NAMES = [
    'views', 'likes', 'dislikes', 'comment_count',
    'published_month', 'published_year',
    'category_id', 'duration_seconds'
]

def predict(data):
    """
    data: dict containing feature values
    Example:
        {
            "views": 50000,
            "likes": 2000,
            "dislikes": 100,
            "comment_count": 300,
            "published_month": 5,
            "published_year": 2021,
            "category_id": 24,
            "duration_seconds": 210
        }
    """
    try:
        # Arrange features in correct order
        X = np.array([[data[feature] for feature in FEATURE_NAMES]])
        prediction = model.predict(X)
        return float(prediction[0])
    except Exception as e:
        return {"error": str(e)}

if __name__ == "__main__":
    # Example test run
    sample = {
        "views": 50000,
        "likes": 2000,
        "dislikes": 100,
        "comment_count": 300,
        "published_month": 5,
        "published_year": 2021,
        "category_id": 24,
        "duration_seconds": 210
    }
    print(predict(sample))


Writing predict.py


In [None]:
%%writefile requirements.txt
joblib
scikit-learn
numpy
pandas


Writing requirements.txt


In [None]:
%%writefile replicate.yaml
# Config file for Replicate deployment
# Tells Replicate how to build and run the model

name: youtube-adview-predictor
description: Predicts YouTube adview revenue based on video stats.
version: "1.0"

# Base image with Python pre-installed
python_version: "3.10"

# Install required libraries
install:
  - pip install -r requirements.txt

# The entry point for inference
run:
  command: ["python", "predict.py"]


Writing replicate.yaml


In [None]:
# List the working directory to check that the deployment files were written.
!ls


artifact   drive       replicate.yaml	 sample_data
artifacts  predict.py  requirements.txt


In [None]:
# Download the three deployment files and the saved model through the browser (Colab-only).
from google.colab import files
files.download("predict.py")
files.download("requirements.txt")
files.download("replicate.yaml")
files.download("artifacts/model.joblib")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download requirements.txt again; relies on `files` imported in an earlier cell.
files.download("requirements.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download replicate.yaml again; relies on `files` imported in an earlier cell.
files.download("replicate.yaml")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Clone the project repository into ./youtube-adview-predictor so the files can be added to it.
!git clone https://github.com/kagdelwarsejal/youtube-adview-predictor.git


Cloning into 'youtube-adview-predictor'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.


In [None]:
!mv artifacts/model.joblib youtube-adview-predictor/
!mv predict.py youtube-adview-predictor/
!mv requirements.txt youtube-adview-predictor/
!mv replicate.yaml youtube-adview-predictor/


In [None]:
# Change the notebook's working directory to the cloned repository; %cd persists for later cells.
%cd youtube-adview-predictor


/content/youtube-adview-predictor


In [None]:
# NOTE(review): this cell did not succeed when it was run (see its recorded output):
#   * the commit was rejected because no git user.name / user.email is configured;
#   * `<your-username>` is an unreplaced placeholder, and the shell reads `<...>` as redirection;
#   * the push could not obtain credentials in this environment.
# The directory is already a git repository, so `git init` only re-initialises it.
# Presumably `origin` already exists from the earlier clone — verify before re-adding it.
!git init
!git add .
!git commit -m "Initial commit for replicate deployment"
!git branch -M main
!git remote add origin https://github.com/<your-username>/youtube-adview-predictor.git
!git push -u origin main


Reinitialized existing Git repository in /content/youtube-adview-predictor/.git/
Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@cc5dcfc6cf1e.(none)')
/bin/bash: line 1: your-username: No such file or directory
fatal: could not read Username for 'https://github.com': No such device or address


In [None]:
%%writefile requirements.txt
scikit-learn==1.3.2
pandas
numpy


Overwriting requirements.txt


In [None]:
%%writefile replicate.yaml
name: ml-model
description: "Machine Learning model deployed via Replicate"
runtime: python3.9
entrypoint: predict.py
packages:
  - scikit-learn==1.3.2
  - pandas
  - numpy


Overwriting replicate.yaml


In [None]:
# List the repository directory to confirm which deployment files are present.
!ls


model.joblib  predict.py  README.md  replicate.yaml  requirements.txt


In [None]:
import os
import subprocess
from getpass import getpass

# ======== SETUP ========
GITHUB_USER = "kagdelwarsejal"
GITHUB_REPO = "youtube-adview-predictor"
# Prompted at run time so the token is never stored in the notebook.
GITHUB_TOKEN = getpass("🔑 Enter your GitHub token: ")


def _run(cmd, label=None):
    """Run a command given as an argument list and report a non-zero exit status.

    Passing a list (no shell) keeps interpolated values such as the token from
    being interpreted by a shell. Failures are printed instead of raised so the
    remaining steps still run, as they did before.

    cmd:   list of str, the program and its arguments.
    label: text shown on failure in place of the command line (use it to keep
           secrets out of the output).
    Returns the command's exit status.
    """
    status = subprocess.run(cmd).returncode
    if status != 0:
        shown = label if label is not None else " ".join(cmd)
        print(f"⚠️ Command failed with exit status {status}: {shown}")
    return status


# ======== CREATE REQUIREMENTS.TXT ========
with open("requirements.txt", "w") as f:
    f.write("scikit-learn==1.3.2\npandas\nnumpy\n")

# ======== CREATE REPLICATE.YAML ========
with open("replicate.yaml", "w") as f:
    f.write("""name: ml-model
description: "Machine Learning model deployed via Replicate"
runtime: python3.9
entrypoint: predict.py
packages:
  - scikit-learn==1.3.2
  - pandas
  - numpy
""")

# ======== CLONE REPO ========
# NOTE(review): the token is embedded in the remote URL and therefore ends up in the
# clone's .git/config; acceptable on a throwaway VM, otherwise reset the remote URL.
clone_url = f"https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/{GITHUB_REPO}.git"
_run(["git", "clone", clone_url], label=f"git clone {GITHUB_USER}/{GITHUB_REPO}")

# ======== MOVE FILES ========
_run(["cp", "-r", "artifacts", f"{GITHUB_REPO}/"])
_run(["cp", "predict.py", "requirements.txt", "replicate.yaml", f"{GITHUB_REPO}/"])

# ======== PUSH TO GITHUB ========
# The working directory change persists for the cells that follow.
os.chdir(GITHUB_REPO)
_run(["git", "add", "."])
_run(["git", "commit", "-m", "Added model artifacts and deployment files"])
_run(["git", "push", "origin", "main"])


🔑 Enter your GitHub token: ··········


0

In [None]:
# Bundle the deployment files into one archive.
# NOTE(review): the recorded output lists no entry for `artifacts` — presumably that
# folder is absent from the current working directory; confirm the model is included.
!zip -r replicate_model_files.zip artifacts predict.py requirements.txt replicate.yaml


  adding: predict.py (deflated 59%)
  adding: requirements.txt (stored 0%)
  adding: replicate.yaml (deflated 22%)


In [None]:
# Print the current working directory and list everything beneath it.
!pwd
!ls -R

/content/youtube-adview-predictor/youtube-adview-predictor
.:
predict.py  replicate_model_files.zip  requirements.txt
README.md   replicate.yaml


In [None]:
# Package the deployment files (including the previously created archive) and download the result.
# NOTE(review): replicate_model_files.zip is nested inside this archive — confirm that is intended.
from google.colab import files
!zip -r replicate_files.zip predict.py replicate.yaml requirements.txt replicate_model_files.zip README.md
files.download("replicate_files.zip")


updating: predict.py (deflated 59%)
updating: replicate.yaml (deflated 22%)
updating: requirements.txt (stored 0%)
updating: replicate_model_files.zip (stored 0%)
updating: README.md (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>