In [None]:
import os, requests, numpy as np, pandas as pd, tensorflow as tf
from tqdm import tqdm
from sklearn.utils import class_weight
from tensorflow.keras import layers
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

# Load the cleaned dataset and derive the binary training target.
df = pd.read_csv("cleaned_dataset.csv")

# A row is labelled viral (1) only when BOTH thresholds are strictly exceeded;
# every other row gets 0.
over_view_threshold = df["views"] > 100_000
over_like_threshold = df["likes"] > 10_000
df["is_viral"] = (over_view_threshold & over_like_threshold).astype(int)

os.makedirs("thumbnails", exist_ok=True)


def download_thumb(video_id, url):
    """Download a video's thumbnail into ``thumbnails/`` and return its path.

    A file that already exists at the target path is treated as a cache hit
    and returned without any network access.

    Args:
        video_id: Identifier used to build the file name
            (``thumbnails/<video_id>.jpg``).
        url: HTTP(S) address of the thumbnail image.

    Returns:
        The local file path on success (or on a cache hit), ``None`` when the
        download or the write fails.
    """
    path = f"thumbnails/{video_id}.jpg"
    if os.path.exists(path):
        return path

    # Write to a sibling temp file first and publish it with os.replace, so an
    # interrupted write can never leave a truncated image at `path` that a
    # later run would mistake for a valid cached thumbnail.
    tmp_path = f"{path}.part"
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, path)
    except (requests.RequestException, OSError):
        # Best effort: a thumbnail that cannot be fetched or stored is skipped.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass  # nothing more to do about a stray partial file
        return None
    return path

def _fetch_row_thumbnail(row):
    """Fetch the thumbnail for one dataframe row; returns its path or None."""
    return download_thumb(row["video_id"], row["thumbnail_link"])


# Fetch (or reuse) every thumbnail, then keep only the rows that have one.
df["thumbnail_path"] = df.apply(_fetch_row_thumbnail, axis=1)
df = df.dropna(subset=["thumbnail_path"]).reset_index(drop=True)

# tf.data pipeline
AUTOTUNE = tf.data.AUTOTUNE
IMG_SIZE = (224, 224)


def load_image(path):
    """Read one JPEG file and turn it into a ResNet50-ready image tensor.

    Args:
        path: Path of the JPEG file to read.

    Returns:
        The 3-channel image resized to ``IMG_SIZE`` and passed through the
        ResNet50 ``preprocess_input`` transform.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)  # force 3 channels
    resized = tf.image.resize(decoded, IMG_SIZE)
    return preprocess_input(resized)  # ResNet preprocessing

# TextVectorization -> TF-IDF
VOCAB = 2000
vectorize = layers.TextVectorization(
    max_tokens=VOCAB,
    output_mode="tf_idf",     # emit TF-IDF weights directly
    # Pad the feature axis to VOCAB even when the adapted vocabulary is
    # smaller, so the output width always matches the model's
    # Input(shape=(VOCAB,)) for the "title" branch.
    pad_to_max_tokens=True,
)
# Learn the vocabulary and IDF weights from the video titles.
vectorize.adapt(df["title"])

def make_example(title, path, label):
    """Bundle one row into a ``(features, label)`` pair.

    The feature dict uses the keys ``"title"`` and ``"thumbnail"``; the label
    is passed through unchanged.
    """
    features = dict(title=title, thumbnail=path)
    return features, label

def preprocess(x, y):
    """Convert one raw example into model-ready tensors.

    In this file the function is mapped over the dataset before ``.batch()``,
    so it sees a single example at a time rather than a batch.

    Args:
        x: Feature dict with a raw title string under ``"title"`` and an image
            file path under ``"thumbnail"``.
        y: Label for the example.

    Returns:
        ``(features, label)`` where ``features`` is a new dict (the input dict
        is not modified) holding the TF-IDF vector of the title and the decoded
        image, and ``label`` is ``y`` cast to float32 with a trailing axis
        added.
    """
    features = dict(x)  # shallow copy: never mutate the caller's dict
    features["title"] = vectorize(x["title"])           # TF-IDF vector, float
    features["thumbnail"] = load_image(x["thumbnail"])  # [224, 224, 3]
    label = tf.expand_dims(tf.cast(y, tf.float32), -1)
    return features, label

# Shuffle BEFORE the expensive maps: at this point each element is just a
# (title, path, label) triple, so a buffer covering the whole dataset is cheap
# and gives a full shuffle. Shuffling after `preprocess` would keep decoded
# float images in the buffer and only mix a small window of the data.
# max(..., 1) keeps the buffer size valid even for an empty dataframe.
ds = (tf.data.Dataset.from_tensor_slices(
        (df["title"], df["thumbnail_path"], df["is_viral"]))
      .shuffle(max(len(df), 1))
      .map(make_example, num_parallel_calls=AUTOTUNE)
      .map(preprocess, num_parallel_calls=AUTOTUNE)
      .batch(32).prefetch(AUTOTUNE))

# Build the two-branch model (title TF-IDF + thumbnail image).

# Text branch: project the TF-IDF vector down to 256 units, then dropout.
text_in = tf.keras.Input(shape=(VOCAB,), dtype="float32", name="title")
text_features = layers.Dense(256, activation="relu")(text_in)
text_features = layers.Dropout(0.3)(text_features)

# Image branch: ImageNet-pretrained ResNet50 used as a feature extractor.
img_in = tf.keras.Input(shape=IMG_SIZE + (3,), name="thumbnail")
base_cnn = ResNet50(include_top=False, weights="imagenet", pooling="avg")
base_cnn.trainable = False  # frozen for phase 1
# training=False is passed explicitly when calling the backbone.
image_features = base_cnn(img_in, training=False)  # [batch, 2048]

# Fuse both branches and predict a single sigmoid probability.
merged = layers.Concatenate()([text_features, image_features])
hidden = layers.Dense(128, activation="relu")(merged)
hidden = layers.Dropout(0.3)(hidden)
out = layers.Dense(1, activation="sigmoid")(hidden)

# Input keys match the feature-dict keys produced by the tf.data pipeline.
model_inputs = {"title": text_in, "thumbnail": img_in}
model = tf.keras.Model(inputs=model_inputs, outputs=out)

# Training
# Balance the loss across classes. Key each weight by its actual class label
# (not by its position in the array) so the mapping stays correct even when a
# class is missing from the data.
labels = df["is_viral"].values
classes = np.unique(labels)
cw = class_weight.compute_class_weight("balanced", classes=classes, y=labels)
class_weights = {int(c): float(w) for c, w in zip(classes, cw)}

# Phase 1: train only the new top layers (the ResNet backbone is frozen).
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history_phase1 = model.fit(ds, epochs=3, class_weight=class_weights)

# Phase 2: fine-tune the last 10 backbone layers.
# The backbone itself must be marked trainable first: while
# `base_cnn.trainable` is False it reports no trainable weights at all, so
# flipping individual inner layers to True would have no effect. After
# re-enabling it, freeze everything except the last 10 layers.
base_cnn.trainable = True
for layer in base_cnn.layers[:-10]:
    layer.trainable = False

# Re-compile so the changed trainable flags take effect.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),   # small LR for fine-tuning
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history_phase2 = model.fit(ds, epochs=5, class_weight=class_weights)