# In [None]:
import os
import pandas as pd
import numpy as np
import requests
from PIL import Image
from tqdm import tqdm
from sklearn.utils import class_weight


# Load the cleaned video metadata.
df = pd.read_csv("cleaned_dataset.csv")

# Print every column name via repr so stray whitespace or hidden characters
# in the CSV header are visible.
for column_name in df.columns:
    print(f"{column_name!r}")


# Binary target: a video counts as viral only when it exceeds BOTH thresholds
# (strictly more than 100000 views and strictly more than 10000 likes).
enough_views = df["views"].gt(100000)
enough_likes = df["likes"].gt(10000)
df["is_viral"] = (enough_views & enough_likes).astype(int)

# Local folder that holds the downloaded thumbnails.
os.makedirs("thumbnails", exist_ok=True)

def download_thumb(video_id, url):
    """Fetch a video's thumbnail into ``thumbnails/`` unless it is already there.

    The download is best-effort: any network, HTTP or file-system failure is
    reported by returning ``None`` instead of raising, so one bad row does not
    abort a bulk download.

    Args:
        video_id: Identifier used to build the file name
            ``thumbnails/{video_id}.jpg``.
        url: Address of the thumbnail image. Non-string or empty values
            (e.g. ``None`` / ``NaN`` for a missing link) are treated as
            "nothing to download".

    Returns:
        The local file path when the file already exists or was downloaded
        successfully, otherwise ``None``.
    """
    path = f"thumbnails/{video_id}.jpg"
    if os.path.exists(path):
        # Already downloaded on an earlier run; skip the network round-trip.
        return path

    # A missing link cannot be fetched; bail out before touching the network.
    if not isinstance(url, str) or not url:
        return None

    # Write to a temporary name first so an interrupted or failed write never
    # leaves a truncated ``.jpg`` that later runs would mistake for a valid
    # cached thumbnail.
    tmp_path = f"{path}.part"
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()
        with open(tmp_path, "wb") as f:
            f.write(r.content)
        os.replace(tmp_path, path)
    except (requests.RequestException, OSError, ValueError):
        # Deliberately narrower than a bare ``except`` so Ctrl-C still stops
        # a long download loop. Clean up the partial file, if any.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # Nothing was written, or it is already gone.
        return None
    return path

# Download (or reuse) one thumbnail per row; failures come back as None.
df["thumbnail_path"] = [
    download_thumb(video_id, link)
    for video_id, link in zip(df["video_id"], df["thumbnail_link"])
]
# Keep only the rows whose thumbnail is available on disk.
df = df.dropna(subset=["thumbnail_path"])



import tensorflow as tf
from tensorflow.keras import layers

# Short alias for tf.data's autotune sentinel; passed to tf.data calls below
# (e.g. prefetch) so the runtime chooses the value itself.
AUTOTUNE = tf.data.AUTOTUNE

# Image Loading Function
def load_image(path):
    """Read a JPEG from disk and return it as a 128x128 RGB tensor in [0, 1].

    Args:
        path: Location of the JPEG file to read.

    Returns:
        The decoded image, resized to 128x128 with 3 channels and divided
        by 255.
    """
    raw_bytes = tf.io.read_file(path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Every thumbnail is brought to the same spatial size.
    resized = tf.image.resize(pixels, [128, 128])
    # Scale pixel values from the 0-255 range down to 0-1.
    return resized / 255.0

# Title tokenizer: maps each title to integer token ids, using a vocabulary of
# at most 2000 entries and a fixed output length of 30 ids per title.
vectorize_layer = layers.TextVectorization(
    output_mode="int", max_tokens=2000, output_sequence_length=30
)
# Build the vocabulary from the titles in the dataset.
vectorize_layer.adapt(df["title"])

# Create Dataset
def make_example(title, thumb_path, label):
    """Bundle one sample as ``(features, label)``.

    Args:
        title: The video title.
        thumb_path: Path of the video's thumbnail file.
        label: Target value for the sample.

    Returns:
        A tuple ``(features, label)`` where ``features`` is a dict with the
        keys ``"title"`` and ``"thumbnail"``.
    """
    features = dict(title=title, thumbnail=thumb_path)
    return features, label

# One dataset element per DataFrame row: (title, thumbnail path, label),
# reshaped by make_example into ({"title", "thumbnail"}, label).
raw_columns = (df["title"], df["thumbnail_path"], df["is_viral"])
ds = tf.data.Dataset.from_tensor_slices(raw_columns).map(make_example)

# Load images & text
def preprocess(x, y):
    """Turn one raw sample into model-ready tensors.

    Args:
        x: Feature dict holding the raw title under ``"title"`` and the
            thumbnail file path under ``"thumbnail"``.
        y: The sample's label.

    Returns:
        A tuple ``(features, label)``: ``features`` is a new dict in which
        ``"title"`` has been passed through ``vectorize_layer`` and
        ``"thumbnail"`` through ``load_image``; ``label`` is ``y`` cast to
        float32 with one trailing axis added.
    """
    # Work on a shallow copy so the caller's dict is not modified in place.
    features = dict(x)
    features["title"] = vectorize_layer(x["title"])
    features["thumbnail"] = load_image(x["thumbnail"])
    # The trailing axis makes the label the same rank as the model's
    # single-unit output.
    return features, tf.expand_dims(tf.cast(y, tf.float32), axis=-1)

# Shuffle BEFORE the expensive map: at this point elements are only
# (title, path, label), so a buffer covering the whole dataset is cheap and
# gives a true global shuffle instead of a 1000-element sliding window of
# already-decoded images. max(..., 1) keeps shuffle() valid on an empty frame.
# The map then decodes images in parallel (order stays deterministic).
ds = (
    ds.shuffle(max(len(df), 1))
    .map(preprocess, num_parallel_calls=AUTOTUNE)
    .batch(32)
    .prefetch(AUTOTUNE)
)


# Text branch: token ids -> 16-dim embeddings -> average over the sequence.
text_input = tf.keras.Input(name="title", shape=(None,), dtype="int32")
token_embeddings = layers.Embedding(input_dim=2000, output_dim=16)(text_input)
x_text = layers.GlobalAveragePooling1D()(token_embeddings)

# Image branch: two 3x3 conv layers (with one max-pool between them),
# followed by a global average over the spatial dimensions.
image_input = tf.keras.Input(name="thumbnail", shape=(128, 128, 3))
conv_low = layers.Conv2D(filters=32, kernel_size=3, activation="relu")(image_input)
pooled = layers.MaxPooling2D()(conv_low)
conv_high = layers.Conv2D(filters=64, kernel_size=3, activation="relu")(pooled)
x_image = layers.GlobalAveragePooling2D()(conv_high)

# Fuse both branches and classify with a single sigmoid unit.
merged = layers.Concatenate()([x_text, x_image])
hidden = layers.Dense(units=64, activation="relu")(merged)
x = layers.Dropout(rate=0.3)(hidden)
output = layers.Dense(units=1, activation="sigmoid")(x)

# Inputs are addressed by name, matching the keys of the dataset's feature dict.
model = tf.keras.Model(
    inputs={"title": text_input, "thumbnail": image_input},
    outputs=output,
)


# Training setup for the single sigmoid output: Adam optimizer,
# binary cross-entropy loss, accuracy reported as the metric.
training_setup = {
    "optimizer": "adam",
    "loss": "binary_crossentropy",
    "metrics": ["accuracy"],
}
model.compile(**training_setup)

# Labels of the rows that are actually used for training.
labels = df["is_viral"].values

# "balanced" class weights, computed for exactly the classes present in the
# data.
classes = np.unique(labels)
weights = class_weight.compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=labels
)
# Key each weight by its real class label, not by its position in `weights`:
# the two only coincide when the present classes are exactly 0..k-1 (with
# e.g. only class 1 present, enumerate() would wrongly produce {0: ...}).
# Plain int/float keep the dict free of NumPy scalar types.
class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}
print("Class weights:", class_weights)

# Train for 10 epochs, weighting each sample's loss by its class weight.
history = model.fit(ds, epochs=10, class_weight=class_weights)