# Generating a thumbnail for each news video clip by extracting its first frame

In [None]:
import os
import cv2
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm


# Directory that is scanned for .mp4 clips, and the directory that receives
# one .jpg thumbnail per clip.
input_folder = "./TVB Scraped Videos"
output_folder = "./thumbnails"


# Create the thumbnail directory up front; an existing directory is not an error.
os.makedirs(output_folder, exist_ok=True)


def process_video(file_name):
    """Save the first frame of one video in ``input_folder`` as a JPEG thumbnail.

    Args:
        file_name: Base name of a file inside the module-level ``input_folder``.

    Returns:
        A human-readable status string describing success or failure, or
        ``None`` when ``file_name`` does not end in ``.mp4`` (the file is
        skipped untouched).
    """
    if not file_name.endswith(".mp4"):
        return None

    video_path = os.path.join(input_folder, file_name)
    # Swap only the trailing extension; str.replace would also rewrite any
    # ".mp4" that happens to occur earlier in the name.
    thumbnail_name = os.path.splitext(file_name)[0] + ".jpg"
    output_file = os.path.join(output_folder, thumbnail_name)

    cap = cv2.VideoCapture(video_path)
    try:
        success, frame = cap.read()
    finally:
        # Release on every path; previously this call sat after the returns
        # and was never reached, so each capture stayed open.
        cap.release()

    if not success:
        return f"Failed to read video: {file_name}"

    # imwrite reports failure through its return value rather than raising.
    if not cv2.imwrite(output_file, frame):
        return f"Failed to write thumbnail: {output_file}"
    return f"Saved first frame of {file_name} as {output_file}"


# Only .mp4 files are thumbnail candidates.
files = [f for f in os.listdir(input_folder) if f.endswith(".mp4")]


# Fan the per-video work out over a thread pool; tqdm tracks completion.
with ThreadPoolExecutor(max_workers=16) as executor:
    results = list(tqdm(executor.map(process_video, files), total=len(files)))


# Printing one status line per clip overflows the notebook's IOPub rate limit
# on large folders, so report only the failures followed by a summary.
failures = [r for r in results if r is not None and r.startswith("Failed")]
for failure in failures:
    print(failure)
print(
    f"Processed {len(results)} videos: "
    f"{len(results) - len(failures)} saved, {len(failures)} failed"
)


100%|██████████| 150319/150319 [1:28:15<00:00, 28.39it/s]  
IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Generating Features for each first-frame thumbnail using OpenAI's Pretrained CLIP Model

In [None]:
import os
import pandas as pd
import numpy as np
from PIL import Image
import torch
import clip
from tqdm import tqdm  


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# clip.load yields the model plus the image preprocessing callable used below.
model, preprocess = clip.load("ViT-L/14@336px", device=device)  


def load_images_and_extract_features(folder, model, preprocess, device):
    """Encode every image in ``folder`` with ``model`` and tabulate the results.

    Args:
        folder: Directory scanned (non-recursively) for files ending in
            .png, .jpg or .jpeg.
        model: Object exposing ``encode_image(tensor)``.
        preprocess: Callable that turns a PIL image into a tensor.
        device: Device the preprocessed tensor is moved to before encoding.

    Returns:
        A DataFrame indexed by file name with one row per image and columns
        ``feature_0`` .. ``feature_{d-1}``, where ``d`` is the length of the
        flattened embedding returned by the model.
    """
    file_list = [f for f in os.listdir(folder) if f.endswith(('.png', '.jpg', '.jpeg'))]

    image_names = []
    features = []
    for file_name in tqdm(file_list, desc="Processing images", unit="image"):
        image_path = os.path.join(folder, file_name)
        # Close the file handle as soon as the pixels have been converted.
        with Image.open(image_path) as raw_image:
            image = preprocess(raw_image.convert("RGB")).unsqueeze(0).to(device)
        with torch.no_grad():
            feature = model.encode_image(image).cpu().numpy().flatten()
        image_names.append(file_name)
        features.append(feature)

    # Size the columns from the embeddings actually produced, so a model with
    # a different output width does not fail at the very end of a long run.
    # With no images, keep the previous 768-column schema.
    n_dims = len(features[0]) if features else 768
    columns = [f"feature_{i}" for i in range(n_dims)]
    return pd.DataFrame(features, index=image_names, columns=columns)


# Folder holding the thumbnails to embed.
image_folder = "./TVB_clip_feature_extraction/thumbnails"  


# Embed every thumbnail; the resulting rows are keyed by image file name.
df_features = load_images_and_extract_features(image_folder, model, preprocess, device)
# Name the index so later cells can join on "image_name".
df_features.index.name = "image_name"  
print(df_features.head())  


# Persist the feature table for the later cells that read it back.
df_features.to_pickle("tvb_clip_features.pkl")

Processing images: 100%|██████████| 150316/150316 [1:19:47<00:00, 31.40image/s]


                              feature_0  feature_1  feature_2  feature_3  \
image_name                                                                 
59654a72e6038331360802e0.jpg   0.792969   0.591797  -0.376465   0.232056   
59654fede60383a6480802e0.jpg   0.248413   0.289551   0.216919   0.069458   
596556a2c5e16c4e33bc8475.jpg   0.489990   0.417480   0.080505   0.453125   
59655ff4e603831f360802e1.jpg   0.698242   0.916992   0.349121  -0.351318   
59656058c5e16c6152bc8474.jpg   0.030075   0.958984   0.365967   0.928711   

                              feature_4  feature_5  feature_6  feature_7  \
image_name                                                                 
59654a72e6038331360802e0.jpg  -0.377441  -0.150879  -0.351318   0.724121   
59654fede60383a6480802e0.jpg  -0.516113   0.211914   0.143677   0.669922   
596556a2c5e16c4e33bc8475.jpg  -0.350342  -0.025803   0.140381   0.560547   
59655ff4e603831f360802e1.jpg  -0.388184  -0.139893   0.382568   0.516602   
59656058c5e

# Manually Labelling 5,000 Thumbnails as "News Reporter Presented" and "News Reporter NOT Presented"

In [None]:
import os
import pandas as pd
from tkinter import Tk, Label, Button
from PIL import Image, ImageTk


# Folder of thumbnails to label, and the image files found directly inside it.
image_folder = "./TVB_clip_feature_extraction/thumbnails"
image_files = [
    name
    for name in os.listdir(image_folder)
    if name.endswith(('.png', '.jpg', '.jpeg'))
]

# Resume from an earlier session's CSV when one exists; otherwise start a
# fresh table in which every image is still unlabeled.
label_file = "tvb_labels.csv"
df = (
    pd.read_csv(label_file, index_col=0)
    if os.path.exists(label_file)
    else pd.DataFrame({"image_name": image_files, "is_reporter": None})
)

# Index of the first row that still lacks a label, or -1 when none remain.
unlabeled_indices = df.index[df["is_reporter"].isna()].tolist()
current_index = unlabeled_indices[0] if unlabeled_indices else -1


class LabelingApp:
    """Tk window for labelling thumbnails one at a time as True or False.

    The app works on the module-level ``df`` (columns ``image_name`` and
    ``is_reporter``), starts at the module-level ``current_index``, loads
    images from ``image_folder`` and saves to ``label_file`` after each label.

    Keyboard shortcuts: ``j`` marks True, ``k`` marks False, ``h`` steps back.

    ``current_index`` is a row label of ``df``; ``-1`` means every row is
    labelled. Stepping back does integer arithmetic on it, so the table is
    expected to carry a 0..n-1 integer index.
    """

    def __init__(self, root):
        """Build the widgets, bind the shortcuts and show the first image."""
        self.root = root
        self.root.title("Image Labeling Tool")

        self.current_index = current_index
        self.df = df

        self.image_label = Label(root)
        self.image_label.pack()

        button_font = ("Helvetica", 16, "bold")
        self.true_button = Button(root, text="True", command=self.mark_true, width=30, height=2, bg="lightgreen", font=button_font)
        self.true_button.pack(side="left", padx=20, pady=20)

        self.false_button = Button(root, text="False", command=self.mark_false, width=30, height=2, bg="lightcoral", font=button_font)
        self.false_button.pack(side="right", padx=20, pady=20)

        self.root.bind("<j>", lambda event: self.mark_true())
        self.root.bind("<k>", lambda event: self.mark_false())
        self.root.bind("<h>", lambda event: self.go_back())

        self.update_image()

    def update_image(self):
        """Show the image at ``current_index``, or the completion message."""
        if self.current_index == -1:
            # A Tk label displays its image instead of its text, so the image
            # has to be cleared for the message to become visible.
            self.image_label.config(image="", text="All images have been labeled!")
            self.image_label.image = None
            self.true_button.config(state="disabled")
            self.false_button.config(state="disabled")
            return

        # Use label-based lookup, matching how labels are written in
        # _set_label and go_back.
        img_path = os.path.join(image_folder, self.df.loc[self.current_index, "image_name"])
        with Image.open(img_path) as source:
            resized = source.resize((854, 480))
        photo = ImageTk.PhotoImage(resized)

        self.image_label.config(image=photo)
        # Keep a reference on the widget so the photo is not garbage collected.
        self.image_label.image = photo
        # Re-enable the buttons in case we arrived here from the finished state.
        self.true_button.config(state="normal")
        self.false_button.config(state="normal")
        self.root.title(f"Image {self.current_index + 1}/{len(self.df)}")

    def _set_label(self, value):
        """Record ``value`` for the current image, save, and advance."""
        if self.current_index == -1:
            # The key bindings stay active after the buttons are disabled;
            # writing to label -1 would append a bogus row to the table.
            return
        self.df.loc[self.current_index, "is_reporter"] = value
        self.save_and_next()

    def mark_true(self):
        """Label the current image as True."""
        self._set_label(True)

    def mark_false(self):
        """Label the current image as False."""
        self._set_label(False)

    def go_back(self):
        """Step to the previous row, clear its label and show it again."""
        if self.current_index == -1:
            # Everything is labelled: reopen the last row.
            previous = len(self.df) - 1
        else:
            previous = self.current_index - 1
        if previous < 0:
            return

        self.current_index = previous
        self.df.loc[previous, "is_reporter"] = None
        # Persist the cleared label so the file matches what is on screen.
        self.df.to_csv(label_file)
        self.update_image()

    def save_and_next(self):
        """Write the table to ``label_file`` and move to the first unlabeled row."""
        self.df.to_csv(label_file)

        unlabeled_indices = self.df[self.df["is_reporter"].isna()].index.tolist()
        self.current_index = unlabeled_indices[0] if unlabeled_indices else -1

        self.update_image()


# Create the Tk root window, attach the labelling UI to it, and run the
# event loop.
root = Tk()
app = LabelingApp(root)
root.mainloop()


# Using a Simple Neural Network to Classify the videos as "News Reporter Presented" and "News Reporter NOT Presented"

In [None]:
import pandas as pd

In [None]:
# Load the CLIP feature table written by the feature-extraction cell.
# NOTE(review): unpickling executes code from the file; only load pickles
# produced by this notebook.
clip_features = pd.read_pickle("tvb_clip_features.pkl")

In [None]:
# Preview a single row to check the index and feature columns.
print(clip_features.head(1))

                              feature_0  feature_1  feature_2  feature_3  \
image_name                                                                 
59654a72e6038331360802e0.jpg   0.792969   0.591797  -0.376465   0.232056   

                              feature_4  feature_5  feature_6  feature_7  \
image_name                                                                 
59654a72e6038331360802e0.jpg  -0.377441  -0.150879  -0.351318   0.724121   

                              feature_8  feature_9  ...  feature_758  \
image_name                                          ...                
59654a72e6038331360802e0.jpg   0.711914   -0.02507  ...    -0.115479   

                              feature_759  feature_760  feature_761  \
image_name                                                            
59654a72e6038331360802e0.jpg     -0.49585     0.415527     0.233276   

                              feature_762  feature_763  feature_764  \
image_name                              

In [None]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler


class ComplexClassifier(nn.Module):
    """Binary classifier: Linear -> ReLU -> Dropout -> Linear -> Sigmoid.

    Args:
        input_dim: Width of each input feature vector.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of sigmoid outputs per sample.
        dropout_rate: Dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, hidden_dim=512, output_dim=1, dropout_rate=0.5):
        super().__init__()
        # Attribute names double as state_dict keys, so they must stay stable
        # for saved checkpoints to keep loading.
        self.fc1 = nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs in (0, 1) for the batch ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.sigmoid(self.fc2(hidden))



# Manual labels: one row per thumbnail with columns image_name / is_reporter.
labels_df = pd.read_csv("tvb_labels.csv", index_col=0)

features_df = clip_features

# Inner join on "image_name", which is a column of labels_df and the index
# name of the feature table.
data = labels_df.merge(features_df, on="image_name")

# Train only on thumbnails that were actually labelled.
data = data.dropna(subset=["is_reporter"])

# Take the feature columns from the table instead of assuming 768 of them.
feature_cols = [c for c in data.columns if c.startswith("feature_")]
X = data[feature_cols].values
y = data["is_reporter"].astype(int).values

# NOTE(review): the scaler is fit on all labelled rows before the split and is
# not saved; the prediction cell re-fits an equivalent scaler on the same
# labelled rows, so changing this here requires changing it there too.
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_tensor = torch.tensor(X, dtype=torch.float32)
# BCELoss expects targets shaped like the (N, 1) model output.
y_tensor = torch.tensor(y, dtype=torch.float32).unsqueeze(1)

dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split so the reported test metrics are reproducible across runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

input_dim = X.shape[1]
model = ComplexClassifier(input_dim=input_dim, hidden_dim=512, output_dim=1, dropout_rate=0.5)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 100
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}")

# Evaluate on the held-out split with dropout disabled.
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()
        # Flatten the (batch, 1) tensors so the metrics receive 1-D labels.
        y_true.extend(targets.numpy().ravel())
        y_pred.extend(predictions.numpy().ravel())

from sklearn.metrics import classification_report, accuracy_score
print("Classification Report:")
print(classification_report(y_true, y_pred))
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")

# Save only the weights; the prediction cell rebuilds the architecture.
torch.save(model.state_dict(), "complex_classifier.pth")
print("Model saved as complex_classifier.pth")


Epoch 1/100, Loss: 0.0726
Epoch 2/100, Loss: 0.0071
Epoch 3/100, Loss: 0.0033
Epoch 4/100, Loss: 0.0014
Epoch 5/100, Loss: 0.0008
Epoch 6/100, Loss: 0.0006
Epoch 7/100, Loss: 0.0005
Epoch 8/100, Loss: 0.0004
Epoch 9/100, Loss: 0.0003
Epoch 10/100, Loss: 0.0002
Epoch 11/100, Loss: 0.0002
Epoch 12/100, Loss: 0.0001
Epoch 13/100, Loss: 0.0001
Epoch 14/100, Loss: 0.0001
Epoch 15/100, Loss: 0.0001
Epoch 16/100, Loss: 0.0001
Epoch 17/100, Loss: 0.0001
Epoch 18/100, Loss: 0.0001
Epoch 19/100, Loss: 0.0000
Epoch 20/100, Loss: 0.0001
Epoch 21/100, Loss: 0.0000
Epoch 22/100, Loss: 0.0000
Epoch 23/100, Loss: 0.0000
Epoch 24/100, Loss: 0.0001
Epoch 25/100, Loss: 0.0000
Epoch 26/100, Loss: 0.0000
Epoch 27/100, Loss: 0.0000
Epoch 28/100, Loss: 0.0000
Epoch 29/100, Loss: 0.0000
Epoch 30/100, Loss: 0.0000
Epoch 31/100, Loss: 0.0000
Epoch 32/100, Loss: 0.0000
Epoch 33/100, Loss: 0.0000
Epoch 34/100, Loss: 0.0000
Epoch 35/100, Loss: 0.0000
Epoch 36/100, Loss: 0.0000
Epoch 37/100, Loss: 0.0000
Epoch 38/1

In [None]:
import pandas as pd
import torch
from sklearn.preprocessing import StandardScaler



# Label table and the CLIP feature table loaded earlier in the notebook.
labels_df = pd.read_csv("tvb_labels.csv", index_col=0)
features_df = clip_features

# Left join keeps every row of the label table, labelled or not.
data = labels_df.merge(features_df, how="left", on="image_name")

# Partition the rows by whether a manual label exists.
is_unlabeled = data["is_reporter"].isna()
labeled_data = data[~is_unlabeled]
unlabeled_data = data[is_unlabeled]


class ComplexClassifier(torch.nn.Module):
    """Binary classifier: Linear -> ReLU -> Dropout -> Linear -> Sigmoid.

    Args:
        input_dim: Width of each input feature vector.
        hidden_dim: Width of the single hidden layer.
        output_dim: Number of sigmoid outputs per sample.
        dropout_rate: Dropout probability applied to the hidden activations.
    """

    def __init__(self, input_dim, hidden_dim=512, output_dim=1, dropout_rate=0.5):
        super().__init__()
        # Attribute names are the state_dict keys that load_state_dict matches
        # against the saved checkpoint, so they must not change.
        self.fc1 = torch.nn.Linear(in_features=input_dim, out_features=hidden_dim)
        self.relu = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(p=dropout_rate)
        self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=output_dim)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid outputs in (0, 1) for the batch ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        return self.sigmoid(self.fc2(hidden))


# Rebuild the architecture and load the trained weights for inference.
input_dim = 768
model = ComplexClassifier(input_dim=input_dim, hidden_dim=512, output_dim=1, dropout_rate=0.5)
model.load_state_dict(torch.load("complex_classifier.pth", weights_only=True))
model.eval()

feature_cols = [f"feature_{i}" for i in range(input_dim)]

if not unlabeled_data.empty:
    # Work on a copy so the assignment below does not target a slice of `data`.
    unlabeled_data = unlabeled_data.copy()

    X_labeled = labeled_data[feature_cols]
    X_unlabeled = unlabeled_data[feature_cols]

    # The left merge leaves NaN features for images without an embedding.
    # StandardScaler passes NaN through, and NaN > 0.5 is False, so such rows
    # would silently be labelled False. Predict only where features exist.
    has_features = X_unlabeled.notna().all(axis=1)
    n_missing = int((~has_features).sum())
    if n_missing:
        print(f"Skipping {n_missing} unlabeled images without CLIP features")

    if has_features.any():
        # Re-create the training-time scaling by fitting on the labelled rows.
        scaler = StandardScaler()
        scaler.fit(X_labeled)
        X_unlabeled_scaled = scaler.transform(X_unlabeled[has_features])

        X_unlabeled_tensor = torch.tensor(X_unlabeled_scaled, dtype=torch.float32)

        with torch.no_grad():
            # squeeze(1) keeps a 1-D result even for a single row.
            predictions = model(X_unlabeled_tensor).squeeze(1) > 0.5

        unlabeled_data.loc[has_features, "is_reporter"] = predictions.numpy()


# Recombine manual and predicted labels in the original row order.
labels_predicted_df = pd.concat([labeled_data, unlabeled_data]).sort_index()

# Keep only the identifying column and the label.
labels_predicted_df = labels_predicted_df[["image_name", "is_reporter"]]

labels_predicted_df.to_csv("labels_predicted.csv", index=False)
print("Predicted labels saved to labels_predicted.csv")

Predicted labels saved to labels_predicted.csv
