In [24]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import train_test_split

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using Device: {device}")

Using Device: cuda


In [25]:
!pip install openpyxl



In [26]:
# One-off conversion of the Excel label sheet to CSV, so the rest of the
# notebook can load the labels with pd.read_csv.
df = pd.read_excel(io="../data/Train/labels_train.xlsx")
df.to_csv(path_or_buf="labels.csv", index=False)

In [27]:
# --- Configuration: everything a reader might want to tune ---
VIDEO_FOLDER = "../data/Train"  # directory handed to EngagementDataset as video_dir
LABELS_FILE = "labels.csv"      # label table, loaded with pd.read_csv before training
IMG_SIZE = 224                  # frames are resized to IMG_SIZE x IMG_SIZE pixels
SEQ_LEN = 16                    # number of frames sampled from each video
BATCH_SIZE = 4                  # videos per DataLoader batch
EPOCHS = 10                     # passes over the training split
LR = 1e-4                       # learning rate passed to Adam

In [28]:
class EngagementDataset(Dataset):
    """Video dataset yielding a fixed-length clip and a binary engagement label.

    Each item is a tuple ``(frames, label)``: ``frames`` stacks ``SEQ_LEN``
    per-frame tensors sampled evenly from one video, and ``label`` is a
    float32 scalar tensor (0.0 = low engagement, 1.0 = high engagement).

    Args:
        df: Table with a ``'video'`` column (file name) and a ``'label'``
            column (numeric engagement score); rows are read with ``iloc``.
        video_dir: Directory that contains the video files.
        transform: Optional callable applied to every RGB frame (an
            IMG_SIZE x IMG_SIZE x 3 uint8 array). When it is falsy,
            ``transforms.ToTensor()`` is used instead.
    """

    # Extensions tried, in order, when the file named in the labels is missing.
    _FALLBACK_EXTS = ('.avi', '.wmv', '.webm', '.mp4')

    def __init__(self, df, video_dir, transform=None):
        self.df = df
        self.video_dir = video_dir
        self.transform = transform

    def __len__(self):
        """Return the number of labelled videos."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the ``idx``-th video and return ``(frames, label)``."""
        row = self.df.iloc[idx]
        label = self._binarize_label(row['label'])
        frames = self._load_video(self._resolve_path(row['video']))

        # Without a user transform, fall back to a plain array -> tensor conversion.
        to_tensor = self.transform if self.transform else transforms.ToTensor()
        frames = torch.stack([to_tensor(f) for f in frames])

        return frames, torch.tensor(label, dtype=torch.float32)

    @staticmethod
    def _binarize_label(value):
        """Map an engagement score to 0.0 (low) or 1.0 (high).

        Scores 0 and 0.33 map to 0.0; scores 0.66 and 1 map to 1.0. The split
        is made at the midpoint 0.5 rather than at ``<= 0.33`` so that a
        "one third" stored as 0.3333... is still classified as low.
        """
        return 0.0 if float(value) < 0.5 else 1.0

    def _resolve_path(self, vid_name):
        """Return the path of ``vid_name`` inside ``video_dir``.

        If that exact file does not exist, the same base name is tried with
        each extension in ``_FALLBACK_EXTS``. When nothing matches, the
        original (non-existent) path is returned and ``_load_video`` deals
        with it.
        """
        vid_path = os.path.join(self.video_dir, vid_name)
        if os.path.exists(vid_path):
            return vid_path

        # splitext strips only the final extension, so names that contain
        # extra dots ("clip.v2.mp4") keep their full base name.
        base = os.path.splitext(vid_name)[0]
        for ext in self._FALLBACK_EXTS:
            candidate = os.path.join(self.video_dir, base + ext)
            if os.path.exists(candidate):
                return candidate
        return vid_path

    @staticmethod
    def _sample_indices(total_frames, seq_len):
        """Return ``seq_len`` frame indices spread evenly over ``[0, total_frames - 1]``.

        Indices repeat when ``total_frames < seq_len``.
        """
        return [int(i) for i in np.linspace(0, total_frames - 1, seq_len).astype(int)]

    def _load_video(self, path):
        """Return a list of exactly ``SEQ_LEN`` RGB uint8 frames from ``path``.

        Frames are resized to IMG_SIZE x IMG_SIZE and converted from BGR to
        RGB. Videos shorter than ``SEQ_LEN`` repeat frames at their sampled
        positions. If decoding stops early, the last decoded frame pads the
        clip. If nothing can be decoded, a warning is emitted and black frames
        are returned.
        """
        cap = cv2.VideoCapture(path)
        wanted = []
        grabbed = {}
        try:
            total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            # Some backends report 0 or a negative count for unreadable files.
            if total_frames > 0:
                wanted = self._sample_indices(total_frames, SEQ_LEN)
                needed = set(wanted)
                for i in range(total_frames):
                    if len(grabbed) == len(needed):
                        break  # every sampled frame has been decoded
                    ret, frame = cap.read()
                    if not ret:
                        break
                    if i in needed:
                        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
                        grabbed[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        finally:
            # Always free the decoder handle, including on the "no frames" path.
            cap.release()

        # Walk the sampled positions in order so repeated indices repeat the
        # frame in place instead of being pushed to the end of the clip.
        frames = [grabbed[i] for i in wanted if i in grabbed]

        if frames:
            filler = frames[-1]
        else:
            import warnings
            warnings.warn(
                f"No frames could be read from {path!r}; using black frames.",
                RuntimeWarning,
            )
            filler = np.zeros((IMG_SIZE, IMG_SIZE, 3), dtype=np.uint8)

        frames.extend([filler] * (SEQ_LEN - len(frames)))
        return frames[:SEQ_LEN]

In [None]:
class ResNetLSTM(nn.Module):
    """Per-frame ResNet-18 features -> LSTM over time -> single-logit head.

    ``forward`` takes a 5-D tensor unpacked as (batch, seq_len, channels,
    height, width) and returns one raw logit per clip, shape (batch, 1).
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

        # Keep every child module of the ResNet except the last one; the
        # remaining stack is used as the per-frame feature extractor.
        backbone_layers = list(backbone.children())
        self.features = nn.Sequential(*backbone_layers[:-1])

        # The LSTM consumes one 512-dimensional vector per frame.
        self.lstm = nn.LSTM(input_size=512, hidden_size=128, batch_first=True)

        # Small MLP head that turns the LSTM output into a single logit.
        self.classifier = nn.Sequential(
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, x):
        """Encode every frame, run the LSTM over the sequence, classify the last step."""
        n_clips, n_frames, channels, height, width = x.shape

        # Fold time into the batch axis so the CNN sees ordinary image batches.
        flat_frames = x.view(n_clips * n_frames, channels, height, width)

        # Unfold back to (batch, seq_len, 512) for the LSTM.
        frame_feats = self.features(flat_frames).view(n_clips, n_frames, 512)

        hidden_seq, _ = self.lstm(frame_feats)

        # Only the output at the final time step feeds the classifier.
        return self.classifier(hidden_seq[:, -1])

In [None]:
# Train the binary engagement classifier and report validation accuracy per epoch.
if os.path.exists(LABELS_FILE):
    # Seed torch so weight initialisation and DataLoader shuffling start from
    # the same state on every run; the train/val split uses the same seed.
    SEED = 42
    torch.manual_seed(SEED)

    df = pd.read_csv(LABELS_FILE)
    train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

    # Per-frame preprocessing; mean/std are the per-channel normalisation constants.
    tfms = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    train_ds = EngagementDataset(train_df, VIDEO_FOLDER, transform=tfms)
    val_ds = EngagementDataset(val_df, VIDEO_FOLDER, transform=tfms)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)

    model = ResNetLSTM().to(device)
    # The model emits raw logits, so the sigmoid is folded into the loss.
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=LR)

    print("Starting Training...")

    for epoch in range(EPOCHS):
        # ---- training pass ----
        model.train()
        running_loss = 0.0

        for frames, labels in train_loader:
            # Labels arrive as shape (batch,); the model outputs (batch, 1).
            frames, labels = frames.to(device), labels.to(device).unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(frames)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # ---- validation pass ----
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for frames, labels in val_loader:
                frames, labels = frames.to(device), labels.to(device).unsqueeze(1)
                outputs = model(frames)
                # Cast to float so predictions and labels are compared in the same dtype.
                preds = (torch.sigmoid(outputs) > 0.5).float()
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        # Guard the averages so an empty split reports a value instead of
        # raising ZeroDivisionError.
        avg_loss = running_loss / max(len(train_loader), 1)
        val_acc = correct / total if total else float('nan')
        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.2%}")

    torch.save(model.state_dict(), "binary_model.pth")
    print("Model Saved.")
else:
    # Make the skip visible instead of silently doing nothing.
    print(f"Labels file {LABELS_FILE!r} not found; skipping training.")

Starting Training...
Epoch 1 | Loss: 0.6788 | Val Acc: 53.33%
Epoch 2 | Loss: 0.6358 | Val Acc: 66.67%
Epoch 3 | Loss: 0.6109 | Val Acc: 66.67%
Epoch 4 | Loss: 0.5578 | Val Acc: 66.67%
Epoch 5 | Loss: 0.5534 | Val Acc: 66.67%
Epoch 6 | Loss: 0.4912 | Val Acc: 66.67%
Epoch 7 | Loss: 0.5810 | Val Acc: 66.67%
