Training file to detect pass or run

In [1]:
!pip install torch
!pip install numpy
!pip install opencv-python
!pip install torchvision



In [2]:
# Import neccessary libraries
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset
import numpy as np
import cv2


In [3]:
# Make video classifier 

class VideoClassifier(torch.nn.Module):
    def __init__(self, num_classes=2):
        super(VideoClassifier, self).__init__()
        # Define layers
        # Initialize the feature extractor (ResNet)
        # We load a pre-trained ResNet-18 model
        resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
        # Remove the final fully connected layer
        self.feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])

        # The output of ResNet-50 is 2048-dimensional
        resnet_feature_size = 512

        # Create gru to process sequences of features
        self.gru = torch.nn.GRU(
            input_size=resnet_feature_size, 
            hidden_size=512, 
            num_layers=1, 
            batch_first=True, 
        )

        # Step 3: Initialize the final classification layer
        # This takes the output of the GRU and makes a prediction
        self.classifier = torch.nn.Linear(512, num_classes)

    def forward(self, x):
        # The input 'x' is expected to be a tensor of shape:
        # (batch_size, sequence_length, channels, height, width)
        
        # Get the dimensions for processing
        batch_size, sequence_length, C, H, W = x.size()

        # Reshape the input to process each frame through the ResNet
        x_reshaped = x.view(batch_size * sequence_length, C, H, W)
        
        # Pass the reshaped tensor through the feature extractor (ResNet)
        # This will get a feature vector for each frame
        features = self.feature_extractor(x_reshaped)
        
        # The features will be of shape (batch_size * sequence_length, 2048, 1, 1)
        # We need to flatten the last two dimensions to get a vector of size 2048
        features = features.view(batch_size, sequence_length, -1)

        # Pass the features through the GRU
        # The GRU processes the sequence of features
        gru_output, _ = self.gru(features)
        
        # We only need the output from the last timestep of the GRU
        # This is the final state after processing the whole play
        last_timestep_output = gru_output[:, -1, :]
        
        # Pass the last timestep output through the classifier to get the final prediction
        prediction = self.classifier(last_timestep_output)
        
        return prediction
    

In [4]:
# These are the standard normalization values for models pre-trained on ImageNet
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# The complete preprocessing pipeline for a pre-trained ResNet
preprocess = transforms.Compose([
    # Convert the NumPy array to a PIL Image object
    transforms.ToPILImage(),
    # Resize the image to the standard input size for ResNet
    transforms.Resize((224, 224)),
    # Convert the image to a PyTorch tensor
    transforms.ToTensor(),
    # Normalize the tensor's pixel values
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
])

def load_video_as_tensor(video_path, preprocess_pipeline):
    """
    Loads a video, preprocesses each frame, and returns a single tensor.
    
    Args:
        video_path (str): The path to the video file.
        preprocess_pipeline (transforms.Compose): The transformations to apply to each frame.
        
    Returns:
        torch.Tensor: A tensor of shape (sequence_length, channels, height, width).
    """
    cap = cv2.VideoCapture(video_path)
    frames_list = []
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Convert the frame from BGR (OpenCV's default) to RGB
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # Apply the preprocessing pipeline to the NumPy array
        frame_tensor = preprocess_pipeline(rgb_frame)
        frames_list.append(frame_tensor)
        
    cap.release()
    
    if frames_list:
        # Stack all the frame tensors into a single tensor
        return torch.stack(frames_list)
    else:
        return None

# --- Example Usage ---
# video_path = 'path/to/your/play.mp4'
# video_tensor = load_video_as_tensor(video_path, preprocess)

# if video_tensor is not None:
#     print(f"Loaded video tensor with shape: {video_tensor.shape}")

In [5]:
# The custom PyTorch Dataset class
class VideoDataset(Dataset):
    def __init__(self, video_paths, labels, transforms):
        self.video_paths = video_paths
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = self.video_paths[idx]
        label = self.labels[idx]
        
        video_tensor = load_video_as_tensor(video_path, self.transforms)
        
        return video_tensor, label



In [10]:
## 1. Setup: Define Hyperparameters and Device
learning_rate = 0.001
num_epochs = 10
batch_size = 2
frame_height = 224
frame_width = 224
num_channels = 3 # For RGB images


# Use GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create the Dataset and DataLoader
curr_drive = 'USCOffsensevClemsonPlays/plays_2nd_drive'
# Get every file from the drive folder using os library

import os

video_paths = [
    os.path.join(curr_drive, f) for f in os.listdir(curr_drive) if f.endswith('.mp4')
] # A list of all your video clip file paths
labels = [
    0, 1, 0, 1
]    # A list of corresponding labels (0 (run) or 1 (pass))

dataset = VideoDataset(video_paths, labels, transforms=preprocess)

from torch.utils.data import DataLoader
dataloader = DataLoader(
    dataset, 
    batch_size=batch_size, 
    shuffle=True,
    pin_memory=True,
    num_workers=4
    )
# Initialize the model, loss function, and optimizer
model = VideoClassifier(num_classes=2).to(device)
model.load_state_dict(torch.load('video_classifier_model_attempt_1.pth'))
for param in model.feature_extractor.parameters():
    param.requires_grad = False  # Freeze the feature extractor

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    filter(lambda p: p.requires_grad, model.parameters()), 
    lr=learning_rate)

# Training Loop
print("Starting training...")
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    for i, (inputs, labels) in enumerate(dataloader):
        # Move data to the device
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # The DataLoader may return None if a video is not loaded correctly
        if inputs is None:
            print(f"Skipping batch {i} due to loading error.")
            continue

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()

        del inputs, labels, outputs, loss  # Free up memory
        torch.cuda.empty_cache()  # Clear the GPU cache if using GPU
        
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}")

print("Training finished!")

Starting training...
Epoch [1/10], Loss: 1.0139
Epoch [2/10], Loss: 0.5345
Epoch [3/10], Loss: 0.3236
Epoch [4/10], Loss: 0.1410
Epoch [5/10], Loss: 0.1858
Epoch [6/10], Loss: 0.0404
Epoch [7/10], Loss: 0.0816
Epoch [8/10], Loss: 0.0114
Epoch [9/10], Loss: 0.0230
Epoch [10/10], Loss: 0.0078
Training finished!


In [7]:
# Save the model state for testing use

torch.save(model.state_dict(), 'video_classifier_model_attempt_1.pth')