## Aditya Colab

In [None]:
# Libraries Used
# Core Libraries
import os
import re
import time
import tempfile
import shutil
import queue
import threading
import concurrent.futures

# Data Handling
import pandas as pd
import numpy as np

# Computer Vision
import cv2
from google.colab.patches import cv2_imshow
from skimage.metrics import structural_similarity as ssim
import matplotlib.pyplot as plt



# Web scraping
import requests
from bs4 import BeautifulSoup

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
# Install required packages quietly
!pip install -q ultralytics
!pip install -q mediapipe

# Mediapipe and YOLO
import mediapipe as mp
from ultralytics import YOLO

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Load the training split of the Picklebot-2M dataset from the Hugging Face Hub.
# `splits` maps each split name to its JSON file inside the dataset repository.
splits = {'train': 'data/train.json', 'validation': 'data/val.json', 'test': 'data/test.json'}
df = pd.read_json(f"hf://datasets/hbfreed/Picklebot-2M/{splits['train']}")
# Confirm the load and report (rows, columns)
print("Data has been loaded", df.shape, sep="\n")

Data has been loaded
(2116274, 11)


### EDA

In [None]:
# EDA: headline counts for pitchers, pitch outcomes and pitch types
print("Number of unique pitchers:", df['pitcher'].nunique())
# Every pitch_result other than 'Ball' is counted as a strike
print("Count of strikes:", len(df.loc[df['pitch_result'] != 'Ball']))
print("Count of balls:", len(df.loc[df['pitch_result'] == 'Ball']))

# Number of distinct pitch-type codes
print("Number of unique pitches:", df['pitch'].nunique())

# Look up the first video_link containing the play id "a726eb60-fc99-45b1-b57f-485d4ff95ce5"
print("Video link with specific ID:", df.loc[df['video_link'].str.contains("a726eb60-fc99-45b1-b57f-485d4ff95ce5"), 'video_link'].values[0])

Number of unique pitchers: 1881
Count of strikes: 699313
Count of balls: 1416961
Number of unique pitches: 17
Video link with specific ID: https://baseballsavant.mlb.com/sporty-videos?playId=a726eb60-fc99-45b1-b57f-485d4ff95ce5


In [None]:
# Preview the first five rows of the loaded training split
df.head()

Unnamed: 0,date,pitch,mph,spin_rate,pitcher,batter,zone,count,inning,pitch_result,video_link
0,2021-05-01,FF,96.6,2478,"Scott, Tanner(L)","Piscotty, Stephen(R)",12,1-2,Bot 8,Ball,https://baseballsavant.mlb.com/sporty-videos?p...
1,2021-06-03,KC,80.7,2663,"Workman, Brandon(R)","Alvarez, Yordan(L)",13,0-2,Bot 9,Ball,https://baseballsavant.mlb.com/sporty-videos?p...
2,2023-04-25,SI,91.0,2055,"Pérez, Martín(L)","Maile, Luke(R)",14,0-0,Bot 3,Ball,https://baseballsavant.mlb.com/sporty-videos?p...
3,2020-09-27,FF,93.5,1715,"Rodriguez, Nivaldo(R)","Odor, Rougned(L)",11,0-2,Bot 5,Ball,https://baseballsavant.mlb.com/sporty-videos?p...
4,2017-09-23,FF,99.2,2554,"Ellington, Brian(R)","Lamb, Jake(L)",11,2-0,Bot 6,Ball,https://baseballsavant.mlb.com/sporty-videos?p...


In [None]:
# Pitchers whose total pitch count lies in the inclusive range [1200, 1300]
pitcher_counts = df['pitcher'].value_counts()
pitchers_1200_1300 = pitcher_counts[pitcher_counts.between(1200, 1300)].index.tolist()
print("Pitchers who have thrown between 1200 and 1300 pitches:", pitchers_1200_1300)

Pitchers who have thrown between 1200 and 1300 pitches: ['Thompson, Zach(R)', 'Blevins, Jerry(L)', 'Chargois, JT(R)', 'Phillips, Evan(R)', 'Hutchison, Drew(R)', 'Ventura, Yordano(R)', 'Drake, Oliver(R)', 'Staumont, Josh(R)', 'Hardy, Blaine(L)', 'Crawford, Kutter(R)', 'Bowman, Matt(R)', 'Suero, Wander(R)', 'Abreu, Bryan(R)', 'Moronta, Reyes(R)', 'Wilson, Alex(R)', 'Lambert, Peter(R)', 'Brown, Hunter(R)', 'Mize, Casey(R)', 'Jurado, Ariel(R)', 'Raley, Brooks(L)', 'Bielak, Brandon(R)', 'Osuna, Roberto(R)', 'Osich, Josh(L)', 'Adleman, Tim(R)', 'Tropeano, Nick(R)', 'Kela, Keone(R)', 'Kinley, Tyler(R)', 'May, Dustin(R)', 'Sparkman, Glenn(R)', 'Erlin, Robbie(L)', 'Johnson, Brian(L)', 'Underwood Jr., Duane(R)', 'Neverauskas, Dovydas(R)', 'Otero, Dan(R)', 'Middleton, Keynan(R)', 'Adams, Austin(R)', 'Bickford, Phil(R)', 'Falter, Bailey(L)', 'Payamps, Joel(R)', 'Oswalt, Corey(R)', 'Domínguez, Seranthony(R)']


In [None]:
# Number of pitchers whose pitch count is between 1200 and 1300 (inclusive)
len(pitchers_1200_1300)

41

In [None]:
# Collect the distinct pitch-type codes in order of first appearance
pitch_types = pd.unique(df['pitch'])
print("Unique pitch types:", pitch_types)


Unique pitch types: ['FF' 'KC' 'SI' 'SL' 'CU' 'CH' 'FC' 'FA' 'ST' 'FS' '' 'SV' 'KN' 'EP' 'SC' 'FO' 'CS']


In [None]:
# Filter for a specific pitcher. `.copy()` makes the result an independent
# DataFrame, so assigning columns to it later does not raise pandas'
# SettingWithCopyWarning.
df = df[df['pitcher'] == 'May, Dustin(R)'].copy()

In [None]:
# Size of the filtered frame before cleaning
print(df.shape)

# dtype of mph as loaded (the values arrive as strings)
print(df['mph'].dtype)

# Convert mph to numeric; unparseable values become NaN and are then dropped.
# assign() builds a new DataFrame rather than writing into a filtered slice,
# which avoids pandas' SettingWithCopyWarning.
df = df.assign(mph=pd.to_numeric(df['mph'], errors='coerce'))
df = df.dropna(subset=['mph'])

print("Shape after dropping NA:", df.shape)
print("Average Speed (mph):", df['mph'].mean())

(1233, 11)
object
Shape after dropping NA: (1233, 11)
Average Speed (mph): 94.00145985401458


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mph'] = pd.to_numeric(df['mph'], errors='coerce')


In [None]:
# Frequency of each pitch_result value for the selected pitcher
df['pitch_result'].value_counts()

Unnamed: 0_level_0,count
pitch_result,Unnamed: 1_level_1
Ball,796
Called Strike,437


In [None]:
# Summary statistics of mph: count, mean, std, min, 25th/50th/75th percentiles and max
print(df['mph'].describe())

count    1233.000000
mean       94.001460
std         4.565858
min        81.100000
25%        91.300000
50%        95.700000
75%        97.600000
max       100.500000
Name: mph, dtype: float64


In [None]:
"""
This script provides utility functions for downloading and processing baseball play videos
from MLB's Baseball Savant website.

Given a play ID or URL, it can:
1. Extract the direct video URL from the Baseball Savant webpage.
2. Download the video to a local file.
3. Convert the video into a NumPy array of frames, optionally resized or frame-limited.
4. Save the resulting NumPy array to disk for downstream machine learning or computer vision tasks.
"""

def get_video_url(play_id, timeout=30):
    """
    Resolve the direct video URL for a Baseball Savant play.

    Args:
        play_id (str): Play identifier used in the `playId` query parameter.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str | None: The `src` of the first <source> inside the first <video>
        tag, or None when the request fails, the response is not OK, or no
        such tag is present.
    """
    url = f"https://baseballsavant.mlb.com/sporty-videos?playId={play_id}"

    # Without a timeout a stalled connection blocks forever; network errors are
    # reported as "no URL" so a batch caller can skip the row and carry on.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # The video location lives in <video><source src="..."></video>
    video_tag = soup.find("video")
    if video_tag:
        source_tag = video_tag.find("source")
        if source_tag and source_tag.get("src"):
            return source_tag["src"]
    return None  # No usable <video>/<source> found


def get_video(url, timeout=30):
    """
    Resolve the direct video URL from a full Baseball Savant page URL.

    Args:
        url (str): Address of the page that embeds the video.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str | None: The `src` of the first <source> inside the first <video>
        tag, or None when the request fails, the response is not OK, or no
        such tag is present.
    """
    # Without a timeout a stalled connection blocks forever; network errors are
    # reported as "no URL" rather than raised.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return None
    if not response.ok:
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    video_tag = soup.find("video")
    if video_tag:
        source_tag = video_tag.find("source")
        if source_tag and source_tag.get("src"):
            return source_tag["src"]
    return None


def download_video(video_url, output_file="temp.mp4", timeout=30):
    """
    Stream a video from `video_url` into `output_file`.

    Args:
        video_url (str): Direct URL of the video file.
        output_file (str): Local path to write to (overwritten if it exists).
        timeout (float): Seconds to wait for the connection / each read.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (so an error page is never saved as a video).
    """
    # The context manager returns the connection to the pool even on failure
    with requests.get(video_url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(output_file, "wb") as f:
            # Stream the body in 1 MB chunks to keep memory use flat
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)


def video_to_numpy(video_path, resize=None, max_frames=None):
    """
    Decode a video file into a NumPy array of frames.

    Args:
        video_path (str): Path of the video to read with OpenCV.
        resize (tuple | None): Optional size passed to cv2.resize for every frame.
        max_frames (int | None): Stop after this many frames; None or 0 reads
            the whole video.

    Returns:
        np.ndarray: The decoded frames stacked with np.array (empty if no frame
        could be read).
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break  # End of video (or the file could not be read)
            if resize:
                frame = cv2.resize(frame, resize)
            frames.append(frame)
            if max_frames and len(frames) >= max_frames:
                break  # Frame budget reached
    finally:
        # Release the capture handle even if decoding/resizing raises
        cap.release()
    return np.array(frames)


def save_numpy_array(array, filename="video_frames.npy"):
    """Write `array` to `filename` in NumPy's binary .npy format."""
    np.save(file=filename, arr=array)


Prompt:
The moment at which the pitcher appears near the middle of the frame, and the pitcher's location in the frame, both change from video to video. We may have to use ML for this.



ChatGPT said:
You're absolutely right — since the pitcher’s timing and location vary from video to video, a fixed crop won't work well. This is a classic use case for computer vision with machine learning. Here’s a structured way to approach it:

✅ Goal:
Automatically detect and crop the pitcher across different videos with varying camera angles and timings.

🧠 Solution Options:
Option 1: Pose Estimation (Recommended)
Use a pre-trained pose estimation model like:

OpenPose

MediaPipe Pose (Google)

YOLO + Keypoint detection

✅ Pros:

Accurate for identifying human bodies.

Can isolate the pitcher based on pose and movement patterns.

Option 2: Object Detection
Train or use a model (like YOLOv5, YOLOv8, or Detectron2) to detect “pitcher” as a custom class.

✅ Pros:

Works on frame-by-frame basis.

More precise than pose estimation in cluttered scenes.

❌ Cons:

Requires labeled data (you may need to annotate a small dataset of frames showing pitchers).

🔧 Workflow Example (with MediaPipe Pose in Colab):
```python
import cv2
import mediapipe as mp

mp_pose = mp.solutions.pose
pose = mp_pose.Pose()

def detect_pitcher_bbox(video_path, max_frames=100):
    cap = cv2.VideoCapture(video_path)
    bboxes = []

    for i in range(max_frames):
        ret, frame = cap.read()
        if not ret:
            break
        h, w, _ = frame.shape

        # Run pose estimation
        results = pose.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        if results.pose_landmarks:
            # Get min/max of landmark coordinates for bounding box
            x_coords = [lmk.x for lmk in results.pose_landmarks.landmark]
            y_coords = [lmk.y for lmk in results.pose_landmarks.landmark]
            xmin, xmax = min(x_coords), max(x_coords)
            ymin, ymax = min(y_coords), max(y_coords)

            # Convert to pixel coordinates
            x, y = int(xmin * w), int(ymin * h)
            width, height = int((xmax - xmin) * w), int((ymax - ymin) * h)
            bboxes.append((x, y, width, height))

    cap.release()
    return bboxes
```
This gives you approximate bounding boxes of the pitcher from the first max_frames frames.

🧪 Next Steps:
You can either:

Crop each frame using the detected bounding box.

Track the pitcher across frames.

Save the cropped regions into a new video or a NumPy array.

Let me know if you want help setting up pose-based cropping, YOLO training with pitcher labels, or a pipeline to auto-crop and save video segments.

In [None]:
# Initialize the detector once at module level so every detection call reuses the same weights
model = YOLO('yolo11n.pt')  # You can replace this with another YOLO model variant

def detect_and_draw_all_humans(frame):
    """
    Detect every person in the central 60% (by width) of a frame.

    The outer 20% on the left and right are cropped away before running the
    module-level YOLO `model`, and the returned boxes are shifted back into the
    coordinate system of the full frame. Nothing is drawn on the frame.

    Args:
        frame (np.ndarray): BGR image frame (as decoded by OpenCV).

    Returns:
        frame (np.ndarray): The input frame, unmodified.
        detected_boxes (List[Tuple[int, int, int, int, str]]): One
            (x1, y1, x2, y2, 'person') tuple per detected person, in original
            frame coordinates.
    """
    frame_width = frame.shape[1]
    left_crop = int(frame_width * 0.2)
    right_crop = int(frame_width * 0.8)

    # Ultralytics expects NumPy inputs in OpenCV's BGR channel order, so the
    # crop is passed as-is (no BGR->RGB conversion). ascontiguousarray turns
    # the column slice into a standalone array.
    cropped_frame = np.ascontiguousarray(frame[:, left_crop:right_crop])

    # Run on the GPU when one is available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    results = model.predict(cropped_frame, device=device, verbose=False)[0]

    detected_boxes = []
    for box in results.boxes:
        cls_name = model.names[int(box.cls[0])]
        if cls_name != 'person':
            continue
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        # Shift x back by the crop offset; y is unaffected by a horizontal crop
        detected_boxes.append((x1 + left_crop, y1, x2 + left_crop, y2, cls_name))

    return frame, detected_boxes

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt'...


100%|██████████| 5.35M/5.35M [00:00<00:00, 105MB/s]


In [None]:
"""
This script processes videos from a folder by:
1. Converting them into numpy arrays.
2. Performing frame-wise similarity analysis using SSIM to detect transitions.
3. Cropping segments with meaningful action based on similarity thresholds.
4. Detecting and tracking the main person (pitcher) using YOLOv11.
5. Cropping and saving the final annotated video segments.
"""

# Load the YOLO11 nano weights once at module level (the inference device is
# chosen per call inside detect_and_draw_all_humans, not here)
model = YOLO('yolo11n.pt')

# ----------- UTILITY FUNCTIONS ----------- #
def ssim_diff(frame1, frame2):
    """Return the SSIM score between two BGR frames, compared in grayscale."""
    gray_pair = [cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in (frame1, frame2)]
    # full=True also returns the per-pixel SSIM map; only the score is used
    result = ssim(gray_pair[0], gray_pair[1], full=True)
    return result[0]

def gradient_analysis(frames_array):
    """
    Compute the SSIM similarity between each pair of consecutive frames.

    Frames are shrunk to 100x100 before comparison. Each frame is resized
    exactly once and reused as the "previous" frame of the next comparison.

    Args:
        frames_array: Sequence of BGR frames.

    Returns:
        list: len(frames_array) - 1 scores (empty for fewer than two frames);
        entry i compares frame i + 1 with frame i.
    """
    grad_arr = []
    prev_small = None
    for frame in frames_array:
        small = cv2.resize(frame, (100, 100))
        if prev_small is not None:
            grad_arr.append(ssim_diff(small, prev_small))
        prev_small = small
    return grad_arr

def crop_video(frames_array, grad_arr):
    """
    Split a frame sequence at sharp drops in frame-to-frame similarity.

    A cut happens at the first index i where grad_arr[i] < 0.4: frames before
    i form one segment, frames i and i + 1 are discarded, and the remainder is
    processed the same way. Sequences of 10 frames or fewer (or None) are
    never split.

    Returns:
        list: The segments in order; the last entry is the unsplit remainder.
    """
    segments = []
    remaining_frames, remaining_scores = frames_array, grad_arr
    while remaining_frames is not None and len(remaining_frames) > 10:
        cut = next(
            (idx for idx, score in enumerate(remaining_scores) if score < 0.4),
            None,
        )
        if cut is None:
            break
        segments.append(remaining_frames[:cut])
        # Skip the two frames on either side of the transition
        remaining_frames = remaining_frames[cut + 2:]
        remaining_scores = remaining_scores[cut + 2:]
    segments.append(remaining_frames)
    return segments

def convert_video(frames_array, output_path, fps=20.0):
    """
    Write a stack of frames to an mp4v-encoded video file.

    Args:
        frames_array (np.ndarray): Frames indexed as (frame, height, width, ...).
        output_path (str): Destination file path.
        fps (float): Frame rate of the written video (defaults to 20.0).

    Returns:
        str: `output_path`, unchanged.
    """
    height, width = frames_array.shape[1], frames_array.shape[2]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    # VideoWriter takes the frame size as (width, height)
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
    try:
        for frame in frames_array:
            out.write(frame)
    finally:
        # Always finalize the file, even if a write fails part-way
        out.release()
    return output_path

def video_to_numpy(video_path, resize=None, max_frames=None):
    """
    Decode a video file into a NumPy array of frames.

    Args:
        video_path (str): Path of the video to read with OpenCV.
        resize (tuple | None): Optional size passed to cv2.resize for every frame.
        max_frames (int | None): Stop after this many frames; None or 0 reads
            the whole video.

    Returns:
        np.ndarray: The decoded frames stacked with np.array (empty if no frame
        could be read).
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if resize:
                frame = cv2.resize(frame, resize)
            frames.append(frame)
            if max_frames and len(frames) >= max_frames:
                break
    finally:
        # Release the capture handle even if decoding/resizing raises
        cap.release()
    return np.array(frames)

def detect_and_draw_all_humans(frame):
    """
    Detect every person in the central 60% (by width) of a BGR frame.

    Runs the module-level YOLO `model` on the horizontally cropped frame and
    returns (frame, boxes), where `frame` is the unmodified input and `boxes`
    is a list of (x1, y1, x2, y2, 'person') tuples in full-frame coordinates.
    """
    frame_width = frame.shape[1]
    left_crop = int(frame_width * 0.2)
    right_crop = int(frame_width * 0.8)

    # Ultralytics expects NumPy inputs in OpenCV's BGR channel order, so the
    # crop is passed as-is (no BGR->RGB conversion). ascontiguousarray turns
    # the column slice into a standalone array.
    cropped_frame = np.ascontiguousarray(frame[:, left_crop:right_crop])
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    results = model.predict(cropped_frame, device=device, verbose=False)[0]

    detected_boxes = []
    for box in results.boxes:
        cls_name = model.names[int(box.cls[0])]
        if cls_name != 'person':
            continue
        x1, y1, x2, y2 = map(int, box.xyxy[0])
        # Shift x back by the crop offset; y is unaffected by a horizontal crop
        detected_boxes.append((x1 + left_crop, y1, x2 + left_crop, y2, cls_name))
    return frame, detected_boxes

# ----------- MAIN PROCESSING FUNCTION ----------- #
def _box_iou(b1, b2):
    """Return the intersection-over-union of two (x1, y1, x2, y2) boxes (0 when the union is empty)."""
    xa, ya = max(b1[0], b2[0]), max(b1[1], b2[1])
    xb, yb = min(b1[2], b2[2]), min(b1[3], b2[3])
    inter_area = max(0, xb - xa) * max(0, yb - ya)
    area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    union_area = area1 + area2 - inter_area
    return inter_area / union_area if union_area > 0 else 0


def process_videos_in_folder(input_folder, output_folder):
    """
    Crop every .mp4 in `input_folder` around its main person and save the result.

    For each video: decode it, split it into segments at sharp similarity
    drops, pick the tallest person detected in each segment's middle frame,
    follow that person through the segment (a detection is accepted when its
    IoU with the reference box is at least 0.5, otherwise the reference box is
    reused), and write the segment cropped to the union of all per-frame boxes
    to `output_folder` as "cropped_<segment index>_<video file name>".

    Segments are skipped when they are empty, when no person is found in the
    middle frame, or when the crop would be taller than 550 pixels.

    Args:
        input_folder (str): Folder containing the source .mp4 files.
        output_folder (str): Folder for the cropped videos (created if missing).

    Returns:
        None. Progress and skip reasons are printed.
    """
    if not os.path.isdir(input_folder):
        print(f"Error: Input folder not found at {input_folder}")
        return
    os.makedirs(output_folder, exist_ok=True)

    video_files = [f for f in os.listdir(input_folder) if f.endswith('.mp4')]
    if not video_files:
        print(f"No MP4 videos found in {input_folder}")
        return

    for video_file in video_files:
        video_path = os.path.join(input_folder, video_file)
        print(f"Processing video: {video_path}")

        frames_array = video_to_numpy(video_path)
        gradient_array = gradient_analysis(frames_array)
        cropped_frames = crop_video(frames_array, gradient_array)

        for i, frames_segment in enumerate(cropped_frames):
            # `not ndarray` raises ValueError for multi-element arrays, so test
            # for emptiness by length instead of truthiness.
            if frames_segment is None or len(frames_segment) == 0:
                continue

            mid_frame = frames_segment[len(frames_segment) // 2]
            _, mid_detected_boxes = detect_and_draw_all_humans(mid_frame)

            candidates = [
                (int(x1), int(y1), int(x2), int(y2))
                for (x1, y1, x2, y2, cls) in mid_detected_boxes if cls == 'person']

            if not candidates:
                print("No humans in middle frame. Skipping segment.")
                continue

            # Reference box: the tallest person in the middle frame
            selected_box = max(candidates, key=lambda b: b[3] - b[1])

            # One box per frame, so both lists end up non-empty for a non-empty segment
            bounding_boxes, annotated_frames = [], []
            for frame in frames_segment:
                annotated_frame, detected_boxes = detect_and_draw_all_humans(frame)
                candidates = [
                    (int(x1), int(y1), int(x2), int(y2))
                    for (x1, y1, x2, y2, cls) in detected_boxes if cls == 'person']

                best_box = max(candidates, key=lambda b: _box_iou(b, selected_box), default=None)
                if best_box is not None and _box_iou(best_box, selected_box) >= 0.5:
                    bounding_boxes.append(best_box)
                else:
                    # No sufficiently overlapping detection: fall back to the reference box
                    bounding_boxes.append(selected_box)
                annotated_frames.append(annotated_frame)

            # Union of all per-frame boxes, clamped to the frame bounds
            frame_height, frame_width = mid_frame.shape[0], mid_frame.shape[1]
            x1s, y1s, x2s, y2s = zip(*bounding_boxes)
            crop_x1, crop_y1 = max(0, min(x1s)), max(0, min(y1s))
            crop_x2, crop_y2 = min(frame_width, max(x2s)), min(frame_height, max(y2s))

            if crop_y2 - crop_y1 > 550:
                print("Cropped region too tall. Skipping.")
                continue

            final_frames = [frame[crop_y1:crop_y2, crop_x1:crop_x2] for frame in annotated_frames]
            output_filename = f"cropped_{i}_{video_file}"
            output_path = os.path.join(output_folder, output_filename)
            convert_video(np.array(final_frames), output_path)
            print(f"Saved cropped video: {output_path}")

            # Free memory
            del frames_segment, final_frames, bounding_boxes, annotated_frames


In [None]:
def average_sampled_frames_from_array(frame_array, num_samples=10):
    """
    Build one 100x100 "average frame" from evenly spaced frames of a video.

    Args:
        frame_array (np.ndarray): Frames stacked along axis 0,
            i.e. (num_frames, height, width, channels).
        num_samples (int): How many evenly spaced frames to average.

    Returns:
        np.ndarray: uint8 mean of the sampled frames, resized to (100, 100).

    Raises:
        ValueError: If the array holds fewer than `num_samples` frames.
    """
    frame_count = frame_array.shape[0]
    if num_samples > frame_count:
        raise ValueError("Array has fewer frames than the number of samples requested.")

    # Evenly spaced indices from the first to the last frame (inclusive)
    picks = np.linspace(0, frame_count - 1, num_samples, dtype=int)

    # Average in float32 to avoid uint8 overflow, then cast back (truncating)
    mean_image = frame_array[picks].astype(np.float32).mean(axis=0).astype(np.uint8)

    return cv2.resize(mean_image, (100, 100), interpolation=cv2.INTER_AREA)


In [None]:
# Open Colab's file picker to upload files into the runtime (the recorded run
# uploaded the classifier weights, cnn_model.pth)
from google.colab import files
uploaded = files.upload()

Saving cnn_model.pth to cnn_model.pth


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Define a simple CNN model architecture for binary classification
class SimpleCNN(nn.Module):
    """
    Small two-layer CNN for binary classification of 3-channel 100x100 images.

    Layout: conv(3->16) -> ReLU -> 2x2 max-pool -> conv(16->32) -> ReLU ->
    2x2 max-pool -> flatten -> fc(20000->64) -> ReLU -> fc(64->2).
    The output is raw logits (suitable for CrossEntropyLoss).
    """

    def __init__(self):
        super().__init__()
        # First convolutional layer: 3 input channels, 16 output channels, 3x3 kernel
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # Halves height and width
        # Second convolutional layer: 16 input channels, 32 output channels
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        # 100x100 input -> 25x25 after two poolings, with 32 channels
        self.fc1 = nn.Linear(32 * 25 * 25, 64)  # Intermediate dense layer
        self.fc2 = nn.Linear(64, 2)             # Final layer for 2 output classes

    def forward(self, x):
        """
        Args:
            x (torch.Tensor): (B, 3, 100, 100) batch, or a single (3, 100, 100) image.

        Returns:
            torch.Tensor: (B, 2) logits (B == 1 for a single image).

        Raises:
            RuntimeError: If the spatial size is not 100x100.
        """
        if x.dim() == 3:
            x = x.unsqueeze(0)                 # Treat a single image as a batch of one
        x = self.pool(F.relu(self.conv1(x)))  # -> (B, 16, 50, 50)
        x = self.pool(F.relu(self.conv2(x)))  # -> (B, 32, 25, 25)
        # Flatten per sample. Unlike view(-1, N), this never folds a wrong-sized
        # input into the batch dimension; the mismatch surfaces at fc1 instead.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)                     # Output logits

# ------- MODEL LOADING FOR INFERENCE ------- #

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the model
class_model = SimpleCNN()

# Load the trained weights. torch.load unpickles the file, and the checkpoint
# here is an uploaded file, so weights_only=True restricts loading to tensors
# and plain containers instead of arbitrary pickled objects.
class_model.load_state_dict(
    torch.load("cnn_model.pth", map_location=device, weights_only=True)
)

# Move model to correct device
class_model = class_model.to(device)

# Switch to inference mode (this only matters for layers such as
# dropout/batchnorm; SimpleCNN as defined in this notebook has neither)
class_model.eval()


SimpleCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (fc1): Linear(in_features=20000, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=2, bias=True)
)

In [None]:
# Report whether PyTorch can see a CUDA GPU and, if so, which device and CUDA build
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Device:", torch.cuda.get_device_name(0))  # name of device index 0
    print("CUDA Version:", torch.version.cuda)
    print("Device Count:", torch.cuda.device_count())

CUDA Available: True
GPU Device: Tesla T4
CUDA Version: 12.4
Device Count: 1


In [None]:
def process_numpy_video_for_inference(frame_array, class_model, video_name, num_samples=10, device='cuda'):
    """
    Classify a video segment as 'pitching' or 'other' from its average frame.

    The segment is reduced to one 100x100 average frame with
    average_sampled_frames_from_array, scaled to [0, 1], and passed through
    `class_model`; class index 1 maps to 'pitching', anything else to 'other'.

    Args:
        frame_array (np.ndarray): Frames of shape (num_frames, height, width, channels).
            NOTE(review): the earlier docstring said RGB, but frames decoded with
            OpenCV are BGR - confirm which order the classifier was trained on.
        class_model (torch.nn.Module): Trained classifier; it must already live on `device`.
        video_name (str): Identifier used only in the printed messages.
        num_samples (int): Number of frames to sample and average.
        device (str | torch.device): Device the input tensor is moved to.

    Returns:
        str | None: 'pitching' or 'other', or None if any step raised
        (the error is printed, not propagated).
    """
    try:
        # Collapse the segment into a single representative image
        mean_frame = average_sampled_frames_from_array(frame_array, num_samples)

        # HWC uint8 image -> 1xCxHxW float tensor scaled to [0, 1]
        batch = torch.tensor(mean_frame).permute(2, 0, 1).unsqueeze(0).float() / 255.0
        batch = batch.to(device)

        class_model.eval()
        with torch.no_grad():
            logits = class_model(batch)
            predicted = torch.max(logits, 1).indices

        if predicted.item() == 1:
            label = 'pitching'
        else:
            label = 'other'

        print(f"Processed {video_name}: {label}")
        return label

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this segment
        print(f"Error processing {video_name}: {e}")
        return None


In [None]:
# Output folder path (relative to the working directory); edit this to change
# where results go. Presumably passed as the `output_folder` argument of the
# processing functions - verify against the calling cell.
output_folder = "Video Outputs/"


def crop_video_size(frames_array, detect_and_draw_fn):
    """
    Crop a video to the smallest region covering every per-frame detection.

    `detect_and_draw_fn` is called once per frame and must return
    (annotated_frame, bbox), where bbox is an (x1, y1, x2, y2) tuple or a falsy
    value when nothing was detected. Frames without a detection are left out of
    the result; the remaining annotated frames are all cropped to the union of
    their boxes, clamped to the frame bounds.

    Args:
        frames_array (np.ndarray): Input video as a numpy array (frames, H, W, C).
        detect_and_draw_fn (function): Per-frame detector as described above.

    Returns:
        np.ndarray: The cropped frames, or `frames_array` itself when no frame
        produced a detection.
    """
    # (annotated_frame, bbox) for every frame that had a detection
    hits = []
    for raw_frame in frames_array:
        drawn, box = detect_and_draw_fn(raw_frame)
        if box:
            hits.append((drawn, box))

    if not hits:
        print("No humans detected in this video. Returning original frames.")
        return frames_array

    lefts, tops, rights, bottoms = zip(*(box for _, box in hits))
    x_lo = max(0, min(lefts))
    y_lo = max(0, min(tops))
    x_hi = min(frames_array.shape[2], max(rights))
    y_hi = min(frames_array.shape[1], max(bottoms))

    return np.array([drawn[y_lo:y_hi, x_lo:x_hi] for drawn, _ in hits])


In [None]:
# Mirror a video left-to-right
def flip_video_frames(frames_array):
    """
    Return the video with every frame mirrored horizontally.

    Args:
        frames_array (np.ndarray): Input video as a numpy array (frames, H, W, C).

    Returns:
        np.ndarray: The frames with the width axis (axis 2) reversed.
    """
    # Reversing axis 2 with a negative-step slice mirrors each frame
    return np.asanyarray(frames_array)[:, :, ::-1]

In [None]:
# Report PyTorch's current CUDA memory usage, converted from bytes to MB
print(f"Allocated memory: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Reserved memory:  {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

Allocated memory: 4.90 MB
Reserved memory:  22.00 MB


### Main Processing Function

In [None]:
def process_videos_from_df_numpy(df, starting_row, n_rows, output_folder):
    os.makedirs(output_folder, exist_ok=True)

    task_queue = queue.Queue(maxsize=1)
    stop_signal = object()

    def producer():
        print(f"Starting producer thread for rows {starting_row} to {starting_row + n_rows}")
        for index, row in df.iterrows():
            print(f"Processing row index: {index}, pitcher: {row.get('pitcher')}, video_link: {row.get('video_link')}")
            if index < starting_row:
                continue

            pitcher_name = row.get('pitcher')
            pitcher_name = re.sub(r'\W+', '', pitcher_name) if pitcher_name else "UnknownPitcher"
            pitcher_folder = os.path.join(output_folder, pitcher_name)
            os.makedirs(pitcher_folder, exist_ok=True)

            video_page_url = row.get('video_link')
            if pd.isna(video_page_url) or not video_page_url:
                print(f"Skipping row {index} due to missing video_link.")
                continue

            match = re.search(r"playId=([a-z0-9\-]+)", video_page_url)
            if not match:
                print(f"Could not extract playId from URL: {video_page_url}")
                continue

            play_id = match.group(1)
            video_url = get_video_url(play_id)

            if not video_url:
                print(f"No video URL found for playId: {play_id}")
                continue

            try:
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                    download_video(video_url, tmp.name)
                    temp_video_path = tmp.name

                frames_array = video_to_numpy(temp_video_path)
                os.remove(temp_video_path)
                task_queue.put((index, row, frames_array, play_id, pitcher_folder, pitcher_name))

            except Exception as e:
                print(f"Error in producer for play_id {play_id}: {e}")
                continue

        task_queue.put(stop_signal)

    def consumer():
        while True:
            item = task_queue.get()
            if item is stop_signal:
                break

            index, row, frames_array, play_id, pitcher_folder, pitcher_name = item
            try:
                gradient_array = gradient_analysis(frames_array)
                cropped_segments = crop_video(frames_array, gradient_array)

                for k, segment in enumerate(cropped_segments):
                    try:
                        label = process_numpy_video_for_inference(segment, class_model, play_id, num_samples=10, device=next(class_model.parameters()).device)
                        if label != 'pitching':
                            print(f"Skipping segment {k} for play_id {play_id} as it is not a pitching video.")
                            continue

                        mid_index = len(segment) // 2
                        mid_frame = segment[mid_index]
                        mid_annotated, mid_boxes = detect_and_draw_all_humans(mid_frame)
                        candidates = [(int(x1), int(y1), int(x2), int(y2)) for (x1, y1, x2, y2, cls) in mid_boxes if cls == 'person']
                        candidates = [b for b in candidates if (b[3] - b[1]) > 75 and (b[2] - b[0]) > 75]

                        if not candidates:
                            print("No humans found in the middle frame. Skipping video.")
                            continue

                        def midpoint(b):
                          return (b[0] + b[2]) / 2, (b[1] + b[3]) / 2
                        mid_y = int(mid_frame.shape[0] * 0.65)
                        selected_box = min(candidates, key=lambda b: abs(midpoint(b)[1] - mid_y))
                        selected_box = tuple(map(int, selected_box))
                        print(f"Selected box for tracking: {selected_box}")

                        def iou(b1, b2):
                            xa, ya = max(b1[0], b2[0]), max(b1[1], b2[1])
                            xb, yb = min(b1[2], b2[2]), min(b1[3], b2[3])
                            inter_area = max(0, xb - xa) * max(0, yb - ya)
                            area1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
                            area2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
                            union_area = area1 + area2 - inter_area
                            return inter_area / union_area if union_area > 0 else 0

                        tracked_boxes = [None] * len(segment)
                        tracked_boxes[mid_index] = selected_box
                        annotated_frames = [None] * len(segment)

                        prev_box = selected_box
                        for i in range(mid_index, len(segment)):
                            frame = segment[i]
                            annotated, detected_boxes = detect_and_draw_all_humans(frame)
                            annotated_frames[i] = annotated
                            candidates = [(int(x1), int(y1), int(x2), int(y2)) for (x1, y1, x2, y2, cls) in detected_boxes if cls == 'person']
                            best_box = max(candidates, key=lambda b: iou(b, prev_box), default=None) if candidates else None
                            tracked_boxes[i] = best_box if best_box and iou(best_box, prev_box) >= 0.75 else prev_box
                            prev_box = tracked_boxes[i]

                        prev_box = selected_box
                        for i in range(mid_index - 1, -1, -1):
                            frame = segment[i]
                            annotated, detected_boxes = detect_and_draw_all_humans(frame)
                            annotated_frames[i] = annotated
                            candidates = [(int(x1), int(y1), int(x2), int(y2)) for (x1, y1, x2, y2, cls) in detected_boxes if cls == 'person']
                            best_box = max(candidates, key=lambda b: iou(b, prev_box), default=None) if candidates else None
                            tracked_boxes[i] = best_box if best_box and iou(best_box, prev_box) >= 0.75 else prev_box
                            prev_box = tracked_boxes[i]

                        x1s, y1s, x2s, y2s = zip(*tracked_boxes)
                        crop_x1, crop_y1 = max(0, min(x1s)), max(0, min(y1s))
                        crop_x2, crop_y2 = min(segment[0].shape[1], max(x2s)), min(segment[0].shape[0], max(y2s))
                        padding_x = int(0.1 * (crop_x2 - crop_x1))
                        padding_y = int(0.05 * (crop_y2 - crop_y1))
                        crop_x1 = max(0, crop_x1 - padding_x)
                        crop_y1 = max(0, crop_y1 - padding_y)
                        crop_x2 = min(segment[0].shape[1], crop_x2 + padding_x)
                        crop_y2 = min(segment[0].shape[0], crop_y2 + padding_y)

                        max_dim = max(crop_y2 - crop_y1, crop_x2 - crop_x1)
                        if (crop_y2 - crop_y1) < max_dim:
                            crop_y1 = (crop_y2 + crop_y1 - max_dim)//2
                            crop_y2 = (crop_y1 + max_dim + crop_y2)//2
                        if (crop_x2 - crop_x1) < max_dim:
                            crop_x1 = (crop_x2 + crop_x1 - max_dim)//2
                            crop_x2 = (crop_x1 + max_dim + crop_x2)//2

                        if (crop_y2 - crop_y1) < 100 or (crop_x2 - crop_x1) < 100:
                            print(f"Skipping segment {k} for play_id {play_id} due to small crop size.")
                            continue

                        final_frames = [frame[crop_y1:crop_y2, crop_x1:crop_x2] for frame in annotated_frames]

                        if pitcher_name.endswith('L'):
                            print(f"Flipping video frames for left-handed pitcher: {pitcher_name}")
                            final_frames = flip_video_frames(final_frames)

                        output_filename = f"cropped_{play_id}_{k}.mp4"
                        output_path = os.path.join(pitcher_folder, output_filename)
                        result_path = convert_video(np.array(final_frames), output_path)
                        print(f"Saved cropped video to: {result_path}")

                    except Exception as e:
                        print(f"Error processing segment {k} for play_id {play_id}: {e}")
                        continue

            except Exception as e:
                print(f"Error in consumer for play_id {play_id}: {e}")

    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        executor.submit(producer)
        executor.submit(consumer)


Error processing 8b3a1b77-4907-4928-9ac5-baec37877046: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Skipping segment 0 for play_id 8b3a1b77-4907-4928-9ac5-baec37877046 as it is not a pitching video.


In [None]:
# Run the pipeline on a small range as a smoke test (the arguments 0 and 5 are
# presumably start/end row bounds into df — confirm against the function).
# Larger run, kept for reference:
#process_videos_from_df_numpy(df, 0, 1300, output_folder+f"cropping_videos/")
# NOTE(review): this call passes output_folder + "cropping_videos/", while the
# pose-overlay cell lists videos from output_folder + "cropped_videos/" —
# confirm which folder name is intended.
process_videos_from_df_numpy(df, 0, 5, output_folder+f"cropping_videos/")

Starting producer thread for rows 0 to 10
Processing row index: 121, pitcher: May, Dustin(R), video_link: https://baseballsavant.mlb.com/sporty-videos?playId=f5d533d9-0dec-44cf-9e44-1763025eeeb2
Processing row index: 903, pitcher: May, Dustin(R), video_link: https://baseballsavant.mlb.com/sporty-videos?playId=e8ffa775-fbb7-49be-99cd-b92484b1d18b
Error processing f5d533d9-0dec-44cf-9e44-1763025eeeb2: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Skipping segment 0 for play_id f5d533d9-0dec-44cf-9e44-1763025eeeb2 as it is not a pitching video.
Error processing f5d533d9-0dec-44cf-9e44-1763025eeeb2: Input type (torch.FloatTensor) and weight type (torch.cuda.FloatTensor) should be the same or input should be a MKLDNN tensor and weight is a dense tensor
Skipping segment 1 for play_id f5d533d9-0dec-44cf-9e44-1763025eeeb2 as it is not a pitching video.
Processing row index: 1188, pitche

KeyboardInterrupt: 

In [None]:
## To download the output files to the local machine
# Step 1: Zip the folder containing your output videos
shutil.make_archive("cropped_videos", 'zip', "Video Outputs/")

# Step 2: Download the zip file to your computer
# NOTE(review): `files` is presumably google.colab.files; its import is not
# visible in this part of the notebook — confirm it is imported before this cell.
files.download("cropped_videos.zip")

In [None]:

def process_video_from_url(video_page_url, output_folder, play_id_prefix="manual"):
    """
    Processes a single video URL: downloads, extracts, crops, and saves it.

    The play ID is parsed from the ``playId=`` query parameter of the page URL
    and resolved to a media URL with ``get_video_url``. The media is fetched
    into a temporary ``.mp4`` file, loaded with ``video_to_numpy``, split with
    ``gradient_analysis`` / ``crop_video``, and each resulting segment is
    cropped with ``crop_video_size`` and written with ``convert_video``.

    Args:
        video_page_url (str): URL to the video page.
        output_folder (str): Directory to save the cropped videos. It is
            created if it does not exist yet.
        play_id_prefix (str): Prefix for the play ID if not extracted from URL.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    if not video_page_url:
        print("No video URL provided.")
        return

    match = re.search(r"playId=([a-z0-9\-]+)", video_page_url)
    play_id = match.group(1) if match else play_id_prefix

    print(f"Processing video with play_id: {play_id}")
    video_url = get_video_url(play_id)
    if not video_url:
        print(f"Could not retrieve video URL for play_id: {play_id}")
        return

    # Make sure the destination exists before any segment is written.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Only reserve a unique path here; the handle is closed before the download
    # so the helper is free to open the file itself.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        temp_video_path = tmp.name

    try:
        download_video(video_url, temp_video_path)

        frames_array = video_to_numpy(temp_video_path)
        # np.shape avoids copying the frames just to report their shape.
        print(f"Frames array shape: {np.shape(frames_array)}")

        gradient_array = gradient_analysis(frames_array)
        cropped_frames = crop_video(frames_array, gradient_array)

        for i, frames in enumerate(cropped_frames):
            output_filename = os.path.join(output_folder, f"cropped_{play_id}_{i}.mp4")
            output_vid = crop_video_size(np.array(frames), detect_and_draw_lowest_human)
            convert_video(output_vid, output_filename)
            print(f"Cropped video saved to: {output_filename}")
    finally:
        # Always remove the temporary download, even when a step above fails.
        if os.path.exists(temp_video_path):
            os.remove(temp_video_path)

# Example usage: process a single video page URL.
video_page_url = "https://baseballsavant.mlb.com/sporty-videos?playId=a726eb60-fc99-45b1-b57f-485d4ff95ce5"
# NOTE: output_folder is a notebook-level global; other cells also use it as
# the base path for the "cropping_videos/", "cropped_videos/" and
# "posed_videos/" subfolders.
output_folder = "Video Outputs/"

process_video_from_url(video_page_url, output_folder)


In [None]:

# Sanity check: confirm that a NumPy array can be wrapped as a torch tensor
# in this runtime.
x = np.array([1, 2, 3])
t = torch.from_numpy(x)  # This should not raise an error


In [None]:
# Shortcuts to the MediaPipe pose solution and its drawing helpers.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils
# Single module-level Pose instance that draw_pose calls for every frame.
# NOTE(review): static_image_mode=False presumably lets the model track
# landmarks across successive calls; because the same instance is reused for
# every video, confirm that state carrying over between videos is acceptable.
pose_model = mp_pose.Pose(static_image_mode=False, model_complexity=1)

def draw_pose(frame):
    """Run the shared pose model on ``frame`` and overlay detected landmarks.

    Args:
        frame: Image passed to ``pose_model.process`` and, when landmarks are
            found, to ``mp_drawing.draw_landmarks`` as the drawing target.

    Returns:
        The same ``frame`` object that was passed in.
    """
    detection = pose_model.process(frame)
    landmarks = detection.pose_landmarks
    if not landmarks:
        # Nothing detected: hand the frame back untouched.
        return frame

    landmark_style = mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2)
    connection_style = mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=2)
    mp_drawing.draw_landmarks(
        frame,
        landmarks,
        mp_pose.POSE_CONNECTIONS,
        landmark_style,
        connection_style,
    )
    return frame

def save_video(output_path, frames, fps):
    """Write a sequence of RGB frames to an MP4 file using OpenCV.

    Args:
        output_path (str): Destination path of the video file.
        frames: Sequence of frames (a list or a NumPy array). Each frame is
            converted from RGB to BGR before it is written; the first frame's
            height and width set the video size.
        fps: Frame rate passed to ``cv2.VideoWriter``.

    Returns:
        None. When there are no frames, or the writer cannot be opened, a
        message is printed and nothing is written.
    """
    # len() works for lists and NumPy arrays alike; the truth value of a
    # multi-frame array is ambiguous and would raise.
    if frames is None or len(frames) == 0:
        print("No frames to save.")
        return

    h, w = frames[0].shape[:2]
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (w, h))
    if not out.isOpened():
        # Do not report success for a file that could not be created.
        out.release()
        print(f"Could not open video writer for: {output_path}")
        return

    try:
        for frame in frames:
            bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            out.write(bgr)
    finally:
        # Release the writer even if a frame fails to convert or write.
        out.release()

    print(f"✅ Saved pose-overlay video to: {output_path}")

def overlay_pose_and_save(video_path, output_path):
    """Overlay pose landmarks on every frame of a video and save the result.

    Frames are loaded with ``video_to_numpy``, each one is copied and passed
    through ``draw_pose``, and the stacked frames are handed to
    ``convert_video`` together with ``output_path``.

    Args:
        video_path (str): Path of the video to read.
        output_path (str): Path passed to ``convert_video`` for the output.
    """
    source_frames = video_to_numpy(video_path)
    # Work on a copy of each frame so the loaded frames are never drawn on.
    overlaid = [draw_pose(raw.copy()) for raw in source_frames]

    convert_video(np.array(overlaid), output_path)
    print(f"Pose overlay video saved to: {output_path}")


In [None]:
# prompt: create a function that lists all the mp4 videos in a given directory that start with the word cropped
import os
from IPython.display import HTML, display
from base64 import b64encode

def list_cropped_mp4_videos(directory):
    """
    Lists all MP4 video files in a given directory that start with 'cropped'.

    Args:
        directory (str): The path to the directory to search.

    Returns:
        list: Filenames (not full paths) that start with 'cropped' and end
        with '.mp4', sorted alphabetically so the result is the same on every
        run. Empty when the directory does not exist.
    """
    if not os.path.isdir(directory):
        print(f"Error: Directory not found at {directory}")
        return []

    # os.listdir yields entries in arbitrary order; sort for reproducibility.
    return sorted(
        name
        for name in os.listdir(directory)
        if name.startswith('cropped') and name.endswith('.mp4')
    )


def play_video_cv2(video_path, delay=30):
    """Display a video frame by frame in the notebook output.

    Args:
        video_path (str): Path of the video file to open.
        delay (int | float): Pause between frames, in milliseconds.

    Returns:
        None. Prints an error message if the file cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error opening video file")
            return

        # Milliseconds -> seconds; clamp so a negative delay cannot make
        # time.sleep raise.
        pause_seconds = max(delay, 0) / 1000.0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            cv2_imshow(frame)
            time.sleep(pause_seconds)
    finally:
        # Release the capture even if displaying a frame fails.
        cap.release()

# Example usage: list every cropped video and write a pose-overlay copy of
# each one under "posed_videos/".
cropped_video_files = list_cropped_mp4_videos(output_folder + "cropped_videos/")
print("Cropped MP4 videos found:")
for video_file in cropped_video_files:
  print(video_file)
  # === USAGE ===
  video_path = os.path.join(output_folder+"cropped_videos/", video_file)
  # NOTE(review): video_array is not used again in this cell, and
  # overlay_pose_and_save loads the same file itself, so this decode looks
  # redundant — confirm no later cell relies on it before removing.
  video_array = video_to_numpy(video_path)
  overlay_pose_and_save(video_path, os.path.join(output_folder+"posed_videos/", f"posed_{video_file}"))
  # Optional inline playback of the saved result (disabled):
  #display(play_video_cv2(os.path.join(output_folder+"posed_videos/", f"posed_{video_file}")))


In [None]:
# Report CUDA availability and run a small tensor-placement smoke test.
print(torch.cuda.is_available())  # True if CUDA is available
if torch.cuda.is_available():
    # These queries raise on a CPU-only runtime, so only run them when a GPU
    # is actually present.
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
# Pick the GPU when one exists, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Example: create a tensor and move it to the selected device
tensor = torch.tensor([1.0, 2.0, 3.0]).to(device)
# Check if the tensor is on the GPU
print(f"Tensor is on GPU: {tensor.is_cuda}")


