## Collect Data

In [None]:
# Download the UCF101 video archive and the official train/test split lists.
# NOTE(review): --no-check-certificate turns off TLS certificate verification, so
# the downloaded files are not authenticated — confirm it is still needed for this host.
!wget -q --no-check-certificate https://www.crcv.ucf.edu/data/UCF101/UCF101.rar
!wget -q --no-check-certificate https://www.crcv.ucf.edu/data/UCF101/UCF101TrainTestSplits-RecognitionTask.zip

In [None]:
%%capture
# `unrar e` extracts files without their archived directory paths, so all videos
# end up directly inside data/ (move_videos later looks them up by bare file name).
!unrar e UCF101.rar data/
# Unpack the split lists quietly (-qq).
!unzip -qq UCF101TrainTestSplits-RecognitionTask.zip

## Imports

In [None]:
from imutils import paths
from tqdm import tqdm
import pandas as pd
import numpy as np
import shutil
import cv2
import os

## Metadata Loading

In [None]:

# List of target classes
target_classes = [
    "BaseballPitch",
    "CricketBowling",
    "CricketShot",
    "SoccerJuggling",
    "SoccerPenalty"
]

# Input and output file paths.
# Later cells glob for filtered_trainlist*.txt and filtered_testlist*.txt, so
# every split list has to be filtered here. Filtering one file and re-running
# the cell by hand with another name does not survive "Restart & Run All".
split_dir = "/content/ucfTrainTestlist"  # Change this to your split-list directory
data_filenames = [
    "trainlist01",
    "trainlist02",
    "trainlist03",
    "testlist01",
    "testlist02",
    "testlist03",
]

# Function to filter classes
def filter_classes(input_path, output_path, classes):
    """Copy the lines of a split list that belong to the wanted classes.

    Each line is expected to start with "<ClassName>/"; the text before the
    first "/" is compared against ``classes`` and matching lines are written
    unchanged (including any trailing label) to ``output_path``.

    Args:
        input_path: Path of the split list to read.
        output_path: Path of the filtered list to write (overwritten).
        classes: Container of class names to keep.

    Returns:
        None. Progress and errors are reported on stdout; errors are not
        re-raised so that one bad file does not abort the whole cell.
    """
    try:
        with open(input_path, "r") as infile, open(output_path, "w") as outfile:
            for line in infile:
                # Extract the class name from the line
                class_name = line.split("/")[0]
                if class_name in classes:
                    outfile.write(line)
        print(f"Filtered list saved to {output_path}")
    except FileNotFoundError as e:
        # The missing path can be the input list OR the output's parent
        # directory, so report the path the OS actually complained about.
        print(f"File {e.filename or input_path} not found.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Filter every split list
for data_filename in data_filenames:
    input_file = f"{split_dir}/{data_filename}.txt"
    output_file = f"{split_dir}/filtered_{data_filename}.txt"
    filter_classes(input_file, output_file, target_classes)


Filtered list saved to /content/ucfTrainTestlist/filtered_testlist03.txt


In [None]:
import pandas as pd
import glob
import os

# Step 1: Collect every filtered training split list.
# glob returns matches in arbitrary order, so sort them: the row order of
# `train` (and any later seeded shuffle of it) must not depend on how the
# filesystem happens to enumerate the files.
file_pattern = "ucfTrainTestlist/filtered_trainlist*.txt"
file_paths = sorted(glob.glob(file_pattern))

# Step 2: Gather the stripped, non-empty lines of each list
all_videos = []
for path in file_paths:
    with open(path, "r") as f:
        all_videos.extend(line.strip() for line in f if line.strip())

# Step 3: Build the DataFrame, keeping one row per distinct entry.
# The original row labels are kept (gaps included); they are reset later.
train = pd.DataFrame({'video_name': all_videos}).drop_duplicates()

# Display the first few entries of the DataFrame
print(train.head())


                                    video_name
0  BaseballPitch/v_BaseballPitch_g08_c01.avi 7
1  BaseballPitch/v_BaseballPitch_g08_c02.avi 7
2  BaseballPitch/v_BaseballPitch_g08_c03.avi 7
3  BaseballPitch/v_BaseballPitch_g08_c04.avi 7
4  BaseballPitch/v_BaseballPitch_g08_c05.avi 7


In [None]:
# Number of distinct training entries collected from the filtered lists
print(train.shape[0])

740


In [None]:


# Step 1: Collect every filtered test split list.
# glob returns matches in arbitrary order, so sort them: the row order of
# `test` (and any later seeded shuffle of it) must not depend on how the
# filesystem happens to enumerate the files.
file_pattern = "ucfTrainTestlist/filtered_testlist*.txt"
file_paths = sorted(glob.glob(file_pattern))

# Step 2: Gather the stripped, non-empty lines of each list
all_videos = []
for path in file_paths:
    with open(path, "r") as f:
        all_videos.extend(line.strip() for line in f if line.strip())

# Step 3: Build the DataFrame, keeping one row per distinct entry.
# The original row labels are kept (gaps included); they are reset later.
test = pd.DataFrame({'video_name': all_videos}).drop_duplicates()

# Display the first few entries of the DataFrame
print(test.head())


                                  video_name
0  BaseballPitch/v_BaseballPitch_g01_c01.avi
1  BaseballPitch/v_BaseballPitch_g01_c02.avi
2  BaseballPitch/v_BaseballPitch_g01_c03.avi
3  BaseballPitch/v_BaseballPitch_g01_c04.avi
4  BaseballPitch/v_BaseballPitch_g01_c05.avi


In [None]:
# Number of distinct test entries collected from the filtered lists
print(test.shape[0])

617


## Utility Functions

In [None]:
def extract_tag(video_path):
    """Return the text before the first "/" of ``video_path`` (the class folder).

    If the path contains no "/", the whole string is returned.
    """
    tag, _, _ = video_path.partition("/")
    return tag

def separate_video_name(video_name):
    """Return the segment between the first and second "/" of ``video_name``.

    For "<class>/<file>" entries this is the file part. Raises ``IndexError``
    when the string contains no "/".
    """
    segments = video_name.split("/", 2)
    return segments[1]

def rectify_video_name(video_name):
    """Return ``video_name`` cut at its first space.

    Used to drop the space-separated token that follows the file name in the
    training lists (e.g. "clip.avi 7" -> "clip.avi"). Strings without a space
    are returned unchanged.
    """
    head, _, _ = video_name.partition(" ")
    return head

import os
import shutil
from tqdm import tqdm

def move_videos(df, output_dir, source_dir="data"):
    """Copy the videos listed in ``df`` from ``source_dir`` into ``output_dir``.

    Despite the name, files are copied (``shutil.copy2``), not moved, so the
    source directory is left intact.

    Args:
        df: DataFrame with a ``video_name`` column. Only the part after the
            last "/" of each value is used as the file name. The frame's index
            is irrelevant; rows are visited in order.
        output_dir: Destination directory; created (with parents) if missing.
        source_dir: Directory that holds the source video files.

    Returns:
        None. Missing source files are reported on stdout and skipped, and the
        final number of entries in ``output_dir`` is printed.
    """
    # exist_ok avoids the check-then-create race and supports nested paths
    os.makedirs(output_dir, exist_ok=True)

    # Iterate over the values themselves: df['video_name'][i] is a label
    # lookup and raises KeyError on any frame whose index is not 0..n-1
    # (for example right after drop_duplicates()).
    video_names = df["video_name"].tolist()

    for video_name in tqdm(video_names, desc="Copying videos", unit="file"):
        video_file = video_name.split("/")[-1]
        video_path = os.path.join(source_dir, video_file)

        if os.path.exists(video_path):
            shutil.copy2(video_path, output_dir)
        else:
            print(f"File not found: {video_path}")

    print()
    print(f"Total videos: {len(os.listdir(output_dir))}")

## DataFrame Preparation

In [None]:
# Split each "<class>/<file>" entry into a label column and a file-name column.
# Both new columns are derived from the same original entries.
raw_train_entries = train["video_name"]
train["tag"] = raw_train_entries.map(extract_tag)
train["video_name"] = raw_train_entries.map(separate_video_name)
train.head()

Unnamed: 0,video_name,tag
0,v_BaseballPitch_g08_c01.avi 7,BaseballPitch
1,v_BaseballPitch_g08_c02.avi 7,BaseballPitch
2,v_BaseballPitch_g08_c03.avi 7,BaseballPitch
3,v_BaseballPitch_g08_c04.avi 7,BaseballPitch
4,v_BaseballPitch_g08_c05.avi 7,BaseballPitch


In [None]:
# Training entries still carry a space-separated token after the file name
# (e.g. "... .avi 7"); keep only the file name itself.
train["video_name"] = train["video_name"].map(rectify_video_name)
train.head()

Unnamed: 0,video_name,tag
0,v_BaseballPitch_g08_c01.avi,BaseballPitch
1,v_BaseballPitch_g08_c02.avi,BaseballPitch
2,v_BaseballPitch_g08_c03.avi,BaseballPitch
3,v_BaseballPitch_g08_c04.avi,BaseballPitch
4,v_BaseballPitch_g08_c05.avi,BaseballPitch


In [None]:
# Split each "<class>/<file>" entry into a label column and a file-name column.
# Both new columns are derived from the same original entries.
raw_test_entries = test["video_name"]
test["tag"] = raw_test_entries.map(extract_tag)
test["video_name"] = raw_test_entries.map(separate_video_name)
test.head()

Unnamed: 0,video_name,tag
0,v_BaseballPitch_g01_c01.avi,BaseballPitch
1,v_BaseballPitch_g01_c02.avi,BaseballPitch
2,v_BaseballPitch_g01_c03.avi,BaseballPitch
3,v_BaseballPitch_g01_c04.avi,BaseballPitch
4,v_BaseballPitch_g01_c05.avi,BaseballPitch


In [None]:
# Renumber rows 0..n-1 (drop_duplicates left gaps in the row labels)
train_new, test_new = (
    frame.reset_index(drop=True) for frame in (train, test)
)

In [None]:
# Copy each split's videos into a folder named after the split
for split_frame, split_folder in ((train_new, "train"), (test_new, "test")):
    move_videos(split_frame, split_folder)

Copying videos: 100%|██████████| 740/740 [00:00<00:00, 942.69file/s]



Total videos: 740


Copying videos: 100%|██████████| 617/617 [00:00<00:00, 741.46file/s]


Total videos: 617





In [None]:
from sklearn.model_selection import train_test_split

# Combine the existing train and test dataframes.
# A clip may be listed in both the pooled train lists and the pooled test
# lists; keep a single row per video. Otherwise the random split below can put
# the very same clip on both sides (train/test leakage).
all_data = (
    pd.concat([train_new, test_new], ignore_index=True)
    .drop_duplicates(subset="video_name")
    .reset_index(drop=True)
)

# Perform an 80:20 split (seeded, so the partition is reproducible)
train_mew, test_mew = train_test_split(all_data, test_size=0.2, random_state=42, shuffle=True)

# Reset indices for the new dataframes
train_mew = train_mew.reset_index(drop=True)
test_mew = test_mew.reset_index(drop=True)

# Display the sizes of the new splits
print(f"Train size: {len(train_mew)}")
print(f"Test size: {len(test_mew)}")

Train size: 1085
Test size: 272


In [None]:
# Persist the final split.
# The train/ and test/ folders were filled from train_new / test_new before the
# data was re-split, so they do not match train_mew / test_mew. Rebuild each
# folder from the final split so every row of <split>.csv has its video inside
# the folder of the same name.
for split_frame, split_name in ((train_mew, "train"), (test_mew, "test")):
    shutil.rmtree(split_name, ignore_errors=True)  # drop the stale copies
    move_videos(split_frame, split_name)
    split_frame.to_csv(f"{split_name}.csv", index=False)

## Serialization

In [None]:
# -z gzip-compresses the archive so its contents match the .tar.gz name
!tar czf ucf101_sports_new.tar.gz train test train.csv test.csv

In [None]:
# Colab-only: mount Google Drive at /content/drive so the archive can be copied to it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Copy the archive into the MyDrive folder of the mounted Drive
!cp ucf101_sports_new.tar.gz /content/drive/MyDrive