## Setup

In [2]:
# Import Colab's Google Drive helper; the `drive` name is reused later in the
# notebook to flush and unmount.
from google.colab import drive

# Mount Drive at /content/drive (every path constant below lives under this
# mount point). This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


### Import libraries

In [3]:
import cv2
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from timeit import default_timer as timer

In [3]:
# Display the OpenCV version in use, for reproducibility of the frame extraction.
cv2.__version__

'4.6.0'

### Set constants

In [8]:
# ROOT is the project folder on the mounted Drive; the data file train.csv is
# located directly inside it and the videos sit in its train/ sub-folder.
ROOT = "/content/drive/My Drive/ITC_Bundesliga"
VIDEOS_PATH = os.path.join(ROOT, "train")
TABULAR_FILENAME = os.path.join(ROOT, "train.csv")

In [9]:
# Tolerances in seconds, one list per event type.
TOLERANCES = dict(
    challenge=[0.3, 0.4, 0.5, 0.6, 0.7],
    play=[0.15, 0.20, 0.25, 0.30, 0.35],
    throwin=[0.15, 0.20, 0.25, 0.30, 0.35],
)

# Multiplier used below to convert the tolerances from seconds to frames.
FPS = 25

# Position in each tolerance list that is used below; -1 selects the last entry.
TOLERANCE_INDEX = -1

In [10]:
# Integer class id per event name; the id is the position in this tuple.
CLASS_IDS = {
    event_name: class_id
    for class_id, event_name in enumerate(('challenge', 'throwin', 'play'))
}

## Process tabular data: functions

In [7]:
def get_event_times(TABULAR_FILENAME, video_id=None):
    """Get the times, in ms, of the events of one video from the tabular data.

    Rows whose event is 'start' or 'end' are interval markers, not events,
    and are excluded.

    Args:
        TABULAR_FILENAME: path to the tabular data (train.csv file). It must
            contain the columns 'video_id', 'time' (seconds) and 'event'.
        video_id: id of the video whose events are wanted. When omitted, the
            module-level variable `video_id` is used, which is what this
            function implicitly relied on before the parameter existed.

    Returns:
        numpy array of integer event times in milliseconds, in file order
        (empty if the video has no events).

    Raises:
        NameError: if `video_id` is not passed and no module-level
            `video_id` is defined.
    """
    if video_id is None:
        # Backward compatibility with one-argument callers: fall back to the
        # notebook-level `video_id`, but fail with an explicit message
        # instead of an opaque NameError from deep inside a pandas filter.
        try:
            video_id = globals()['video_id']
        except KeyError:
            raise NameError(
                "video_id was not passed and no module-level 'video_id' is defined"
            ) from None

    df = pd.read_csv(TABULAR_FILENAME)
    df['time_ms'] = (df['time'] * 1000).round(0).astype(int)

    is_event = ~df['event'].isin(['start', 'end'])
    of_video = df['video_id'] == video_id
    # times of events in ms
    times = df.loc[is_event & of_video, 'time_ms'].values

    return times

In [8]:
def get_positive_intervals(TABULAR_FILENAME):
    """Build the table of event intervals from the tabular data (train.csv file).

    Every row whose event is 'start' is paired, by position, with the row
    whose event is 'end' at the same rank in the file.

    Returns a DataFrame with the columns 'video_id' (taken from the 'start'
    rows), 'start_time_ms' and 'end_time_ms' (integer milliseconds).
    """
    table = pd.read_csv(TABULAR_FILENAME)
    table['time_ms'] = (table['time'] * 1000).round(0).astype(int)

    # split the interval markers into opening and closing rows
    opening_rows = table.loc[table['event'] == 'start']
    closing_rows = table.loc[table['event'] == 'end']

    columns = {
        'video_id': opening_rows['video_id'].values,
        'start_time_ms': opening_rows['time_ms'].values,
        'end_time_ms': closing_rows['time_ms'].values,
    }
    return pd.DataFrame(columns)

In [9]:
def get_neg_times(df_positive_intervals, video_id, size=100):
    """Get random times of frames that do not fall in any event interval
    (negatives) for a given video.

    Candidate times are the integer milliseconds strictly between the
    earliest interval start and the latest interval end of the video that
    are not covered by any interval (interval bounds are inclusive).

    Args:
        df_positive_intervals: DataFrame from get_positive_intervals(), with
            columns 'video_id', 'start_time_ms' and 'end_time_ms'.
        video_id: id of the video to sample from.
        size: number of distinct negative times to draw (integer).

    Returns:
        list of `size` distinct ints (times in ms), in random order.

    Raises:
        ValueError: if the video has no intervals, or if `size` is larger
            than the number of available negative times. Rejection sampling
            would loop forever in the second case.
    """
    intervals = df_positive_intervals[df_positive_intervals['video_id'] == video_id]
    if intervals.empty:
        raise ValueError(f'no positive intervals found for video_id {video_id!r}')
    if size <= 0:
        return []

    t_min = int(intervals['start_time_ms'].min())
    t_max = int(intervals['end_time_ms'].max())

    # Candidates are first_candidate .. t_max - 1; occupied[k] flags the time
    # first_candidate + k as lying inside some positive interval.
    first_candidate = t_min + 1
    n_candidates = max(t_max - first_candidate, 0)
    occupied = np.zeros(n_candidates, dtype=bool)
    for start, end in zip(intervals['start_time_ms'].to_numpy(),
                          intervals['end_time_ms'].to_numpy()):
        lo = max(int(start) - first_candidate, 0)
        hi = min(int(end) - first_candidate + 1, n_candidates)  # end is inclusive
        if hi > lo:
            occupied[lo:hi] = True

    free_times = np.flatnonzero(~occupied) + first_candidate
    if size > free_times.size:
        raise ValueError(
            f'requested {size} negative times for video_id {video_id!r}, '
            f'but only {free_times.size} are available'
        )

    # Drawing without replacement guarantees distinct times in one step.
    neg_times = np.random.choice(free_times, size=size, replace=False)
    return [int(t) for t in neg_times]

## Get frames: function

In [10]:
def get_frames_events(video_id, tabular_filename, videos_path, output_path, 
                      negatives=False, neg_size=None, npz=True, jpg=False, dim=None):
    """Get frames of events for a given video (positive or negative classes),
       save npz and/or jpg files,
       resize (optional).

       Side effect: the working directory is changed to ROOT.

       function takes:
            video_id
            tabular_filename: filename for tabular data
            videos_path: path to folder where the relevant video is located
            output_path: path to folder where npz and/or jpg files will be saved
            negatives: whether we are taking samples of negative class. 
                If True, we take only negatives, not positives. If False, we take only positives, not negatives.
            neg_size: size of sample of negatives (required when negatives is True)
            npz: whether to save an npz file with results
                 (one stacked array, written to <output_path>/<video_id>.npz)
            jpg: whether to save jpg files (<output_path>/<video_id>_<time>.jpg)
            dim: dimensions of resized images (optional); passed to cv2.resize
                 as the target size
       function returns:
            when negatives is True, the list of sampled times (ms) whose frame
            was actually read, so it stays aligned with the saved frames;
            otherwise None
       function raises:
            IOError if the video file cannot be opened
    """

    os.chdir(ROOT)
    # get times from tabular data
    if negatives:
        df_positive_intervals = get_positive_intervals(tabular_filename)
        times = get_neg_times(df_positive_intervals, video_id, neg_size)
    else:
        # Events are filtered here on the `video_id` argument, so the result
        # never depends on a module-level `video_id` variable.
        df = pd.read_csv(tabular_filename)
        is_event = ~df['event'].isin(['start', 'end'])
        of_video = df['video_id'] == video_id
        times = (df.loc[is_event & of_video, 'time'] * 1000).round(0).astype(int).values
    n_frames = len(times)

    # capture video
    videoname = ''.join([video_id, '.mp4'])
    video_file = os.path.join(videos_path, videoname)
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'could not open video: {video_file}')

    # initialize result lists
    res = []
    captured_times = []

    try:
        for i, time in enumerate(tqdm(times)):
            cap.set(cv2.CAP_PROP_POS_MSEC, time)  # move the time
            success, image = cap.read()
            if not success:
                print(f'{i+1} of {n_frames}, video_id: {video_id}, time: {time}, failed to read.')
                # image is None here: skip it instead of passing it to
                # resize / imwrite / np.array
                continue
            # resize
            if dim:
                image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
            if npz:
                res.append(image)
            if jpg:
                cv2.imwrite(os.path.join(output_path, f'{video_id}_{time}.jpg'), image)
            captured_times.append(time)
    finally:
        # release the capture even if a frame operation raises
        cap.release()
    if npz:
        np.savez_compressed(os.path.join(output_path, ''.join([video_id, '.npz'])), np.array(res))

    if negatives:
        return captured_times

## Get frames and merge sequence of frames in gray scale: function

In [11]:
def get_frames_events_gray(video_id, tabular_filename, videos_path, output_path, frames=3,
                      negatives=False, neg_size=None, npz=True, jpg=False, dim=None):
    """Get frames of events for a given video (positive or negative classes),
       get subsequent frames, convert to grayscale and merge them in the channel axis.
       save npz and/or jpg files,
       resize (optional).

       For every time, `frames` consecutive frames are read starting at that
       time; each is converted to grayscale and they are stacked along the
       last axis, giving one (height, width, frames) image per time.

       Side effect: the working directory is changed to ROOT.

       function takes:
            video_id
            tabular_filename: filename for tabular data
            videos_path: path to folder where the relevant video is located
            output_path: path to folder where npz and/or jpg files will be saved
            frames: number of subsequent frames to take. Default: 3.
            negatives: whether we are taking samples of negative class. 
                If True, we take only negatives, not positives. If False, we take only positives, not negatives.
            neg_size: size of sample of negatives (required when negatives is True)
            npz: whether to save an npz file with results
                 (one stacked array, written to <output_path>/<video_id>.npz)
            jpg: whether to save jpg files (<output_path>/<video_id>_<time>.jpg)
            dim: dimensions of resized images (optional); passed to cv2.resize
                 as the target size
       function returns:
            when negatives is True, the list of sampled times (ms) whose frame
            sequence was actually read, so it stays aligned with the saved
            images; otherwise None
       function raises:
            IOError if the video file cannot be opened
    """
    os.chdir(ROOT)
    # get times from tabular data
    if negatives:
        df_positive_intervals = get_positive_intervals(tabular_filename)
        times = get_neg_times(df_positive_intervals, video_id, neg_size)
    else:
        # Events are filtered here on the `video_id` argument, so the result
        # never depends on a module-level `video_id` variable.
        df = pd.read_csv(tabular_filename)
        is_event = ~df['event'].isin(['start', 'end'])
        of_video = df['video_id'] == video_id
        times = (df.loc[is_event & of_video, 'time'] * 1000).round(0).astype(int).values
    n_frames = len(times)

    # capture video
    videoname = ''.join([video_id, '.mp4'])
    video_file = os.path.join(videos_path, videoname)
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'could not open video: {video_file}')

    # initialize result lists
    res = []
    captured_times = []

    try:
        for i, time in enumerate(tqdm(times)):
            cap.set(cv2.CAP_PROP_POS_MSEC, time)  # move the time
            gray_images = []
            # the inner index has its own name so the outer `i` still
            # identifies the event in the failure message
            for _frame_idx in range(frames):
                success, image = cap.read()
                if not success:
                    print(f'{i+1} of {n_frames}, video_id: {video_id}, time: {time}, failed to read.')
                    break
                # VideoCapture.read() returns BGR images, so BGR2GRAY is the
                # conversion that applies the luma weights to the right channels
                gray_images.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
            if len(gray_images) < frames:
                # incomplete sequence: skip this time rather than stacking a
                # partial (or empty) set of frames
                continue
            # stack
            image = np.stack(gray_images, axis=2)
            # resize
            if dim:
                image = cv2.resize(image, dim, interpolation = cv2.INTER_AREA)
            if npz:
                res.append(image)
            if jpg:
                cv2.imwrite(os.path.join(output_path, f'{video_id}_{time}.jpg'), image)
            captured_times.append(time)
    finally:
        # release the capture even if a frame operation raises
        cap.release()
    if npz:
        np.savez_compressed(os.path.join(output_path, ''.join([video_id, '.npz'])), np.array(res))

    if negatives:
        return captured_times

## MS2 frames capture

In [11]:
# Convert every tolerance from seconds to a (fractional) number of frames.
tolerances_in_frames = {
    event_type: [FPS * tolerance_s for tolerance_s in tolerances_s]
    for event_type, tolerances_s in TOLERANCES.items()
}

In [12]:
# Display the tolerances converted to frames.
tolerances_in_frames

{'challenge': [7.5, 10.0, 12.5, 15.0, 17.5],
 'play': [3.75, 5.0, 6.25, 7.5, 8.75],
 'throwin': [3.75, 5.0, 6.25, 7.5, 8.75]}

In [13]:
# For each event type keep only the tolerance selected by TOLERANCE_INDEX,
# rounded down to a whole number of frames.
effective_tolerances_in_frames = {
    event_type: np.floor(frame_tolerances[TOLERANCE_INDEX]).astype(int)
    for event_type, frame_tolerances in tolerances_in_frames.items()
}
effective_tolerances_in_frames

{'challenge': 17, 'play': 8, 'throwin': 8}

In [14]:
# Load the tabular data and add the event time in integer milliseconds.
df = pd.read_csv(TABULAR_FILENAME)
df['event_time_ms'] = (df['time'] * 1000).round(0).astype(int)

# Keep only real events: 'start' and 'end' rows are interval markers.
is_interval_marker = df['event'].isin(['start', 'end'])
df_events = df.loc[~is_interval_marker].copy()
# event_id numbers the events of each video from 0, in file order.
df_events['event_id'] = df_events.groupby('video_id').cumcount()
# class_id is the integer label of the event name (KeyError on an unknown name).
df_events['class_id'] = [CLASS_IDS[event_name] for event_name in df_events['event']]

In [15]:
# Distinct video ids, in order of first appearance in the tabular data.
video_ids = df['video_id'].unique()

In [16]:
# Display the events table (pandas truncates the middle rows).
df_events

Unnamed: 0,video_id,time,event,event_attributes,event_time_ms,event_id,class_id
1,1606b0e6_0,201.150,challenge,['ball_action_forced'],201150,0,0
4,1606b0e6_0,210.870,challenge,['opponent_dispossessed'],210870,1,0
7,1606b0e6_0,219.230,throwin,['pass'],219230,2,1
10,1606b0e6_0,224.430,play,"['pass', 'openplay']",224430,3,2
13,1606b0e6_0,229.390,play,"['pass', 'openplay']",229390,4,2
...,...,...,...,...,...,...,...
11206,ecf251d4_0,3041.347,play,"['pass', 'openplay']",3041347,381,2
11209,ecf251d4_0,3050.347,play,"['pass', 'openplay']",3050347,382,2
11210,ecf251d4_0,3053.067,play,"['pass', 'openplay']",3053067,383,2
11213,ecf251d4_0,3056.587,challenge,['opponent_dispossessed'],3056587,384,0


Criterion: filter 3 tolerances, one before and two ahead.

In [17]:
# Display the ids of the videos to process.
video_ids

array(['1606b0e6_0', '1606b0e6_1', '35bd9041_0', '35bd9041_1',
       '3c993bd2_0', '3c993bd2_1', '407c5a9e_1', '4ffd5986_0',
       '9a97dae4_1', 'cfbe2e94_0', 'cfbe2e94_1', 'ecf251d4_0'],
      dtype=object)

In [19]:
def get_frames(video_id, df_events, videos_path, output_path, offset_start=17, n_frames=35):
    """Save a window of consecutive frames around every event of one video.

    For each event the capture is moved to the event time, rewound by
    `offset_start` frames, and `n_frames` consecutive frames are written as
    <output_path>/<video_id>_<event_id>_<class_id>_<j>.jpg, where j is the
    position of the frame inside the window (0-based).

    Args:
        video_id: id of the video; the file read is <videos_path>/<video_id>.mp4.
        df_events: DataFrame with the columns 'video_id', 'event_time_ms',
            'event_id' and 'class_id'.
        videos_path: path to folder where the video is located.
        output_path: existing folder where the jpg files are written.
        offset_start: number of frames before the event at which the window starts.
        n_frames: number of frames in the window.

    Raises:
        IOError: if the video file cannot be opened.

    Prints the elapsed time in seconds when done.
    """

    # time
    start = timer()

    # capture video
    videoname = ''.join([video_id, '.mp4'])
    video_file = os.path.join(videos_path, videoname)
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        cap.release()
        raise IOError(f'could not open video: {video_file}')

    # events of this video, from the tabular source
    df = df_events[df_events['video_id'] == video_id]
    events = zip(df['event_time_ms'], df['event_id'], df['class_id'])

    try:
        for time, event_id, class_id in tqdm(events, total=len(df)):
            cap.set(cv2.CAP_PROP_POS_MSEC, time)  # move the time
            event_pos = cap.get(cv2.CAP_PROP_POS_FRAMES)  # get position of event in frame count
            # calculate position of first frame to be captured, considering offset
            # NOTE(review): this is negative for an event within the first
            # `offset_start` frames of the video - confirm how the backend handles it
            start_pos = event_pos - offset_start
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_pos)  # move video capture to position of first frame to be captured (offset back)
            for j in range(n_frames):
                success, image = cap.read()
                if not success:
                    print(f'{j+1} of {n_frames}, video_id: {video_id}, time: {time}, failed to read.')
                    # image is None here; there is nothing to write
                    continue
                cv2.imwrite(os.path.join(output_path, f'{video_id}_{event_id}_{class_id}_{j}.jpg'), image)
    finally:
        # release the capture even if a frame operation raises
        cap.release()

    # time
    end = timer()

    # report the elapsed time, not the raw timer reading
    print(f'Done in {end - start} seconds.')

Create output folder.

In [None]:
# Video to process in this run and the folder that receives its frames.
video_id = 'cfbe2e94_1'
output_path = os.path.join('/content/drive/MyDrive/ITC_Bundesliga/frames_ms2', video_id)
# makedirs(..., exist_ok=True) creates missing parent folders and lets the
# cell be re-run when the folder already exists (os.mkdir raises in both cases).
os.makedirs(output_path, exist_ok=True)

In [None]:
# Extract the frame windows of every event of `video_id` into `output_path`,
# using the default offset_start and n_frames.
get_frames(video_id, df_events, videos_path=VIDEOS_PATH, output_path=output_path)

  2%|▏         | 5/285 [00:21<19:18,  4.14s/it]

In [None]:
# Change the notebook's working directory to the output folder of the current video.
%cd {output_path}

/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/407c5a9e_1


In [None]:
# Count the entries of the current working directory.
! ls | wc -l

12250


In [None]:
# Flush pending writes and unmount Drive, using the Colab `drive` helper imported at the top.
drive.flush_and_unmount()

In [24]:
# Change the working directory to the folder that holds one frame sub-folder per video.
%cd /content/drive/MyDrive/ITC_Bundesliga/frames_ms2

/content/drive/MyDrive/ITC_Bundesliga/frames_ms2


In [21]:
# List the entries of the current working directory.
! ls

1606b0e6_0  35bd9041_0	3c993bd2_0  407c5a9e_1	9a97dae4_1  cfbe2e94_1
1606b0e6_1  35bd9041_1	3c993bd2_1  4ffd5986_0	cfbe2e94_0  ecf251d4_0


In [25]:
# Report disk usage, in human-readable units, of the current working directory and its sub-folders.
! du -h

4.0G	./1606b0e6_1
5.0G	./35bd9041_0
5.9G	./35bd9041_1
6.6G	./3c993bd2_0
4.8G	./9a97dae4_1
5.8G	./1606b0e6_0
8.7G	./3c993bd2_1
5.0G	./407c5a9e_1
4.4G	./4ffd5986_0
5.4G	./cfbe2e94_0
5.0G	./cfbe2e94_1
7.2G	./ecf251d4_0
68G	.


In [23]:
# Count the entries in the frame folder of every video.
# Note: the loop rebinds the notebook-level `video_id`, and `%cd` changes the
# working directory on every iteration, so it ends in the last video's folder.
for video_id in video_ids:
    path = os.path.join('/content/drive/MyDrive/ITC_Bundesliga/frames_ms2', video_id)
    %cd {path}
    ! ls | wc -l

/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/1606b0e6_0
13860
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/1606b0e6_1
9594
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/35bd9041_0
10633
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/35bd9041_1
12565
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/3c993bd2_0
12247
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/3c993bd2_1
16036
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/407c5a9e_1
12250
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/4ffd5986_0
10570
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/9a97dae4_1
10185
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/cfbe2e94_0
10675
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/cfbe2e94_1
9975
/content/drive/MyDrive/ITC_Bundesliga/frames_ms2/ecf251d4_0
13510


In [None]:
# drive.flush_and_unmount()

In [18]:
# Count the entries in the yolov7 detection `labels` folder of every video,
# for comparison with the per-video frame counts.
# Note: the loop rebinds the notebook-level `video_id`, and `%cd` changes the
# working directory on every iteration, so it ends in the last video's folder.
for video_id in video_ids:
    path = os.path.join('/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect', video_id, 'labels')
    %cd {path}
    ! ls | wc -l

/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/1606b0e6_0/labels
13860
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/1606b0e6_1/labels
9594
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/35bd9041_0/labels
10633
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/35bd9041_1/labels
12534
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/3c993bd2_0/labels
12247
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/3c993bd2_1/labels
16036
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/407c5a9e_1/labels
12250
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/4ffd5986_0/labels
10570
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/9a97dae4_1/labels
10185
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/cfbe2e94_0/labels
10675
/content/drive/MyDrive/ITC_Bundesliga/yolov7/yolov7/runs/detect/cfbe2e94_1/labels
9975
/content/drive/MyDrive/ITC_Bundesl