## Setup

In [1]:
# Load the Colab helper for Google Drive and mount the drive at /content/drive,
# the prefix used by the path constants defined later in this notebook.
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


### Import libraries

In [2]:
import cv2
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [3]:
# Display the installed OpenCV version.
cv2.__version__

'4.6.0'

### Set constants

In [5]:
# Project folder on the mounted Drive; the data file train.csv is located here.
ROOT = os.path.join("/content/drive/My Drive", "ITC_Bundesliga")
# Folder with the video files, a sub-folder of ROOT.
VIDEOS_PATH = os.path.join(ROOT, "train")
# Name of the tabular data file inside ROOT.
TABULAR_FILENAME = 'train.csv'

## Process tabular data: functions

In [28]:
def get_event_times(TABULAR_FILENAME, video_id=None):
    """ get times in ms of events of one video from tabular data (train.csv file)

    function takes:
        TABULAR_FILENAME: path of the csv file; it must have the columns
            'video_id', 'time' and 'event'
        video_id: id of the video whose events are wanted.
            If omitted, a module-level variable named `video_id` is used
            (legacy behaviour, kept so existing one-argument calls keep working);
            pass it explicitly in new code.
    function returns a numpy array of integer times: 'time' * 1000, rounded,
        for every row of the video whose event is neither 'start' nor 'end'
    raises ValueError if no video id is available
    """

    if video_id is None:
        # Legacy fallback: this function used to read `video_id` from the
        # notebook's global scope instead of taking it as an argument.
        video_id = globals().get('video_id')
        if video_id is None:
            raise ValueError("video_id was not given and no module-level 'video_id' is defined")

    df = pd.read_csv(TABULAR_FILENAME)
    # 'start' / 'end' rows only delimit intervals; every other row is an event
    is_event = ~df['event'].isin(['start', 'end'])
    is_video = df['video_id'] == video_id
    # times of events in ms
    times = df.loc[is_event & is_video, 'time'].mul(1000).round(0).astype(int).values

    return times

In [24]:
def get_positive_intervals(TABULAR_FILENAME):
    """ build the table of event intervals from tabular data (train.csv file)

    Rows whose event is 'start' and rows whose event is 'end' are paired by
    position (k-th 'start' with k-th 'end'); times are 'time' * 1000, rounded
    and cast to int.
    Returns a DataFrame with columns 'video_id', 'start_time_ms', 'end_time_ms'.
    """

    table = pd.read_csv(TABULAR_FILENAME)
    time_ms = table['time'].mul(1000).round(0).astype(int)

    # boolean row selectors for interval boundaries
    is_start = table['event'] == 'start'
    is_end = table['event'] == 'end'

    return pd.DataFrame({
        'video_id': table.loc[is_start, 'video_id'].values,
        'start_time_ms': time_ms[is_start].values,
        'end_time_ms': time_ms[is_end].values,
    })

In [14]:
def get_neg_times(df_positive_intervals, video_id, size=100):
    """get random times of frames that do not fall in any event interval (negatives) for a given video
    function takes: 
        df_positive_intervals from function get_positive_intervals()
        video_id
        size: size of negative sample
    function returns a list of `size` distinct times, drawn strictly between
        the first interval start and the last interval end of the video and
        outside every [start_time_ms, end_time_ms] interval (bounds inclusive)
    raises ValueError if the video has no intervals, or if fewer than `size`
        free times exist (previously this case looped forever)
    Times are assumed to be integer milliseconds, as produced by
    get_positive_intervals().
    """

    intervals = df_positive_intervals[df_positive_intervals['video_id'] == video_id]
    if intervals.empty:
        raise ValueError(f'no positive intervals found for video_id {video_id!r}')

    starts = intervals['start_time_ms'].to_numpy()
    ends = intervals['end_time_ms'].to_numpy()
    t_min = starts.min()
    t_max = ends.max()
    low = t_min + 1  # candidates are low .. t_max - 1

    # blocked[k] is True when candidate time (low + k) lies inside an interval.
    # One bool per millisecond of the sampled span.
    blocked = np.zeros(max(int(t_max - low), 0), dtype=bool)
    for start, end in zip(starts, ends):
        blocked[max(int(start - low), 0):max(int(end - low) + 1, 0)] = True

    n_free = int(blocked.size - blocked.sum())
    if size > n_free:
        raise ValueError(
            f'requested {size} negative times but only {n_free} are available '
            f'for video_id {video_id!r}')

    neg_times = []
    seen = set()  # O(1) duplicate check; neg_times keeps the draw order

    # Rejection sampling, one np.random.randint draw per iteration, so results
    # are reproducible under np.random.seed().
    while len(neg_times) < size:
        rand_n = np.random.randint(low=low, high=t_max, dtype=int)
        if blocked[rand_n - low] or rand_n in seen:
            continue
        seen.add(rand_n)
        neg_times.append(rand_n)

    return neg_times

## Get frames: function

In [39]:
def get_frames_events(video_id, tabular_filename, videos_path, output_path, 
                      negatives=False, neg_size=None, npz=True, jpg=False, dim=None):
    """get frames of events for a given video (positive or negative classes),
       save npz and/or jpg files
       resize (optional)
       function takes:
            video_id
            tabular_filename: filename for tabular data; the function changes the
                working directory to ROOT first, so a relative name is resolved there
            videos_path: path to folder where the relevant video (<video_id>.mp4) is located
            output_path: path to folder where npz and/or jpg files will be saved
            negatives: whether we are taking samples of negative class. 
                If True, we take only negatives, not positives. If False, we take only positives, not negatives.
            neg_size: size of sample of negatives; None uses the default of get_neg_times()
            npz: whether to save an npz file with results (<video_id>.npz)
            jpg: whether to save jpg files (<video_id>_<time>.jpg, one per frame)
            dim: dimensions of resized images (optional), passed to cv2.resize
                as its size argument, i.e. (width, height)
       function returns:
            if negatives is True: list of the sampled times whose frame was
                actually read (times that failed to read are reported and skipped);
            otherwise nothing
       raises OSError if the video cannot be opened
    """

    # NOTE: this changes the process-wide working directory (side effect).
    os.chdir(ROOT)
    # get times from tabular data
    if negatives:
        df_positive_intervals = get_positive_intervals(tabular_filename)
        if neg_size is None:
            # passing None through would break the size comparison in get_neg_times()
            times = get_neg_times(df_positive_intervals, video_id)
        else:
            times = get_neg_times(df_positive_intervals, video_id, neg_size)
    else:
        # Select rows by this function's own `video_id` argument so the result
        # never depends on a notebook-level variable of the same name.
        # Event rows are all rows except 'start'/'end'; times are converted to integer ms.
        df = pd.read_csv(tabular_filename)
        is_event = (df['video_id'] == video_id) & ~df['event'].isin(['start', 'end'])
        times = df.loc[is_event, 'time'].mul(1000).round(0).astype(int).values
    n_frames = len(times)

    # capture video
    video_file = os.path.join(videos_path, ''.join([video_id, '.mp4']))
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        cap.release()
        raise OSError(f'could not open video: {video_file}')

    # initialize result lists
    res = []
    read_times = []  # times whose frame was read, aligned with `res`

    try:
        for i, time_ms in enumerate(tqdm(times)):
            cap.set(cv2.CAP_PROP_POS_MSEC, time_ms)  # move the time
            success, image = cap.read()
            if not success:
                print(f'{i+1} of {n_frames}, video_id: {video_id}, time: {time_ms}, failed to read.')
                # no image to process: resize / imwrite would fail on None
                continue
            # resize
            if dim:
                image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
            if npz:
                res.append(image)
            if jpg:
                cv2.imwrite(os.path.join(output_path, f'{video_id}_{time_ms}.jpg'), image)
            read_times.append(time_ms)
    finally:
        # release the capture even if processing a frame raised
        cap.release()

    if npz:
        np.savez_compressed(os.path.join(output_path, ''.join([video_id, '.npz'])), np.array(res))

    if negatives:
        return read_times

## Get frames and merge sequence of frames in gray scale: function

In [43]:
def get_frames_events_gray(video_id, tabular_filename, videos_path, output_path, frames=3,
                      negatives=False, neg_size=None, npz=True, jpg=False, dim=None):
    """get frames of events for a given video (positive or negative classes),
       get subsequent frames, convert to grayscale and merge them in the RGB channel. 
       save npz and/or jpg files
       resize (optional)
       function takes:
            video_id
            tabular_filename: filename for tabular data; the function changes the
                working directory to ROOT first, so a relative name is resolved there
            videos_path: path to folder where the relevant video (<video_id>.mp4) is located
            output_path: path to folder where npz and/or jpg files will be saved
            frames: number of subsequent frames to take. Default: 3.
                The grayscale frames are stacked along the last axis, so each
                sample has `frames` channels.
            negatives: whether we are taking samples of negative class. 
                If True, we take only negatives, not positives. If False, we take only positives, not negatives.
            neg_size: size of sample of negatives; None uses the default of get_neg_times()
            npz: whether to save an npz file with results (<video_id>.npz)
            jpg: whether to save jpg files (<video_id>_<time>.jpg, one per sample)
                NOTE(review): JPEG presumably needs frames == 3 (or 1) channels - confirm
            dim: dimensions of resized images (optional), passed to cv2.resize
                as its size argument, i.e. (width, height)
       function returns:
            if negatives is True: list of the sampled times for which all
                `frames` frames were read (other times are reported and skipped);
            otherwise nothing
       raises OSError if the video cannot be opened
    """
    # NOTE: this changes the process-wide working directory (side effect).
    os.chdir(ROOT)
    # get times from tabular data
    if negatives:
        df_positive_intervals = get_positive_intervals(tabular_filename)
        if neg_size is None:
            # passing None through would break the size comparison in get_neg_times()
            times = get_neg_times(df_positive_intervals, video_id)
        else:
            times = get_neg_times(df_positive_intervals, video_id, neg_size)
    else:
        # Select rows by this function's own `video_id` argument so the result
        # never depends on a notebook-level variable of the same name.
        # Event rows are all rows except 'start'/'end'; times are converted to integer ms.
        df = pd.read_csv(tabular_filename)
        is_event = (df['video_id'] == video_id) & ~df['event'].isin(['start', 'end'])
        times = df.loc[is_event, 'time'].mul(1000).round(0).astype(int).values
    n_frames = len(times)

    # capture video
    video_file = os.path.join(videos_path, ''.join([video_id, '.mp4']))
    cap = cv2.VideoCapture(video_file)
    if not cap.isOpened():
        cap.release()
        raise OSError(f'could not open video: {video_file}')

    # initialize result lists
    res = []
    read_times = []  # times whose frame sequence was read, aligned with `res`

    try:
        for i, time_ms in enumerate(tqdm(times)):
            cap.set(cv2.CAP_PROP_POS_MSEC, time_ms)  # move the time
            gray_images = []
            # separate loop variable: the outer index `i` is used in the message below
            for _ in range(frames):
                success, image = cap.read()
                if not success:
                    break
                # VideoCapture.read() yields BGR images, so convert with BGR2GRAY
                gray_images.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))
            if len(gray_images) < frames:
                print(f'{i+1} of {n_frames}, video_id: {video_id}, time: {time_ms}, failed to read.')
                # incomplete sequence (e.g. end of video): skip this sample
                continue
            # stack
            image = np.stack(gray_images, axis=2)
            # resize
            if dim:
                image = cv2.resize(image, dim, interpolation=cv2.INTER_AREA)
            if npz:
                res.append(image)
            if jpg:
                cv2.imwrite(os.path.join(output_path, f'{video_id}_{time_ms}.jpg'), image)
            read_times.append(time_ms)
    finally:
        # release the capture even if processing a frame raised
        cap.release()

    if npz:
        np.savez_compressed(os.path.join(output_path, ''.join([video_id, '.npz'])), np.array(res))

    if negatives:
        return read_times