Take the three cropped frames of each event, convert them to grayscale, and stack them as the three colour channels of a single image (one frame per channel).

## Setup: libraries and gdrive mount

In [None]:
import cv2
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [None]:
# Google Drive holds the input and output folders used in this notebook.
from google.colab import drive

# Mounting prompts for authorization; force_remount=True remounts the drive
# even if the mount point is already in use.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


Check whether the Google Drive sync of the cropped images has finished.

## Inputs/constants

In [None]:
# Project root on the mounted Google Drive.
PROJECT_FOLDER = '/content/drive/MyDrive/ITC_Bundesliga'

# Input: cropped frames, one sub-folder per video_id.
NEW_CROPS_FOLDER = f'{PROJECT_FOLDER}/new_crops'
# Output: stacked grayscale images, stored as <split>/<class>/<file>.
NEW_GRAY_FOLDER = f'{PROJECT_FOLDER}/gray'

In [None]:
# Numeric class id (as encoded in the crop filenames) -> class name.
# The class name is also used as the name of the output sub-folder.
CLASS_IDS = dict(enumerate(('challenge', 'throwin', 'play')))

In [None]:
# dataset split, done per video_id

# both '_0' and '_1' videos of these four recordings are used for training
train_videos = [
    f'{recording}_{part}'
    for recording in ('1606b0e6', 'cfbe2e94', '35bd9041', '3c993bd2')
    for part in (0, 1)
]
val_videos = ['9a97dae4_1', 'ecf251d4_0']
test_videos = ['4ffd5986_0', '407c5a9e_1']

In [None]:
# Lookup video_id -> split name ('train' / 'val' / 'test').
# Derived from train_videos / val_videos / test_videos instead of repeating
# every id by hand, so the lookup can never disagree with the split lists.
DATASET_SPLIT = {
    video: split
    for split, videos in (('train', train_videos),
                          ('val', val_videos),
                          ('test', test_videos))
    for video in videos
}

In [None]:
# All video_ids in alphabetical order; each one is also the name of a
# sub-folder inside the crops and frames folders.
video_ids = """
    1606b0e6_0 1606b0e6_1 35bd9041_0 35bd9041_1
    3c993bd2_0 3c993bd2_1 407c5a9e_1 4ffd5986_0
    9a97dae4_1 cfbe2e94_0 cfbe2e94_1 ecf251d4_0
""".split()

In [None]:
# check no. of visible files per subfolder in NEW_CROPS_FOLDER
for video in video_ids:
    print(video)
    ! ls {os.path.join(NEW_CROPS_FOLDER, video)} | wc -l

1606b0e6_0
1188
1606b0e6_1
1464
35bd9041_0
1233
35bd9041_1
1077
3c993bd2_0
1242
3c993bd2_1
1128
407c5a9e_1
1050
4ffd5986_0
906
9a97dae4_1
873
cfbe2e94_0
915
cfbe2e94_1
855
ecf251d4_0
1158


Yes, sync is ready.

Check the number of frames per video_id.

In [None]:
# Folder with the video frames, one sub-folder per video_id.
PROJECT_FOLDER = '/content/drive/MyDrive/ITC_Bundesliga'
FRAMES_FOLDER = f'{PROJECT_FOLDER}/frames_ms2'

In [None]:
for video in video_ids:
    print(video)
    ! ls {os.path.join(FRAMES_FOLDER, video)} | wc -l

1606b0e6_0
13860
1606b0e6_1
17745
35bd9041_0
14385
35bd9041_1
12565
3c993bd2_0
14490
3c993bd2_1
13160
407c5a9e_1
12250
4ffd5986_0
10570
9a97dae4_1
10185
cfbe2e94_0
10675
cfbe2e94_1
9975
ecf251d4_0
13510


## Create relevant subfolders

In [None]:
# Make the gray output folder the working directory; the relative shell
# commands in the following cells (e.g. `ls train`) depend on this.
%cd {NEW_GRAY_FOLDER}

/content/drive/MyDrive/ITC_Bundesliga/gray


In [None]:
# Create one output folder per dataset split.
# exist_ok=True keeps the cell safe to re-run when the folders already exist.
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(NEW_GRAY_FOLDER, split), exist_ok=True)

In [None]:
# List the split folders in the current working directory.
!ls

test  train  val


In [None]:
# Create the 'challenge' class folder inside every split folder.
# os.makedirs also creates the split folder itself if it is missing, and
# exist_ok=True keeps the cell safe to re-run.
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(NEW_GRAY_FOLDER, split, 'challenge'), exist_ok=True)

In [None]:
# Create the 'throwin' class folder inside every split folder.
# os.makedirs also creates the split folder itself if it is missing, and
# exist_ok=True keeps the cell safe to re-run.
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(NEW_GRAY_FOLDER, split, 'throwin'), exist_ok=True)

In [None]:
# Create the 'play' class folder inside every split folder.
# os.makedirs also creates the split folder itself if it is missing, and
# exist_ok=True keeps the cell safe to re-run.
for split in ('train', 'val', 'test'):
    os.makedirs(os.path.join(NEW_GRAY_FOLDER, split, 'play'), exist_ok=True)

In [None]:
# List the class folders inside the train split (path is relative to the
# current working directory).
! ls train

challenge  play  throwin


# Build dataframe from filenames in folder

From the folder with cropped images, we build a DataFrame to get the filenames for each of the three frames per event.

In [None]:
def _parse_crop_filename(filename, suffix='cropped.jpg'):
    """
    split a crop filename '<video_id>_<event>_<class>_<order>_<crop>cropped.jpg'
    (video_id itself consists of two '_'-separated parts) into its ids
    return (video_id, event_id, class_id, order_id, crop_id), with crop_id as
    int and the other ids as str, or None if the name does not fit the pattern
    """
    if not filename.endswith(suffix):
        return None
    parts = filename[:-len(suffix)].split('_')
    if len(parts) < 6:
        return None
    try:
        crop_id = int(parts[-1])
    except ValueError:
        return None
    return '_'.join(parts[:2]), parts[2], parts[3], parts[4], crop_id


def get_filenames(folder_cropped):
    """
    take a folder with cropped images in subfolders per video_id
    assume there are three sequential cropped frames per event_id
    return df with filenames of crops per event_id

    The returned DataFrame has one row per (video_id, event_id, class_id,
    order_id), sorted by these keys, with the columns:
        crop_id                 list of all crop ids found for the event
        min_id / middle_id / max_id
                                lowest, middle and highest crop id
        min_filename / middle_filename / max_filename
                                the files on disk belonging to those crop ids

    Entries of folder_cropped that are not directories and files whose name
    does not look like a crop are skipped. Events that do not have exactly
    three crops do not raise here: they still get a row (middle = element at
    position len // 2 of the sorted crops), so the caller can detect them by
    checking the length of 'crop_id'.
    """
    key_columns = ['video_id', 'event_id', 'class_id', 'order_id']
    columns = key_columns + ['crop_id', 'min_id', 'max_id', 'middle_id',
                             'min_filename', 'middle_filename', 'max_filename']

    # event key -> [(crop_id, filename), ...] in directory-listing order
    crops_per_event = {}
    for entry in os.listdir(folder_cropped):
        video_folder = os.path.join(folder_cropped, entry)
        if not os.path.isdir(video_folder):
            # stray file next to the video folders
            continue
        for filename in os.listdir(video_folder):
            parsed = _parse_crop_filename(filename)
            if parsed is None:
                continue
            *event_key, crop_id = parsed
            crops_per_event.setdefault(tuple(event_key), []).append((crop_id, filename))

    rows = []
    for event_key in sorted(crops_per_event):
        crops = crops_per_event[event_key]
        ordered = sorted(crops)
        # keep the real filenames instead of rebuilding them from the parsed
        # ids, so e.g. zero-padded crop ids still point to an existing file
        min_id, min_filename = ordered[0]
        middle_id, middle_filename = ordered[len(ordered) // 2]
        max_id, max_filename = ordered[-1]
        rows.append((*event_key,
                     [crop_id for crop_id, _ in crops],
                     min_id, max_id, middle_id,
                     min_filename, middle_filename, max_filename))

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Build the per-event table from the crops on Drive and print its summary.
df_grouped = get_filenames(NEW_CROPS_FOLDER)
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4363 entries, 0 to 4362
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   video_id         4363 non-null   object
 1   event_id         4363 non-null   object
 2   class_id         4363 non-null   object
 3   order_id         4363 non-null   object
 4   crop_id          4363 non-null   object
 5   min_id           4363 non-null   int64 
 6   max_id           4363 non-null   int64 
 7   middle_id        4363 non-null   int64 
 8   min_filename     4363 non-null   object
 9   middle_filename  4363 non-null   object
 10  max_filename     4363 non-null   object
dtypes: int64(3), object(8)
memory usage: 375.1+ KB


In [None]:
# Show the per-event table (pandas truncates the display to the first and
# last rows).
df_grouped

Unnamed: 0,video_id,event_id,class_id,order_id,crop_id,min_id,max_id,middle_id,min_filename,middle_filename,max_filename
0,1606b0e6_0,0,0,17,"[17, 18, 19]",17,19,18,1606b0e6_0_0_0_17_17cropped.jpg,1606b0e6_0_0_0_17_18cropped.jpg,1606b0e6_0_0_0_17_19cropped.jpg
1,1606b0e6_0,1,0,18,"[16, 17, 15]",15,17,16,1606b0e6_0_1_0_18_15cropped.jpg,1606b0e6_0_1_0_18_16cropped.jpg,1606b0e6_0_1_0_18_17cropped.jpg
2,1606b0e6_0,10,2,25,"[22, 23, 24]",22,24,23,1606b0e6_0_10_2_25_22cropped.jpg,1606b0e6_0_10_2_25_23cropped.jpg,1606b0e6_0_10_2_25_24cropped.jpg
3,1606b0e6_0,100,2,30,"[27, 28, 29]",27,29,28,1606b0e6_0_100_2_30_27cropped.jpg,1606b0e6_0_100_2_30_28cropped.jpg,1606b0e6_0_100_2_30_29cropped.jpg
4,1606b0e6_0,101,2,17,"[17, 18, 19]",17,19,18,1606b0e6_0_101_2_17_17cropped.jpg,1606b0e6_0_101_2_17_18cropped.jpg,1606b0e6_0_101_2_17_19cropped.jpg
...,...,...,...,...,...,...,...,...,...,...,...
4358,ecf251d4_0,95,2,18,"[15, 16, 17]",15,17,16,ecf251d4_0_95_2_18_15cropped.jpg,ecf251d4_0_95_2_18_16cropped.jpg,ecf251d4_0_95_2_18_17cropped.jpg
4359,ecf251d4_0,96,2,18,"[15, 16, 17]",15,17,16,ecf251d4_0_96_2_18_15cropped.jpg,ecf251d4_0_96_2_18_16cropped.jpg,ecf251d4_0_96_2_18_17cropped.jpg
4360,ecf251d4_0,97,2,21,"[18, 19, 20]",18,20,19,ecf251d4_0_97_2_21_18cropped.jpg,ecf251d4_0_97_2_21_19cropped.jpg,ecf251d4_0_97_2_21_20cropped.jpg
4361,ecf251d4_0,98,2,22,"[19, 20, 21]",19,21,20,ecf251d4_0_98_2_22_19cropped.jpg,ecf251d4_0_98_2_22_20cropped.jpg,ecf251d4_0_98_2_22_21cropped.jpg


In [None]:
# check if there are indeed 3 frames per event; stop here if not, because the
# grayscale stacking below needs exactly three frames per event
events_wrong_count = df_grouped[df_grouped['crop_id'].map(len) != 3]
assert events_wrong_count.empty, (
    f"{len(events_wrong_count)} event(s) do not have exactly 3 crops:\n"
    f"{events_wrong_count[['video_id', 'event_id', 'crop_id']].to_string()}"
)
events_wrong_count

Unnamed: 0,video_id,event_id,class_id,order_id,crop_id,min_id,max_id,middle_id,min_filename,middle_filename,max_filename


In [None]:
# unique video_ids parsed from the crop filenames
df_grouped['video_id'].unique()

array(['1606b0e6_0', '1606b0e6_1', '35bd9041_0', '35bd9041_1',
       '3c993bd2_0', '3c993bd2_1', '407c5a9e_1', '4ffd5986_0',
       '9a97dae4_1', 'cfbe2e94_0', 'cfbe2e94_1', 'ecf251d4_0'],
      dtype=object)

## Create grayscaled images

In [None]:
# Iterate over the events: convert the three crops of each event to grayscale,
# stack them as the three channels of one image and save that image to
# NEW_GRAY_FOLDER/<split>/<class>/.

def load_gray(path):
    """Read the image at `path` and return it as a single-channel grayscale array."""
    img = cv2.imread(path)
    if img is None:
        # cv2.imread reports a missing or unreadable file by returning None
        # instead of raising, which would otherwise surface as a cryptic
        # error inside cv2.cvtColor
        raise OSError(f'could not read image: {path}')
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


for event in tqdm(df_grouped.itertuples(index=False), total=len(df_grouped)):
    class_ = CLASS_IDS[int(event.class_id)]
    set_ = DATASET_SPLIT[event.video_id]

    # the three frames of the event in temporal order: min, middle, max crop id
    crop_paths = [
        os.path.join(NEW_CROPS_FOLDER, event.video_id, filename)
        for filename in (event.min_filename, event.middle_filename, event.max_filename)
    ]

    # channel 0 / 1 / 2 of the stacked image = min / middle / max frame
    gray_images_stacked = np.stack([load_gray(path) for path in crop_paths], axis=2)

    export_filename = '_'.join([event.video_id, event.event_id,
                                event.class_id, event.order_id]) + 'gray.jpg'
    export_path = os.path.join(NEW_GRAY_FOLDER, set_, class_, export_filename)

    # cv2.imwrite returns False instead of raising when the image could not
    # be written (e.g. because the target folder does not exist)
    if not cv2.imwrite(export_path, gray_images_stacked):
        raise OSError(f'could not write image: {export_path} '
                      '(does the target folder exist?)')

100%|██████████| 488/488 [08:53<00:00,  1.09s/it]


In [None]:
# sync Colab and GDrive: flush_and_unmount() pushes all pending writes to
# Drive, but it also unmounts the drive. Mount it again right away so the
# cells below can still read NEW_GRAY_FOLDER.
drive.flush_and_unmount()
drive.mount("/content/drive", force_remount=True)

In [None]:
# Count the files in every <split>/<class> folder of NEW_GRAY_FOLDER after
# grayscaling; each entry is (split, class, number of files).
filecount = []

for set_ in os.listdir(NEW_GRAY_FOLDER):
    set_dir = os.path.join(NEW_GRAY_FOLDER, set_)
    for class_ in os.listdir(set_dir):
        n_files = len(os.listdir(os.path.join(set_dir, class_)))
        filecount.append((set_, class_, n_files))

In [None]:
# (split, class, number of files) for every output folder
filecount

[('train', 'challenge', 414),
 ('train', 'throwin', 129),
 ('train', 'play', 2490),
 ('val', 'challenge', 94),
 ('val', 'throwin', 22),
 ('val', 'play', 561),
 ('test', 'challenge', 113),
 ('test', 'throwin', 21),
 ('test', 'play', 518)]

In [None]:
# total number of grayscale images over all splits and classes
sum(n_files for _, _, n_files in filecount)

4362