In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the dataset paths
# used by this notebook live under /content/drive/MyDrive/.
# NOTE: google.colab is only importable inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Defining constant variables
# CSV file read by create_df_events() to build the events DataFrame.
DRIVE_PATH = '/content/drive/MyDrive/Bundesliga/train.csv'
# Folder with one '<video_id>.npz' archive per video; loaded by create_X().
COMPRESSED_FOLDER_PATH = '/content/drive/MyDrive/Bundesliga/PicsFromVids/CompressedPic/'
# Fraction of the unique videos assigned to the training split (the rest is test).
TRAIN_SIZE = 0.75
# When True, 'play' samples are randomly dropped from the training data until
# their count matches the 'challenge' count.
UNDERSAMPLING = True
# When True, the full image array X is released once the train/test arrays exist.
FREE_RAM = True

In [3]:
def create_df_events():
  """Load the CSV at DRIVE_PATH and keep only the rows that carry event attributes.

  Returns:
    A new DataFrame holding the rows whose 'event_attributes' value is not NaN,
    re-indexed from 0 without keeping the old index.
  """
  raw = pd.read_csv(DRIVE_PATH)
  has_attributes = raw['event_attributes'].notna()
  return raw.loc[has_attributes].reset_index(drop=True)

In [4]:
# df_pass = df[df['event_attributes'].str[:6] == "['pass"]
# print(f"Accuracy if classifying every class as {df_events['event_attributes'].describe()['top']}: "
#       f"{df_events['event_attributes'].describe()['freq'] / df_events['event_attributes'].describe()['count']:.2%}")
# print(f'Accuracy if classifying every class as a pass (in general) {df_pass.shape[0] / df_events.shape[0]:.2%}')

In [5]:
def create_train_test_video_ids(df_events):
  """Split the distinct video ids into a leading train part and a trailing test part.

  The ids keep their order of first appearance in df_events['video_id']. The
  first round(n_videos * TRAIN_SIZE) ids become the train videos, the remaining
  ids the test videos (no shuffling).

  Returns:
    (video_ids, train_videos, test_videos), each a plain list.
  """
  video_ids = df_events['video_id'].drop_duplicates().tolist()
  split_at = round(len(video_ids) * TRAIN_SIZE)
  return video_ids, video_ids[:split_at], video_ids[split_at:]

In [6]:
def create_X(video_ids):
  """Load the picture array of every video and stack them into one array.

  For each id the archive '<COMPRESSED_FOLDER_PATH><video_id>.npz' is opened and
  its 'arr_0' entry is read. The per-video arrays are concatenated along axis 0
  in the order given by video_ids.

  Args:
    video_ids: iterable of video ids; each must have a matching .npz archive.

  Returns:
    A single numpy array with the arrays of all videos joined along axis 0.

  Raises:
    FileNotFoundError: if an archive is missing.
    ValueError: if video_ids is empty (nothing to concatenate).
  """
  per_video = []
  for video_id in video_ids:
    # NpzFile keeps the underlying file open until closed; the context manager
    # releases the handle as soon as the array has been read into memory.
    with np.load(f'{COMPRESSED_FOLDER_PATH}{video_id}.npz') as archive:
      per_video.append(archive['arr_0'])
  return np.concatenate(per_video, axis=0)

In [7]:
def train_test_X_split(df_events, train_videos, test_videos, X=None):
  """Split the picture array into train and test parts by video membership.

  Row i of X is taken to belong to row i (by position) of df_events.
  NOTE(review): this relies on X having been built in the same row order as
  df_events — confirm against how the .npz archives were produced.

  Args:
    df_events: events DataFrame with a 'video_id' column.
    train_videos: video ids that make up the training split.
    test_videos: video ids that make up the test split.
    X: picture array to split. When omitted, the notebook-level variable `X`
      is used, which is what earlier versions of this function always did.

  Returns:
    (X_train, X_test, train_idx, test_idx) where the *_idx values are the
    df_events index labels of the rows in each split.

  Raises:
    NameError: if X is not passed and no notebook-level `X` exists.
    AssertionError: if the two splits together do not cover df_events exactly.
  """
  if X is None:
    # Backward compatibility with callers that rely on the global array.
    try:
      X = globals()['X']
    except KeyError:
      raise NameError("name 'X' is not defined") from None

  train_mask = df_events['video_id'].isin(train_videos).to_numpy()
  test_mask = df_events['video_id'].isin(test_videos).to_numpy()
  # Labels are returned for the label split; positions are used to index X so
  # the result stays correct even if df_events does not have a 0..n-1 index.
  train_idx = df_events.index[train_mask]
  test_idx = df_events.index[test_mask]
  X_train = X[np.flatnonzero(train_mask)]
  X_test = X[np.flatnonzero(test_mask)]
  assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]
  return X_train, X_test, train_idx, test_idx

In [8]:
def train_test_y_split(df_events, train_idx, test_idx):
  """Build the train and test label frames from the given df_events index labels.

  Returns:
    (y_train, y_test): two single-column DataFrames ('event') holding the rows
    of df_events whose index label is in train_idx / test_idx respectively.
  """
  def labels_for(idx):
    in_split = df_events.index.isin(idx)
    return df_events.loc[in_split, ['event']].copy()

  return labels_for(train_idx), labels_for(test_idx)

In [9]:
def create_array_to_drop_for_undersampling_in_play(y_train, seed=None):
  """Pick the 'play' rows to drop so that 'play' is as frequent as 'challenge'.

  Args:
    y_train: DataFrame with an 'event' column.
    seed: optional seed for a dedicated random generator, making the selection
      reproducible. When None, numpy's global random state is used, as before.

  Returns:
    Sorted numpy array of y_train index labels, of length
    max(#play - #challenge, 0), all belonging to 'play' rows.

  Raises:
    KeyError: if 'play' or 'challenge' does not occur in y_train['event'].
  """
  events = y_train['event']
  play_indexes = y_train[events == 'play'].index
  # Count on the column itself; its value_counts is indexed by the plain label.
  counts = events.value_counts()
  play_amount = int(counts['play'])
  challenge_amount = int(counts['challenge'])
  assert play_amount == len(play_indexes)

  # Never ask for a negative sample size when 'challenge' already dominates.
  drop_amount = max(play_amount - challenge_amount, 0)
  candidates = play_indexes.to_numpy()
  if seed is None:
    chosen = np.random.choice(a=candidates, size=drop_amount, replace=False)
  else:
    chosen = np.random.default_rng(seed).choice(candidates, size=drop_amount, replace=False)
  drop_play_indexes = np.sort(chosen)
  assert len(drop_play_indexes) == drop_amount
  return drop_play_indexes

In [10]:
def perform_undersampling(X_train, y_train, drop_play_indexes):
  """Remove the selected 'play' samples from both the pictures and the labels.

  drop_play_indexes holds *index labels* of y_train. They are translated to row
  positions before deleting from X_train, because np.delete works by position
  and the labels only coincide with positions for an unbroken 0..n-1 index.

  Args:
    X_train: array whose rows correspond, by position, to the rows of y_train.
    y_train: label DataFrame; it is not modified.
    drop_play_indexes: y_train index labels of the rows to remove.

  Returns:
    (X_train, y_train) without the dropped rows.

  Raises:
    KeyError: if a label in drop_play_indexes is not in y_train's index.
  """
  # Non in-place drop: validates the labels and leaves the caller's frame alone.
  y_kept = y_train.drop(drop_play_indexes, axis=0)
  drop_positions = y_train.index.get_indexer(drop_play_indexes)
  X_kept = np.delete(X_train, drop_positions, axis=0)
  return X_kept, y_kept

In [11]:
def perform_ohe(y_train, y_test):
  """One-hot encode the 'event' label column of the train and test frames.

  A single encoder is fitted on the training labels and reused for the test
  labels, so both results have the same columns in the same order. A test
  label that was not seen during fitting becomes an all-zero row
  (handle_unknown='ignore').

  Args:
    y_train: DataFrame with an 'event' column; not modified.
    y_test: DataFrame with an 'event' column; not modified.

  Returns:
    (y_train, y_test) as new DataFrames in which 'event' is replaced by one
    indicator column per training category, keeping the original index.
  """
  current_feat = ['event']
  try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
  encoder.fit(y_train[current_feat])
  encoded_columns = encoder.get_feature_names_out(current_feat)

  def _encode(labels):
    # Encode with the train-fitted encoder and re-attach the original index.
    encoded = pd.DataFrame(encoder.transform(labels[current_feat]),
                           columns=encoded_columns,
                           index=labels.index)
    return labels.drop(columns=current_feat).join(encoded)

  return _encode(y_train), _encode(y_test)

In [12]:
def del_X(X):
  """Release a large array by unbinding it from the notebook namespace.

  A plain `del X` inside a function only removes the function's local name;
  the notebook-level variable keeps the array alive and no memory is freed.
  This version removes every module-level name (not starting with '_') that
  is bound to the very same array object, then triggers garbage collection.
  References held elsewhere (other containers, the caller's own locals) still
  keep the array alive.

  Args:
    X: the numpy array to release. Non-array arguments are ignored so that
      shared objects such as None or True never cause unrelated names to be
      removed.
  """
  import gc

  if not isinstance(X, np.ndarray):
    return
  namespace = globals()
  bound_names = [name for name, value in namespace.items()
                 if value is X and not name.startswith('_')]
  for name in bound_names:
    del namespace[name]
  del X
  gc.collect()

In [16]:
# Build the event table, load all pictures and split both by video.
df_events = create_df_events()
video_ids, train_videos, test_videos = create_train_test_video_ids(df_events)
X = create_X(video_ids)
X_train, X_test, train_idx, test_idx = train_test_X_split(df_events, train_videos, test_videos)
y_train, y_test = train_test_y_split(df_events, train_idx, test_idx)
if UNDERSAMPLING:
  # Only compute the rows to drop when undersampling is actually requested.
  drop_play_indexes = create_array_to_drop_for_undersampling_in_play(y_train)
  X_train, y_train = perform_undersampling(X_train, y_train, drop_play_indexes)
y_train, y_test = perform_ohe(y_train, y_test)
if FREE_RAM:
  # Unbind the notebook-level name directly; passing X to a function and
  # deleting the parameter there would leave this variable (and the memory) alive.
  del X

In [17]:
# Sanity check: shapes of the picture arrays and of the one-hot encoded label frames.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1122, 480, 640, 3), (976, 480, 640, 3), (1122, 3), (976, 3))