In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
# Colab-only step: mount Google Drive so that the dataset paths configured
# below (all located under /content/drive) can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Defining constant variables
# All dataset artefacts live below one Drive folder, so the shared prefixes
# are spelled out once and the individual paths are derived from them.
_BUNDESLIGA_DIR = '/content/drive/MyDrive/Bundesliga/'
_COMPRESSED_PICS_DIR = f'{_BUNDESLIGA_DIR}PicsFromVids/CompressedPic/'

DRIVE_PATH = f'{_BUNDESLIGA_DIR}train.csv'                   # CSV read by create_df_events
POSITIVES_FOLDER_PATH = _COMPRESSED_PICS_DIR                 # holds one <video_id>.npz per video
NEGATIVES_FOLDER_PATH = f'{_COMPRESSED_PICS_DIR}Negatives/'  # same layout, negative frames

TRAIN_SIZE = 0.75      # share of videos (and of negative frames) assigned to the train split
NEGATIVES = True       # load the negative frames and add them as class 'negative'
UNDERSAMPLING = False  # drop 'play' samples until they match the 'challenge' count
FREE_RAM = True        # call del_X once the train/test arrays have been built

In [4]:
def create_df_events():
  """Load the CSV at DRIVE_PATH and keep only the rows describing an event.

  A row counts as an event when its 'event_attributes' value is not NaN.

  Returns:
    A new DataFrame with the kept rows and a fresh 0..n-1 index.
  """
  raw_rows = pd.read_csv(DRIVE_PATH)
  has_attributes = raw_rows['event_attributes'].notna()
  # Boolean-mask selection already yields a new frame; renumber it from 0.
  return raw_rows[has_attributes].reset_index(drop=True)

In [5]:
# df_pass = df[df['event_attributes'].str[:6] == "['pass"]
# print(f"Accuracy if classifying every class as {df_events['event_attributes'].describe()['top']}: "
#       f"{df_events['event_attributes'].describe()['freq'] / df_events['event_attributes'].describe()['count']:.2%}")
# print(f'Accuracy if classifying every class as a pass (in general) {df_pass.shape[0] / df_events.shape[0]:.2%}')

In [6]:
def create_train_test_video_ids(df_events):
  """Split the video ids found in df_events into a train and a test group.

  The ids keep their order of first appearance in the 'video_id' column; the
  first round(n * TRAIN_SIZE) ids form the train group, the rest the test group.

  Returns:
    (video_ids, train_videos, test_videos) as three lists.
  """
  all_ids = df_events['video_id'].unique().tolist()
  split_at = round(len(all_ids) * TRAIN_SIZE)
  return all_ids, all_ids[:split_at], all_ids[split_at:]

In [7]:
def create_X(video_ids):
  """Load the frames of every video and stack them into one array.

  For each id the file '<POSITIVES_FOLDER_PATH><video_id>.npz' is read and its
  'arr_0' entry collected. When NEGATIVES is set, the matching files below
  NEGATIVES_FOLDER_PATH are appended after all positives.

  Args:
    video_ids: sequence of video ids (used as file names).

  Returns:
    One array: all positive frames (in video order) followed by the negative
    frames, concatenated along axis 0.
  """
  frame_batches = []
  for video_id in video_ids:
    # The context manager closes the underlying .npz file handle again.
    with np.load(f'{POSITIVES_FOLDER_PATH}{video_id}.npz') as archive:
      frame_batches.append(archive['arr_0'])
  if NEGATIVES:
    # NOTE(review): negatives are only read for all but the last 10 videos --
    # presumably only those files exist so far; confirm and name the constant.
    for video_id in video_ids[:-10]:
      with np.load(f'{NEGATIVES_FOLDER_PATH}{video_id}.npz') as archive:
        # Append the negative frames themselves (not the last positive video again).
        frame_batches.append(archive['arr_0'])
  return np.concatenate(frame_batches, axis=0)

In [8]:
def train_test_X_split(df_events, train_videos, test_videos, X=None):
  """Split the positive frames into train and test parts by video id.

  Args:
    df_events: events frame with a 'video_id' column. Its index labels are
      used as row positions into X, so it is assumed to carry a 0..n-1 index
      aligned with the first n rows of X.
    train_videos: video ids belonging to the train split.
    test_videos: video ids belonging to the test split.
    X: array of frames to split. When omitted, the notebook-level variable
      `X` is used, which keeps existing three-argument calls working.

  Returns:
    (X_train, X_test, train_idx, test_idx); the idx values are the df_events
    index labels of the selected rows.

  Raises:
    AssertionError: if the two splits together do not cover every event row.
  """
  if X is None:
    # Backward compatibility: older call sites rely on the global array.
    X = globals()['X']
  train_idx = df_events[df_events['video_id'].isin(train_videos)].index
  test_idx = df_events[df_events['video_id'].isin(test_videos)].index
  X_train = X[train_idx]
  X_test = X[test_idx]
  assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]
  return X_train, X_test, train_idx, test_idx

In [9]:
def train_test_y_split(df_events, train_idx, test_idx):
  """Build the label frames for the train and the test split.

  Args:
    df_events: events frame holding the 'event' label column.
    train_idx: index labels of the rows that belong to the train split.
    test_idx: index labels of the rows that belong to the test split.

  Returns:
    (y_train, y_test): single-column ('event') DataFrames.
  """
  in_train = df_events.index.isin(train_idx)
  in_test = df_events.index.isin(test_idx)
  train_labels = df_events[in_train]['event'].to_frame()
  test_labels = df_events[in_test]['event'].to_frame()
  return train_labels, test_labels

In [10]:
def append_negatives(df_events, X, X_train, X_test, y_train, y_test):
  """Add the negative frames and their 'negative' labels to both splits.

  Every row of X beyond the first len(df_events) rows is treated as a negative
  frame. The first TRAIN_SIZE share of those rows goes to the train split, the
  remainder to the test split.

  Args:
    df_events: events frame; only its row count (number of positives) is used.
    X: array holding all positives followed by all negatives.
    X_train, X_test: positive frames of each split.
    y_train, y_test: label frames (column 'event') of each split.

  Returns:
    (X_train, X_test, y_train, y_test) with the negatives appended at the end.
    The appended label rows keep their own 0..k-1 index, so the returned label
    frames can contain duplicate index labels.

  Raises:
    AssertionError: if the two negative slices do not cover all negatives.
  """
  neg_start = df_events.shape[0]
  neg_count = X.shape[0] - neg_start
  neg_test_start = round(neg_start + neg_count * TRAIN_SIZE)

  X_train_neg = X[neg_start:neg_test_start]
  X_test_neg = X[neg_test_start:]
  assert neg_count == X_train_neg.shape[0] + X_test_neg.shape[0]

  X_train = np.concatenate((X_train, X_train_neg), axis=0)
  X_test = np.concatenate((X_test, X_test_neg), axis=0)

  # DataFrame.append no longer exists in pandas 2.x; pd.concat is the
  # equivalent call and keeps the same index behaviour.
  y_train = pd.concat(
      [y_train, pd.DataFrame({'event': ['negative'] * X_train_neg.shape[0]})])
  y_test = pd.concat(
      [y_test, pd.DataFrame({'event': ['negative'] * X_test_neg.shape[0]})])
  return X_train, X_test, y_train, y_test

In [11]:
def create_array_to_drop_for_undersampling_in_play(y_train):
  """Pick the 'play' rows to drop so that 'play' matches 'challenge' in size.

  Args:
    y_train: label frame with an 'event' column.

  Returns:
    Sorted array of y_train index labels of randomly chosen 'play' rows; its
    length is (#play - #challenge), or 0 when 'play' is not the larger class.
    The draw uses NumPy's global random state, so seed it for reproducibility.
  """
  events = y_train['event']
  # Count straight from the column instead of indexing DataFrame.value_counts():
  # that also works when a class is absent.
  play_indexes = y_train.index[(events == 'play').to_numpy()]
  play_amount = len(play_indexes)
  challenge_amount = int((events == 'challenge').sum())
  drop_amount = max(play_amount - challenge_amount, 0)
  if drop_amount == 0:
    # Nothing to balance; a negative size would make np.random.choice raise.
    return np.array([], dtype=play_indexes.dtype)
  drop_play_indexes = np.sort(np.random.choice(a=play_indexes.to_numpy(),
                                              size=drop_amount,
                                              replace=False))
  assert len(drop_play_indexes) == drop_amount
  return drop_play_indexes

In [12]:
def perform_undersampling(X_train, y_train, drop_play_indexes):
  """Remove the selected 'play' samples from both X_train and y_train.

  Args:
    X_train: frames, row i belonging to the i-th row of y_train.
    y_train: label frame; its index may contain duplicate labels (for example
      after negatives were appended).
    drop_play_indexes: y_train index labels of the 'play' rows to remove.

  Returns:
    (X_train, y_train) without the selected rows. Both are new objects; the
    frame passed in is left untouched.
  """
  # Translate the labels into one positional mask that is applied to BOTH
  # containers. Dropping by label in y and by position in X can diverge.
  drop_mask = y_train.index.isin(drop_play_indexes)
  if 'event' in y_train.columns:
    # Duplicate labels can also match non-'play' rows; never drop those.
    drop_mask &= (y_train['event'] == 'play').to_numpy()
  X_train = np.delete(X_train, np.flatnonzero(drop_mask), axis=0)
  y_train = y_train[~drop_mask]
  return X_train, y_train

In [13]:
def perform_ohe(y_train, y_test):
  """One-hot encode the 'event' column of both label frames.

  The encoder is fitted on the training labels only and then applied to both
  frames, so train and test always get the same columns in the same order.
  Test labels unseen during fitting become all-zero rows
  (handle_unknown='ignore').

  Args:
    y_train: training labels with an 'event' column.
    y_test: test labels with an 'event' column.

  Returns:
    (y_train, y_test) as new frames in which 'event' is replaced by one
    'event_<class>' column per training class. The inputs are not modified.
  """
  current_feat = ['event']
  try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
  except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
  encoder.fit(y_train[current_feat])
  ohe_columns = encoder.get_feature_names_out(current_feat)

  train_encoded = encoder.transform(y_train[current_feat])
  test_encoded = encoder.transform(y_test[current_feat])

  # drop() returns new frames, so the columns are added to copies only.
  y_train = y_train.drop(columns=current_feat)
  y_train[ohe_columns] = train_encoded
  y_test = y_test.drop(columns=current_feat)
  y_test[ohe_columns] = test_encoded
  return y_train, y_test

In [14]:
def del_X(X):
  """Release the big frame array so its memory can be reclaimed.

  Deleting only the parameter would leave the notebook-level variable alive.
  Therefore every global name of this module/notebook that is bound to the
  very same array object is removed, followed by a garbage-collection pass.

  Args:
    X: the array to release. Global names are only removed for NumPy arrays;
      for any other object just the local reference is dropped.
  """
  import gc
  if isinstance(X, np.ndarray):
    namespace = globals()
    # Collect the names first: the dict must not change while iterating it.
    for name in [key for key, value in namespace.items() if value is X]:
      del namespace[name]
  del X
  gc.collect()

In [15]:
# Build the train/test arrays step by step.
df_events = create_df_events()
video_ids, train_videos, test_videos = create_train_test_video_ids(df_events)
X = create_X(video_ids)
X_train, X_test, train_idx, test_idx = train_test_X_split(df_events, train_videos, test_videos)
y_train, y_test = train_test_y_split(df_events, train_idx, test_idx)
# Only draw the rows to drop when undersampling is switched on: the draw
# consumes random state and needs both a 'play' and a 'challenge' class.
drop_play_indexes = (create_array_to_drop_for_undersampling_in_play(y_train)
                     if UNDERSAMPLING else None)
if NEGATIVES:
  X_train, X_test, y_train, y_test = append_negatives(df_events, X, X_train, X_test, y_train, y_test)
if UNDERSAMPLING:
  X_train, y_train = perform_undersampling(X_train, y_train, drop_play_indexes)
y_train, y_test = perform_ohe(y_train, y_test)
# Frames and labels must stay aligned row by row; fail loudly if they are not.
assert X_train.shape[0] == y_train.shape[0], 'X_train and y_train have different lengths'
assert X_test.shape[0] == y_test.shape[0], 'X_test and y_test have different lengths'
if FREE_RAM:
  del_X(X)

In [16]:
# Sanity check: each split must have as many frames (X) as label rows (y).
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3985, 480, 640, 3), (1169, 480, 640, 3), (3985, 4), (1169, 4))

In [17]:
# Class balance of the one-hot encoded training labels (one row per class).
y_train.value_counts()

event_challenge  event_negative  event_play  event_throwin
0.0              0.0             1.0         0.0              2789
                 1.0             0.0         0.0               579
1.0              0.0             0.0         0.0               505
0.0              0.0             0.0         1.0               112
dtype: int64

In [4]:
# df = pd.read_csv(DRIVE_PATH)
# df_events = df[~df['event_attributes'].isna()].copy()
# df_events.reset_index(drop=True, inplace=True)

In [5]:
# video_ids = df_events['video_id'].unique().tolist()
# amount_videos = len(video_ids)
# amount_train = round(amount_videos * TRAIN_SIZE)
# train_videos = video_ids[:amount_train]
# test_videos = video_ids[amount_train:]

In [6]:
# video_ids

['1606b0e6_0',
 '1606b0e6_1',
 '35bd9041_0',
 '35bd9041_1',
 '3c993bd2_0',
 '3c993bd2_1',
 '407c5a9e_1',
 '4ffd5986_0',
 '9a97dae4_1',
 'cfbe2e94_0',
 'cfbe2e94_1',
 'ecf251d4_0']

In [7]:
# X = None
# for video_id in video_ids:
#   if X is None:
#     current_video = np.load(f'{POSITIVES_FOLDER_PATH}{video_id}.npz')
#     X = current_video.f.arr_0
#   else:
#     current_video = np.load(f'{POSITIVES_FOLDER_PATH}{video_id}.npz')
#     current_video = current_video.f.arr_0
#     X = np.concatenate([X, current_video], axis=0)

# len_positives = 0
# for i in X:
#   len_positives += i.shape[0]


# if NEGATIVES:
#   for video_id in video_ids[:-9]:
#     current_neg_video = np.load(f'{NEGATIVES_FOLDER_PATH}{video_id}.npz')
#     current_neg_video = current_neg_video.f.arr_0
#     X = np.concatenate([X, current_video], axis=0)

In [28]:
# X = list()
# for video_id in video_ids:
#   current_video = np.load(f'{POSITIVES_FOLDER_PATH}{video_id}.npz')
#   current_video = current_video.f.arr_0
#   X.append(current_video)
# if NEGATIVES:
#   for video_id in video_ids[:-10]:
#     current_neg_video = np.load(f'{NEGATIVES_FOLDER_PATH}{video_id}.npz')
#     current_neg_video = current_neg_video.f.arr_0
#     X.append(current_video)
# X = np.concatenate(X, axis=0)

In [None]:
# train_idx = df_events[df_events['video_id'].isin(train_videos)].index
# test_idx = df_events[df_events['video_id'].isin(test_videos)].index
# X_train = X[train_idx]
# X_test = X[test_idx]
# assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]

# y_train = df_events[df_events.index.isin(train_idx) ]['event'].to_frame()
# y_test = df_events[df_events.index.isin(test_idx) ]['event'].to_frame()



# len_positives = df_events.shape[0]
# len_X = X.shape[0]
# len_negatives = len_X - len_positives

# START_IDX_NEGATIVES = df_events.shape[0]
# START_IDX_NEG_TEST = round(START_IDX_NEGATIVES + len_negatives * TRAIN_SIZE)
# X_train_neg = X[START_IDX_NEGATIVES:START_IDX_NEG_TEST]
# X_test_neg = X[START_IDX_NEG_TEST:]
# assert len_negatives == X_train_neg.shape[0] + X_test_neg.shape[0]

# X_train = np.concatenate((X_train, X_train_neg), axis=0)
# X_test = np.concatenate((X_test, X_test_neg), axis=0)

# y_train = y_train.append(pd.DataFrame({'event':['negative'] * X_train_neg.shape[0]}))
# y_test = y_test.append(pd.DataFrame({'event':['negative'] * X_test_neg.shape[0]}))

In [29]:
# len_positives = df_events.shape[0]
# len_positives

4382

In [30]:
# len_X = X.shape[0]
# len_negatives = len_X - len_positives
# len_negatives

772

In [31]:
# df_events['event'].value_counts()

play         3586
challenge     624
throwin       172
Name: event, dtype: int64

In [32]:
# train_idx = df_events[df_events['video_id'].isin(train_videos)].index
# test_idx = df_events[df_events['video_id'].isin(test_videos)].index
# X_train = X[train_idx]
# X_test = X[test_idx]
# assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]

In [33]:
# y_train = df_events[df_events.index.isin(train_idx) ]['event'].to_frame()
# y_test = df_events[df_events.index.isin(test_idx) ]['event'].to_frame()

In [34]:
# X.shape

(5154, 480, 640, 3)

In [35]:
# X_train.shape, X_test.shape

((3406, 480, 640, 3), (976, 480, 640, 3))

In [36]:
# START_IDX_NEGATIVES = df_events.shape[0]
# START_IDX_NEGATIVES

4382

In [37]:
# START_IDX_NEG_TEST = round(START_IDX_NEGATIVES + len_negatives * TRAIN_SIZE)
# START_IDX_NEG_TEST

4961

In [38]:
# X_train_neg = X[START_IDX_NEGATIVES:START_IDX_NEG_TEST]
# X_test_neg = X[START_IDX_NEG_TEST:]

In [58]:
# X_train_neg.shape[0], X_test_neg.shape[0]

(579, 193)

In [39]:
# assert len_negatives == X_train_neg.shape[0] + X_test_neg.shape[0]

In [40]:
# del X

In [41]:
# X_train = np.concatenate((X_train, X_train_neg), axis=0)
# X_train.shape

(3985, 480, 640, 3)

In [42]:
# X_test = np.concatenate((X_test, X_test_neg), axis=0)
# X_test.shape

(1169, 480, 640, 3)

In [48]:
# 772*0.75

579.0

In [47]:
# y_train

Unnamed: 0,event
0,challenge
1,challenge
2,throwin
3,play
4,play
...,...
3401,challenge
3402,play
3403,challenge
3404,challenge


In [59]:
# y_train = y_train.append(pd.DataFrame({'event':['negative'] * X_train_neg.shape[0]}))
# y_test = y_test.append(pd.DataFrame({'event':['negative'] * X_test_neg.shape[0]}))

In [None]:
input_shape = X_train.shape[1], X_train.shape[2], X_train.shape[3]
# One output unit per one-hot label column. A fixed 3 does not match the
# labels once the 'negative' class is added (4 columns).
num_classes = y_train.shape[1]

# Convolutional stack followed by two dense layers and a softmax classifier.
model = Sequential([
    # Scale the pixel values by 1/255 inside the model.
    Rescaling(1/255, input_shape=input_shape),
    Conv2D(96, 11, strides=4, activation='relu'),
    MaxPooling2D(pool_size=3, strides=2),
    Conv2D(256, 5, activation='relu'),
    MaxPooling2D(pool_size=3, strides=2),
    Conv2D(384, 3, activation='relu'),
    Conv2D(384, 3, activation='relu'),
    Conv2D(256, 3, activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(256, activation='relu'),
    Dense(256, activation='relu'),
    Dense(num_classes, activation='softmax')
])

In [None]:
# Multi-class setup: softmax outputs against one-hot labels.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              # Keras documents `metrics` as a list; newer releases reject a bare string.
              metrics=['accuracy'],
              )

In [None]:
# Stop training once the validation loss has not improved for 3 consecutive
# epochs and roll the model back to the weights of its best epoch.
callback = EarlyStopping(monitor='val_loss',
                         patience=3,
                         restore_best_weights=True
                         )

In [None]:
EPOCHS = 100
SHUFFLE_SEED = 42

# `validation_split` takes the LAST 20 % of the arrays, before any shuffling.
# The negatives are appended at the end of X_train, so without this
# permutation the validation part would be mostly negatives and the training
# part would contain none. The fixed seed keeps the split reproducible.
# Note: indexing with the permutation creates a temporary copy of X_train.
shuffled_order = np.random.default_rng(SHUFFLE_SEED).permutation(X_train.shape[0])
model.fit(X_train[shuffled_order],
          np.asarray(y_train)[shuffled_order],
          validation_split=0.2,
          batch_size=16,
          epochs=EPOCHS,
          callbacks=[callback])

In [None]:
# The model is compiled with one loss and one metric, so evaluate() returns
# [loss, accuracy]; position 1 is the accuracy.
INDEX_ACCURACY = 1
test_scores = model.evaluate(X_test, y_test)
print(f'\nAccuracy on the test set: {test_scores[INDEX_ACCURACY]:.4f}')

In [None]:
y_pred = model.predict(X_test)
# Print every prediction row without truncation, but only inside this block,
# so NumPy's global print options stay unchanged for the rest of the notebook.
with np.printoptions(threshold=sys.maxsize):
  print(y_pred)