Whole pipeline working with balanced data and all videos:

In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Defining constant variables
DRIVE_PATH = '/content/drive/MyDrive/Bundesliga/train.csv'
COMPRESSED_FOLDER_PATH = '/content/drive/MyDrive/Bundesliga/PicsFromVids/CompressedPic/'
TRAIN_SIZE = 0.75

# Reading DataFrame and creating one with only relevant events
df = pd.read_csv(DRIVE_PATH)
df_events = df[~df['event_attributes'].isna()].copy()

df_pass = df[df['event_attributes'].str[:6] == "['pass"]

print(f"Accuracy if classifying every class as {df_events['event_attributes'].describe()['top']}: "
      f"{df_events['event_attributes'].describe()['freq'] / df_events['event_attributes'].describe()['count']:.2%}")
print(f'Accuracy if classifying every class as a pass (in general) {df_pass.shape[0] / df_events.shape[0]:.2%}')

# Creating two arrays with which videos are either part of the train or the test data
df_events.reset_index(drop=True, inplace=True)
video_ids = df_events['video_id'].unique().tolist()
amount_videos = len(video_ids)
amount_train = round(amount_videos * TRAIN_SIZE)
train_videos = video_ids[:amount_train]
test_videos = video_ids[amount_train:]

# Creating X array with all pictures for both train and test data
X = list()
for video_id in video_ids:
  current_video = np.load(f'{COMPRESSED_FOLDER_PATH}{video_id}.npz')
  current_video = current_video.f.arr_0
  X.append(current_video)
X = np.concatenate(X, axis=0)

# Splitting pictures in train and test data
train_idx = df_events[df_events['video_id'].isin(train_videos)].index
test_idx = df_events[df_events['video_id'].isin(test_videos)].index
X_train = X[train_idx]
X_test = X[test_idx]
assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]

# Creating DataFrame of labels for both train and test data using videos ids
y_train = df_events[df_events.index.isin(train_idx) ]['event'].to_frame()
y_test = df_events[df_events.index.isin(test_idx) ]['event'].to_frame()

# Performing undersampling in 'play' category to balance it with 'challenge'
play_indexes = y_train[y_train['event'] == 'play'].index
assert y_train.value_counts()['play'] == len(play_indexes)
play_amount = y_train.value_counts()['play']
challenge_amount = y_train.value_counts()['challenge']
drop_play_indexes = np.sort(np.random.choice(a=play_indexes, 
                                             size=play_amount - challenge_amount, 
                                             replace=False))
assert len(drop_play_indexes) == play_amount - challenge_amount

# Building final training data
y_train.drop(drop_play_indexes, axis=0, inplace=True)
X_train = np.delete(X_train, drop_play_indexes, axis=0)

# Performing one-hot-encoding on our labels
current_feat = ['event']
for y in (y_train, y_test):
  encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
  encoder.fit(y[current_feat]) # Fit and transform data
  y[encoder.get_feature_names_out(list(current_feat))] = encoder.transform(y[current_feat])
  y = y.drop(columns=current_feat)

# Deleting variable to free RAM
del X

# input_shape = X_train.shape[1], X_train.shape[2], X_train.shape[3]

# model = Sequential([
#     Rescaling(1/255, 
#               input_shape=input_shape),   
#     Conv2D(96, 
#            11, 
#            strides=4, 
#            activation='relu'),
#     MaxPooling2D(pool_size=3, 
#                  strides=2),
#     Conv2D(256, 
#            5, 
#            activation='relu'),
#     MaxPooling2D(pool_size=3, 
#                  strides=2),
#     Conv2D(384, 
#            3, 
#            activation='relu'),
#     Conv2D(384, 
#            3, 
#            activation='relu'),
#     Conv2D(256, 
#            3, 
#            activation='relu'),
#     MaxPooling2D(),
#     Flatten(),
#     Dense(256, 
#           activation='relu'),
#     Dense(256, 
#           activation='relu'),
#     Dense(3, 
#           activation='softmax')
# ])

# model.compile(loss='categorical_crossentropy',
#               optimizer='adam',
#               metrics='accuracy',
#               )

# callback = EarlyStopping(monitor='val_loss',
#                          patience=3,
#                          restore_best_weights=True
#                          )

# EPOCHS = 100
# model.fit(X_train, 
#           y_train, 
#           validation_split=0.2, 
#           batch_size=16, 
#           epochs=EPOCHS,
#           callbacks=[callback])

# INDEX_ACCURACY = 1
# print(f'\nAccuracy on the test set: {model.evaluate(X_test, y_test)[INDEX_ACCURACY]:.4f}')

# y_pred = model.predict(X_test)
# np.set_printoptions(threshold=sys.maxsize)
# print(y_pred)

Accuracy if classifying every class as ['pass', 'openplay']: 76.15%
Accuracy if classifying every class as a pass (in general) 82.66%


In [5]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1122, 480, 640, 3), (1122, 4), (976, 480, 640, 3), (976, 4))

Same procedure as pipeline above:

In [None]:
import os
import pandas as pd
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Rescaling, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
! ls '/content/drive/'

In [None]:
os.chdir('/content/drive/MyDrive/Bundesliga')

In [None]:
! pwd

In [None]:
# ! unzip dfl-bundesliga-data-shootout.zip

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Bundesliga/train.csv')

In [None]:
# df['time_ms'] = round(df['time'] * 1000).astype(int)

In [None]:
display(df.head())
df.shape

In [None]:
df_events = df[~df['event_attributes'].isna()].copy()

In [None]:
# df_events['frame_id'] = df_events['video_id'] + df_events['time_ms'].astype(str)

In [None]:
# # Reordering columns
# df_events = df_events[['video_id', 'frame_id', 'time', 
#                        'time_ms', 'event', 'event_attributes']]

In [None]:
display(df_events.head())
df_events.shape

Building very basic baseline model:

In [None]:
df_pass = df[df['event_attributes'].str[:6] == "['pass"]

In [None]:
display(df_pass.head())
df_pass.shape

In [None]:
df_events['event_attributes'].value_counts()

In [None]:
print(f"Accuracy if classifying every class as {df_events['event_attributes'].describe()['top']}: "
      f"{df_events['event_attributes'].describe()['freq'] / df_events['event_attributes'].describe()['count']:.2%}")

In [None]:
print(f'Accuracy if classifying every class as a pass (in general) {df_pass.shape[0] / df_events.shape[0]:.2%}')

In [None]:
df_events['event'].value_counts()

In [None]:
df_video1 = df_events[df_events['video_id'] == '1606b0e6_0']
df_video1

In [None]:
df_video2 = df_events[df_events['video_id'] == '1606b0e6_1']
df_video2

In [None]:
y_train_val = df_video1['event']
y_train_val.to_frame()

In [None]:
y_test = df_video2['event']
y_test.to_frame()

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

current_feat = ['event']

print('Shape of y_test before transformation: ', y_test.shape)
y_test = y_test.to_frame()

# Fit and transform data
encoder.fit(y_test[current_feat])
y_test[encoder.get_feature_names_out(list(current_feat))] = encoder.transform(y_test[current_feat])
y_test = y_test.drop(columns=current_feat)

print('Shape of y_test after transformation: ', y_test.shape)

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

current_feat = ['event']

print('Shape of y_train_val before transformation: ', y_train_val.shape)
y_train_val = y_train_val.to_frame()

# Fit and transform data
encoder.fit(y_train_val[current_feat])
y_train_val[encoder.get_feature_names_out(list(current_feat))] = encoder.transform(y_train_val[current_feat])
y_train_val = y_train_val.drop(columns=current_feat)

print('Shape of y_train_val after transformation: ', y_train_val.shape)

Files without compressing:

In [None]:
# X_train = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/1606b0e6_0.npy')
# X_train2 = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/1606b0e6_1.npy')
# X_train3 = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/35bd9041_0.npy')
# X_train4 = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/35bd9041_1.npy')
# X_train5 = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/3c993bd2_0.npy')

In [None]:
# X_train.shape
# X_train2.shape, X_train3.shape, X_train4.shape, X_train5.shape

In [None]:
# X_train.max()
# X_train2.max(), X_train3.max(), X_train4.max(), X_train5.max()

# Working with compressed pictures:

## Training the model with just the first video and original (unbalanced) data:

In [None]:
X_train_val = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/CompressedPic/1606b0e6_0.npz')

In [None]:
X_test = np.load('/content/drive/MyDrive/Bundesliga/PicsFromVids/CompressedPic/1606b0e6_1.npz')
X_test = X_test.f.arr_0

In [None]:
X_train_val = X_train_val.f.arr_0

In [None]:
X_train_val.f.arr_0.shape

In [None]:
X_test.shape

In [None]:
input_shape = X_train_val.shape[1], X_train_val.shape[2], X_train_val.shape[3]

model = Sequential([
    Rescaling(1/255, 
              input_shape=input_shape),   
    Conv2D(96, 
           11, 
           strides=4, 
           activation='relu'),
    MaxPooling2D(pool_size=3, 
                 strides=2),
    Conv2D(256, 
           5, 
           activation='relu'),
    MaxPooling2D(pool_size=3, 
                 strides=2),
    Conv2D(384, 
           3, 
           activation='relu'),
    Conv2D(384, 
           3, 
           activation='relu'),
    Conv2D(256, 
           3, 
           activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(256, 
          activation='relu'),
    Dense(256, 
          activation='relu'),
    Dense(3, 
          activation='softmax')
])

In [None]:
model.input_shape, model.output_shape

In [None]:
X_train_val.shape, y_train_val.shape

In [None]:
y_train_val.value_counts()

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics='accuracy',
              )

In [None]:
callback = EarlyStopping(monitor='val_loss',
                         patience=5,
                         restore_best_weights=True
                         )

In [None]:
EPOCHS = 100
model.fit(X_train_val, 
          y_train_val, 
          validation_split=0.2, 
          batch_size=16, 
          epochs=EPOCHS,
          callbacks=[callback])

In [None]:
INDEX_ACCURACY = 1
print(f'\nAccuracy on the test set: {model.evaluate(X_test, y_test)[INDEX_ACCURACY]:.4f}')

In [None]:
y_pred = model.predict(X_test)

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
y_pred

## Training the model with all videos and balanced (undersampling) data:

In [None]:
df_events

In [None]:
df_events.reset_index(drop=True, inplace=True)
df_events

In [None]:
video_ids = df_events['video_id'].unique().tolist()
train_videos = video_ids[:9]
test_videos = video_ids[9:]

In [None]:
train_videos

In [None]:
test_videos

In [None]:
X = list()
for video_id in video_ids:
  current_video = np.load(f'/content/drive/MyDrive/Bundesliga/PicsFromVids/CompressedPic/{video_id}.npz')
  current_video = current_video.f.arr_0
  X.append(current_video)

In [None]:
X = np.concatenate(X, axis=0)

In [None]:
X.shape

In [None]:
train_idx = df_events[df_events['video_id'].isin(train_videos)].index
test_idx = df_events[df_events['video_id'].isin(test_videos)].index

In [None]:
train_idx

In [None]:
X_train = X[train_idx]
X_test = X[test_idx]

In [None]:
X_train.shape

In [None]:
X_test.shape

In [None]:
assert df_events.shape[0] == X_train.shape[0] + X_test.shape[0]

In [None]:
y_train = df_events[df_events.index.isin(train_idx) ]['event'].to_frame()
y_test = df_events[df_events.index.isin(test_idx) ]['event'].to_frame()

In [None]:
y_train.value_counts()

In [None]:
play_indexes = y_train[y_train['event'] == 'play'].index

In [None]:
assert y_train.value_counts()['play'] == len(play_indexes)

In [None]:
play_amount = y_train.value_counts()['play']
challenge_amount = y_train.value_counts()['challenge']
challenge_amount

In [None]:
drop_play_indexes = np.sort(np.random.choice(a=play_indexes, 
                                             size=play_amount - challenge_amount, 
                                             replace=False))
drop_play_indexes[-10:]

In [None]:
assert len(drop_play_indexes) == play_amount - challenge_amount

Deleting rows to perform and undersampling in the class 'play':

In [None]:
y_train.drop(drop_play_indexes, axis=0, inplace=True)
y_train.shape

In [None]:
y_train.value_counts()

In [None]:
X_train.shape

Deleting the same rows from X_train:

In [None]:
X_train = np.delete(X_train, drop_play_indexes, axis=0)

In [None]:
X_train.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

current_feat = ['event']

print(f'Shape of y_train before transformation: ', y_train.shape)

# Fit and transform data
encoder.fit(y_train[current_feat])
y_train[encoder.get_feature_names_out(list(current_feat))] = encoder.transform(y_train[current_feat])
y_train = y_train.drop(columns=current_feat)

print(f'Shape of y_train after transformation: ', y_train.shape)

In [None]:
encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

current_feat = ['event']

print(f'Shape of y_train before transformation: ', y_test.shape)

# Fit and transform data
encoder.fit(y_test[current_feat])
y_test[encoder.get_feature_names_out(list(current_feat))] = encoder.transform(y_test[current_feat])
y_test = y_test.drop(columns=current_feat)

print(f'Shape of y_test after transformation: ', y_test.shape)

In [None]:
input_shape = X_train.shape[1], X_train.shape[2], X_train.shape[3]

model = Sequential([
    Rescaling(1/255, 
              input_shape=input_shape),   
    Conv2D(96, 
           11, 
           strides=4, 
           activation='relu'),
    MaxPooling2D(pool_size=3, 
                 strides=2),
    Conv2D(256, 
           5, 
           activation='relu'),
    MaxPooling2D(pool_size=3, 
                 strides=2),
    Conv2D(384, 
           3, 
           activation='relu'),
    Conv2D(384, 
           3, 
           activation='relu'),
    Conv2D(256, 
           3, 
           activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(256, 
          activation='relu'),
    Dense(256, 
          activation='relu'),
    Dense(3, 
          activation='softmax')
])

In [None]:
model.input_shape, model.output_shape

In [None]:
X_train.shape, y_train.shape

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics='accuracy',
              )

In [None]:
callback = EarlyStopping(monitor='val_loss',
                         patience=3,
                         restore_best_weights=True
                         )

In [None]:
EPOCHS = 100
model.fit(X_train, 
          y_train, 
          validation_split=0.2, 
          batch_size=16, 
          epochs=EPOCHS,
          callbacks=[callback])

In [None]:
INDEX_ACCURACY = 1
print(f'\nAccuracy on the test set: {model.evaluate(X_test, y_test)[INDEX_ACCURACY]:.4f}')

In the test set the data is unbalanced and that's why we see such a difference between the validation and the test set.

In [None]:
y_pred = model.predict(X_test)

In [None]:
import sys
np.set_printoptions(threshold=sys.maxsize)
y_pred