# 1. Import

In [35]:
import pickle
import pandas as pd
from tqdm import tqdm
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split 

In [36]:
file_path_0 = "../multisports_GT.pkl"
file_path_1 = "../multisports_half_test.pkl"
file_path_2 = "../multisports_test.pkl"

data_0 = pickle.load(open(file_path_0, "rb"))
data_1 = pickle.load(open(file_path_1, "rb"))
data_2 = pickle.load(open(file_path_2, "rb"))

In [37]:
print(data_0.keys())
print(data_1.keys())
print(data_2.keys())

dict_keys(['labels', 'train_videos', 'test_videos', 'nframes', 'resolution', 'gttubes'])
dict_keys(['test_videos'])
dict_keys(['nframes', 'test_videos', 'resolution'])


In [38]:
df_label = pd.DataFrame(data_0['labels'],columns=['label'])
df_label['action'] = df_label['label'].apply(lambda x: ' '.join(x.split(' ')[1:]))
df_label['category'] = df_label['label'].apply(lambda x: x.split(' ')[0])
df_label['category'] = df_label['category'].str.replace('aerobic','aerobic_gymnastics')
df_label = df_label.reset_index(drop=False,names='action_idx')

df_label

Unnamed: 0,action_idx,label,action,category
0,0,aerobic push up,push up,aerobic_gymnastics
1,1,aerobic explosive push up,explosive push up,aerobic_gymnastics
2,2,aerobic explosive support,explosive support,aerobic_gymnastics
3,3,aerobic leg circle,leg circle,aerobic_gymnastics
4,4,aerobic helicopter,helicopter,aerobic_gymnastics
...,...,...,...,...
61,61,basketball sag,sag,basketball
62,62,basketball screen,screen,basketball
63,63,basketball pass-inbound,pass-inbound,basketball
64,64,basketball save,save,basketball


In [39]:
df_tmp_1 = pd.DataFrame(data_0['train_videos'][0],columns=['original_video_name'])
df_tmp_2 = pd.DataFrame(data_0['test_videos'][0],columns=['original_video_name'])

df_tmp_1['video'] = df_tmp_1['original_video_name'].apply(lambda x: x.split('/')[1])

# df_tmp_1['category'] = df_tmp_1['original_video_name'].apply(lambda x: 'aerobic' if x.split('/')[0] == 'aerobic_gymnastics' else x.split('/')[0])
df_tmp_1['category'] = df_tmp_1['original_video_name'].apply(lambda x: x.split('/')[0])
df_tmp_1['split'] = 'train'

df_tmp_2['video'] = df_tmp_2['original_video_name'].apply(lambda x: x.split('/')[1])

# df_tmp_2['category'] = df_tmp_2['original_video_name'].apply(lambda x: 'aerobic' if x.split('/')[0] == 'aerobic_gymnastics' else x.split('/')[0])
df_tmp_2['category'] = df_tmp_2['original_video_name'].apply(lambda x: x.split('/')[0])
df_tmp_2['split'] = 'test'

df_tmp_3 = pd.DataFrame({'original_video_name' : data_0['nframes'].keys(),
              'nframes': data_0['nframes'].values()})

df_video = pd.concat([df_tmp_1, df_tmp_2]).reset_index(drop=True)

df_video = pd.merge(df_video, df_tmp_3, how='left', on='original_video_name')

df_video['resolution'] = [(720,1280) for _ in range(len(df_video))]

df_video = df_video.reindex(columns = ['video','category','nframes','resolution','split','original_video_name'])

df_video

Unnamed: 0,video,category,nframes,resolution,split,original_video_name
0,v_aqMgwPExjD0_c001,aerobic_gymnastics,700,"(720, 1280)",train,aerobic_gymnastics/v_aqMgwPExjD0_c001
1,v_yaKOumdXwbU_c019,aerobic_gymnastics,775,"(720, 1280)",train,aerobic_gymnastics/v_yaKOumdXwbU_c019
2,v_NzCihhjR_NE_c113,aerobic_gymnastics,777,"(720, 1280)",train,aerobic_gymnastics/v_NzCihhjR_NE_c113
3,v_NzCihhjR_NE_c071,aerobic_gymnastics,451,"(720, 1280)",train,aerobic_gymnastics/v_NzCihhjR_NE_c071
4,v_NzCihhjR_NE_c031,aerobic_gymnastics,827,"(720, 1280)",train,aerobic_gymnastics/v_NzCihhjR_NE_c031
...,...,...,...,...,...,...
2124,v_I-tZTTBt3rc_c001,basketball,250,"(720, 1280)",test,basketball/v_I-tZTTBt3rc_c001
2125,v_I-tZTTBt3rc_c002,basketball,775,"(720, 1280)",test,basketball/v_I-tZTTBt3rc_c002
2126,v_jZj2UkglB9E_c006,basketball,351,"(720, 1280)",test,basketball/v_jZj2UkglB9E_c006
2127,v_kguAFIlDI4E_c012,basketball,575,"(720, 1280)",test,basketball/v_kguAFIlDI4E_c012


In [40]:
samples = {'original_video_name' : [],    
            'action_idx' : [],
            'action_start_frame' : [],
            'action_count_frame' : []}

for video in data_0['gttubes']:
    # print(video)
    for cls in data_0['gttubes'][video]:
        # print(cls)
        # print(len(data_2['gttubes'][video][cls]))
        for action in data_0['gttubes'][video][cls]:
            # print(f' len: {len(action)}')
            # print(int(action[0][0]))
            samples['original_video_name'].append(video)
            samples['action_idx'].append(cls)
            samples['action_count_frame'].append(len(action))
            samples['action_start_frame'].append(int(action[0][0]))

df_samples = pd.DataFrame(samples)

df_samples = pd.merge(df_samples, df_label,how='left', on='action_idx')
df_samples = df_samples.drop(columns=['label'])
df_samples = pd.merge(df_samples, df_video[['original_video_name','split']], how='left', on='original_video_name')

df_samples[['action_start_frame','action_count_frame']].describe()

Unnamed: 0,action_start_frame,action_count_frame
count,24999.0,24999.0
mean,359.24201,23.897916
std,339.755521,23.511188
min,1.0,2.0
25%,129.5,11.0
50%,267.0,17.0
75%,497.0,26.0
max,3600.0,438.0


In [41]:
df_samples.head(5)

Unnamed: 0,original_video_name,action_idx,action_start_frame,action_count_frame,action,category,split
0,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,377,22,bent leg(s) jump,aerobic_gymnastics,train
1,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,399,26,bent leg(s) jump,aerobic_gymnastics,train
2,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,378,21,bent leg(s) jump,aerobic_gymnastics,train
3,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,399,25,bent leg(s) jump,aerobic_gymnastics,train
4,aerobic_gymnastics/v_aqMgwPExjD0_c001,9,561,47,illusion,aerobic_gymnastics,train


In [42]:
df_samples[(df_samples['category'] == 'basketball') & (df_samples['split'] == 'train')]['action'].value_counts()

action
pass                       1369
dribble                    1010
interfere shot              362
2-point shot                327
screen                      239
drive                       236
pick-and-roll defensive     221
3-point shot                169
sag                         168
defensive rebound           163
pass-inbound                114
offensive rebound            46
free throw                   34
pass steal                   29
block                        16
dribble steal                16
jump ball                    10
save                          3
Name: count, dtype: int64

In [43]:
df_samples[(df_samples['category'] == 'basketball') & (df_samples['split'] == 'test')]['action'].value_counts()

action
pass                       541
dribble                    379
2-point shot               150
interfere shot             126
drive                       88
screen                      74
pick-and-roll defensive     64
defensive rebound           63
sag                         57
pass-inbound                46
3-point shot                41
pass steal                  17
free throw                  16
offensive rebound           16
block                       14
dribble steal               13
jump ball                    8
save                         2
Name: count, dtype: int64

# 2. Data preparation

In [45]:
df_basketball = pd.read_csv("./metadata_basketball.csv")

df_basketball

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video
0,48_0001_v_-6Os86HzwCs_c001.mp4,48,basketball,6,False,v_-6Os86HzwCs_c001.mp4
1,48_0002_v_-6Os86HzwCs_c001.mp4,48,basketball,4,False,v_-6Os86HzwCs_c001.mp4
2,48_0003_v_-6Os86HzwCs_c001.mp4,48,basketball,5,False,v_-6Os86HzwCs_c001.mp4
3,48_0004_v_-6Os86HzwCs_c001.mp4,48,basketball,6,False,v_-6Os86HzwCs_c001.mp4
4,50_0001_v_-6Os86HzwCs_c001.mp4,50,basketball,36,False,v_-6Os86HzwCs_c001.mp4
...,...,...,...,...,...,...
6242,57_0001_v_V4tIg2zSbWU_c010.mp4,57,basketball,6,False,v_V4tIg2zSbWU_c010.mp4
6243,60_0001_v_V4tIg2zSbWU_c010.mp4,60,basketball,16,False,v_V4tIg2zSbWU_c010.mp4
6244,60_0002_v_V4tIg2zSbWU_c010.mp4,60,basketball,11,False,v_V4tIg2zSbWU_c010.mp4
6245,52_0001_v_V4tIg2zSbWU_c010.mp4,52,basketball,12,True,v_V4tIg2zSbWU_c010.mp4


In [46]:
df_basketball['resize'].value_counts()

resize
False    5074
True     1173
Name: count, dtype: int64

In [47]:
df_football = pd.read_csv("./metadata_football.csv")
print(len(df_basketball) + len(df_football))

df_football 


14544


Unnamed: 0,video_name,action_idx,category,frames,resize,source_video
0,39_0001_v_-hhDbvY5aAM_c001.mp4,39,football,12,False,v_-hhDbvY5aAM_c001.mp4
1,39_0002_v_-hhDbvY5aAM_c001.mp4,39,football,13,False,v_-hhDbvY5aAM_c001.mp4
2,39_0003_v_-hhDbvY5aAM_c001.mp4,39,football,8,False,v_-hhDbvY5aAM_c001.mp4
3,39_0004_v_-hhDbvY5aAM_c001.mp4,39,football,11,False,v_-hhDbvY5aAM_c001.mp4
4,36_0001_v_-hhDbvY5aAM_c001.mp4,36,football,10,False,v_-hhDbvY5aAM_c001.mp4
...,...,...,...,...,...,...
8292,39_0005_v_ZcFbmdCxDhQ_c010.mp4,39,football,15,False,v_ZcFbmdCxDhQ_c010.mp4
8293,39_0006_v_ZcFbmdCxDhQ_c010.mp4,39,football,12,False,v_ZcFbmdCxDhQ_c010.mp4
8294,39_0007_v_ZcFbmdCxDhQ_c010.mp4,39,football,12,False,v_ZcFbmdCxDhQ_c010.mp4
8295,43_0001_v_ZcFbmdCxDhQ_c010.mp4,43,football,14,False,v_ZcFbmdCxDhQ_c010.mp4


In [48]:
df_football['resize'].value_counts()

resize
False    7721
True      576
Name: count, dtype: int64

In [49]:
df_volleyball = pd.read_csv("./metadata_volleyball.csv")
print(len(df_basketball) + len(df_football) + len(df_volleyball))

df_volleyball

19387


Unnamed: 0,video_name,action_idx,category,frames,resize,source_video
0,21_0001_v_0kUtTtmLaJA_c001.mp4,21,volleyball,32,False,v_0kUtTtmLaJA_c001.mp4
1,23_0001_v_0kUtTtmLaJA_c001.mp4,23,volleyball,10,False,v_0kUtTtmLaJA_c001.mp4
2,26_0001_v_0kUtTtmLaJA_c001.mp4,26,volleyball,22,False,v_0kUtTtmLaJA_c001.mp4
3,30_0001_v_0kUtTtmLaJA_c001.mp4,30,volleyball,18,False,v_0kUtTtmLaJA_c001.mp4
4,22_0001_v_0kUtTtmLaJA_c001.mp4,22,volleyball,15,False,v_0kUtTtmLaJA_c001.mp4
...,...,...,...,...,...,...
4838,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4
4839,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4
4840,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4
4841,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4


In [50]:
df_volleyball['resize'].value_counts()

resize
False    4429
True      414
Name: count, dtype: int64

In [51]:
df_aerobic_gymnastics = pd.read_csv("./metadata_aerobic_gymnastics.csv")
print(len(df_basketball) + len(df_football) + len(df_volleyball) + len(df_aerobic_gymnastics))

df_aerobic_gymnastics

24999


Unnamed: 0,video_name,action_idx,category,frames,resize,source_video
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4
...,...,...,...,...,...,...
5607,09_0001_v__wAgwttPYaQ_c020.mp4,9,aerobic_gymnastics,49,True,v__wAgwttPYaQ_c020.mp4
5608,09_0002_v__wAgwttPYaQ_c020.mp4,9,aerobic_gymnastics,44,True,v__wAgwttPYaQ_c020.mp4
5609,09_0003_v__wAgwttPYaQ_c020.mp4,9,aerobic_gymnastics,47,True,v__wAgwttPYaQ_c020.mp4
5610,09_0004_v__wAgwttPYaQ_c020.mp4,9,aerobic_gymnastics,47,True,v__wAgwttPYaQ_c020.mp4


In [53]:
df_sports = pd.concat([df_aerobic_gymnastics,df_basketball,df_football,df_volleyball])

df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4
...,...,...,...,...,...,...
4838,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4
4839,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4
4840,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4
4841,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4


In [54]:
len(df_sports['source_video'].drop_duplicates())

2129

In [55]:
df_samples['source_video'] = df_samples['original_video_name'].apply(lambda x: x.split('/')[1] + '.mp4')
df_samples

Unnamed: 0,original_video_name,action_idx,action_start_frame,action_count_frame,action,category,split,source_video
0,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,377,22,bent leg(s) jump,aerobic_gymnastics,train,v_aqMgwPExjD0_c001.mp4
1,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,399,26,bent leg(s) jump,aerobic_gymnastics,train,v_aqMgwPExjD0_c001.mp4
2,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,378,21,bent leg(s) jump,aerobic_gymnastics,train,v_aqMgwPExjD0_c001.mp4
3,aerobic_gymnastics/v_aqMgwPExjD0_c001,10,399,25,bent leg(s) jump,aerobic_gymnastics,train,v_aqMgwPExjD0_c001.mp4
4,aerobic_gymnastics/v_aqMgwPExjD0_c001,9,561,47,illusion,aerobic_gymnastics,train,v_aqMgwPExjD0_c001.mp4
...,...,...,...,...,...,...,...,...
24994,basketball/v_K5PFNpiP5aY_c001,50,43,143,dribble,basketball,train,v_K5PFNpiP5aY_c001.mp4
24995,basketball/v_K5PFNpiP5aY_c001,51,284,19,3-point shot,basketball,train,v_K5PFNpiP5aY_c001.mp4
24996,basketball/v_K5PFNpiP5aY_c001,62,261,38,screen,basketball,train,v_K5PFNpiP5aY_c001.mp4
24997,basketball/v_K5PFNpiP5aY_c001,59,295,19,interfere shot,basketball,train,v_K5PFNpiP5aY_c001.mp4


In [56]:
df_samples.iloc[:,6:].drop_duplicates()

Unnamed: 0,split,source_video
0,train,v_aqMgwPExjD0_c001.mp4
6,train,v_yaKOumdXwbU_c019.mp4
21,train,v_NzCihhjR_NE_c113.mp4
25,train,v_NzCihhjR_NE_c071.mp4
29,train,v_NzCihhjR_NE_c031.mp4
...,...,...
24934,train,v_EGtCLk5BksU_c601.mp4
24943,train,v_FH9KE8FSmig_c001.mp4
24966,train,v_rYquruzj9hc_c004.mp4
24983,train,v_ERsBxGv2JxY_c006.mp4


In [57]:
df_sports = pd.merge(df_sports, df_samples.iloc[:,6:].drop_duplicates(), how='left', on='source_video')
df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4,train
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4,train
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4,train
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4,train
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4,train
...,...,...,...,...,...,...,...
24994,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train
24995,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train
24996,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4,train
24997,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4,train


In [58]:
df_sports = pd.merge(df_sports, df_label[['action_idx','action']], how='left', on='action_idx')
df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split,action
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4,train,support
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4,train,support
...,...,...,...,...,...,...,...,...
24994,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24995,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24996,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4,train,first pass
24997,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4,train,second pass


In [59]:
df_sports.to_csv('../metadata_sports.csv',index=False)

In [60]:
df_sports = pd.read_csv("../metadata_sports.csv")

In [61]:
df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split,action
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4,train,support
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4,train,support
...,...,...,...,...,...,...,...,...
24994,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24995,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24996,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4,train,first pass
24997,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4,train,second pass


# 3. Deleting videos with errors

In [63]:
df_sports['error'] = False

errors =  {
            "basketball":

                {
                    "train": [
                                "48_0001_v_BdD9xu0E2H4_c004.mp4",
                                "48_0002_v_5ekaksddqrc_c004.mp4",
                                "48_0002_v_A4OJhlI6hgc_c005.mp4",
                                "48_0004_v_JbiWR0AWKys_c001.mp4",
                                "50_0002_v_A4OJhlI6hgc_c601.mp4",
                                "50_0004_v_JbiWR0AWKys_c005.mp4",
                                # error frames
                                "48_0003_v_JbiWR0AWKys_c005.mp4",
                                "48_0007_v_rYquruzj9hc_c001.mp4"
                            ],

                    "test": [
                                "63_0001_v_SHFVKZ6HJc8_c007.mp4"
                            ],

                    "val": [
                                "48_0001_v_SHFVKZ6HJc8_c600.mp4",
                                "48_0004_v_SHFVKZ6HJc8_c012.mp4",
                                "57_0001_v_kguAFIlDI4E_c011.mp4",
                                "63_0001_v_kguAFIlDI4E_c009.mp4",
                                # error frames
                                "48_0003_v_kguAFIlDI4E_c011.mp4"
                            ],
                },

            "volleyball" : 

                {
                    "train": [
                                "24_0002_v_pY29uMQxscs_c010.mp4",
                                "25_0001_v_UoIX44n61NY_c004.mp4",
                                # error frames
                                "22_0001_v_4-EmEtrturE_c009.mp4"
                            ],

                    "test": [
                                "21_0001_v_Fsbu9m-6xQ4_c009.mp4",
                                "21_0001_v_Fsbu9m-6xQ4_c010.mp4"
                            ],

                    "val": [    
                                "21_0001_v_Fsbu9m-6xQ4_c011.mp4"
                            ],
                },
            
            
            "aerobic_gymnastics" : 

                {
                    "train": [
                                "04_0006_v_NzCihhjR_NE_c090.mp4",
                                "08_0003_v_NzCihhjR_NE_c022.mp4",
                                "10_0005_v_NzCihhjR_NE_c073.mp4",
                                "05_0002_v_IzrY6nmu9H4_c015.mp4",
                                "08_0005_v_NzCihhjR_NE_c073.mp4"
                            ],

                    "test": [],

                    "val": [
                                "05_0003_v_HIow7_XktlQ_c010.mp4"
                            ],
                },

            
            "football" : 
                {
                    "train": [
                                "34_0001_v_UAbX7wld9vg_c001.mp4",
                                "34_0002_v_UAbX7wld9vg_c001.mp4",
                                "35_0001_v_gQNyhv8y0QY_c602.mp4",
                                "35_0002_v_4f6xLy4pop0_c015.mp4",
                                "35_0002_v_5uBLdpt5-oU_c603.mp4",
                                "35_0002_v_ervkVzoFJ5w_c010.mp4",
                                "35_0002_v_iIxMOsCGH58_c006.mp4",
                                "35_0003_v_kEyqyYJhIIg_c005.mp4",
                                "35_0004_v_aQawj_MHBu0_c601.mp4",
                                "35_0004_v_GGhhbMOp6yY_c009.mp4",
                                "35_0005_v_dw7LOz17Omg_c070.mp4",
                                "35_0008_v_aQawj_MHBu0_c005.mp4",
                                "35_0011_v_FFdQL8Ljrag_c008.mp4",
                                "36_0002_v_KxBv_sS443w_c002.mp4",
                                "37_0001_v_aQawj_MHBu0_c007.mp4",
                                "37_0001_v_IW0q3RA_O7g_c006.mp4",
                                "39_0001_v_9wLBkU3fs0k_c054.mp4",
                                "39_0001_v_gLZ4F6tU8M4_c008.mp4",
                                "39_0001_v_iIxMOsCGH58_c006.mp4",
                                "39_0001_v_nCBFmf-_52o_c003.mp4",
                                "39_0001_v_UAbX7wld9vg_c001.mp4",
                                "39_0001_v_UAbX7wld9vg_c009.mp4",
                                "39_0002_v_4f6xLy4pop0_c015.mp4",
                                "39_0002_v_5uBLdpt5-oU_c029.mp4",
                                "39_0002_v_8szWGZUtwEM_c001.mp4",
                                "39_0002_v_COtwS_2SWIo_c011.mp4",
                                "39_0002_v_COtwS_2SWIo_c036.mp4",
                                "39_0002_v_COtwS_2SWIo_c600.mp4",
                                "39_0002_v_ervkVzoFJ5w_c010.mp4",
                                "39_0003_v_2QhNRucNC7E_c604.mp4",
                                "39_0003_v_COtwS_2SWIo_c011.mp4",
                                "39_0003_v_COtwS_2SWIo_c600.mp4",
                                "39_0003_v_ZcFbmdCxDhQ_c009.mp4",
                                "39_0004_v_2QhNRucNC7E_c602.mp4",
                                "39_0004_v_ervkVzoFJ5w_c005.mp4",
                                "39_0004_v_FFdQL8Ljrag_c005.mp4",
                                "39_0004_v_gQNyhv8y0QY_c012.mp4",
                                "39_0004_v_kEyqyYJhIIg_c002.mp4",
                                "39_0004_v_UAbX7wld9vg_c007.mp4",
                                "39_0005_v_GGhhbMOp6yY_c009.mp4",
                                "39_0006_v_FFdQL8Ljrag_c005.mp4",
                                "39_0007_v_aQawj_MHBu0_c005.mp4",
                                "39_0007_v_COtwS_2SWIo_c011.mp4",
                                "39_0008_v_GGhhbMOp6yY_c007.mp4",
                                "39_0009_v_eMdTsex8Cyw_c008.mp4",
                                "39_0010_v_4f6xLy4pop0_c035.mp4",
                                "39_0011_v_eMdTsex8Cyw_c003.mp4",
                                "39_0017_v_kEyqyYJhIIg_c007.mp4",
                                "40_0001_v_4f6xLy4pop0_c015.mp4",
                                "40_0001_v_8szWGZUtwEM_c010.mp4",
                                "40_0001_v_9wLBkU3fs0k_c028.mp4",
                                "40_0001_v_COtwS_2SWIo_c011.mp4",
                                "40_0001_v_COtwS_2SWIo_c600.mp4",
                                "40_0001_v_UAbX7wld9vg_c002.mp4",
                                "40_0001_v_UAbX7wld9vg_c009.mp4",
                                "43_0001_v_aQawj_MHBu0_c005.mp4",
                                #error with frames
                                "39_0004_v_kEyqyYJhIIg_c003.mp4",
                                "39_0003_v_kEyqyYJhIIg_c003.mp4",
                                "37_0001_v_UAbX7wld9vg_c006.mp4",
                                "35_0005_v_0mcffpH2VTw_c028.mp4",
                                "39_0003_v_bzgW-jpfuNA_c002.mp4"
                    ],

                    "test": [
                                "39_0001_v_07uuHJ9yXBw_c603.mp4",
                                "39_0004_v_07uuHJ9yXBw_c603.mp4",
                                "39_0006_v_8Ndq_1uw98w_c600.mp4",
                                "39_0009_v_07uuHJ9yXBw_c016.mp4",
                                "40_0001_v_-hhDbvY5aAM_c008.mp4",
                                "40_0001_v_U8AcLO8xmkg_c010.mp4",
                                #error with frames
                                "39_0006_v_ITo3sCnpw_k_c012.mp4"
                            ],

                    "val": [
                                "37_0001_v_DjtFlW2eHFI_c017.mp4",
                                "39_0002_v_07uuHJ9yXBw_c600.mp4",
                                "39_0003_v_1yHWGw8DH4A_c606.mp4",
                                "39_0005_v_ITo3sCnpw_k_c009.mp4",
                                "40_0001_v_U8AcLO8xmkg_c010.mp4",
                                "39_0009_v_07uuHJ9yXBw_c016.mp4",
                                "40_0001_v_-hhDbvY5aAM_c008.mp4",
                                #errors with frames
                                "39_0003_v_07uuHJ9yXBw_c022.mp4"
                    ],
                }

                    
        }

for category in errors:
    for splits in errors[category]:
        for video in errors[category][splits]:
           df_sports.loc[df_sports[df_sports["video_name"] == video].index, 'error'] = True

In [64]:
df_sports.to_csv('../metadata_sports_with_errors.csv',index=False)

In [65]:
df_sports = df_sports[df_sports['error'] == False].reset_index(drop=True)

In [66]:
df_sports = df_sports.iloc[:,:-1]

In [67]:
df_sports.to_csv('../metadata_sports.csv',index=False)

df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split,action
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4,train,support
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4,train,support
...,...,...,...,...,...,...,...,...
24895,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24896,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block
24897,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4,train,first pass
24898,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4,train,second pass


# 4. New data split into train / val / test

## Basketball

In [68]:
df_sports['new_split'] = 'train'

In [69]:
df_sports[(df_sports['category'] == 'basketball') & (df_sports['split'] == 'train')]['action'].value_counts()

action
pass                       1363
dribble                    1008
interfere shot              362
2-point shot                327
screen                      239
drive                       236
pick-and-roll defensive     221
3-point shot                169
sag                         168
defensive rebound           163
pass-inbound                114
offensive rebound            46
free throw                   34
pass steal                   29
block                        16
dribble steal                16
jump ball                    10
save                          3
Name: count, dtype: int64

In [70]:
df_sports[(df_sports['category'] == 'basketball') & (df_sports['split'] == 'test')]['action'].value_counts()

action
pass                       538
dribble                    379
2-point shot               150
interfere shot             126
drive                       88
screen                      74
pick-and-roll defensive     64
defensive rebound           63
sag                         57
pass-inbound                44
3-point shot                41
free throw                  16
pass steal                  16
offensive rebound           16
block                       14
dribble steal               13
jump ball                    8
save                         2
Name: count, dtype: int64

In [71]:
X = df_sports[(df_sports['category'] == 'basketball') & (df_sports['split'] == 'test')].iloc[:,:2]
y = df_sports[(df_sports['category'] == 'basketball') & (df_sports['split'] == 'test')].iloc[:,1]

X_val, X_test, y_val, y_test = train_test_split(
    X,y, test_size=0.5, stratify=y, random_state=42)

print(f"val:{len(X_val.index)}")
print(f"test:{len(X_test.index)}")

val:854
test:855


In [72]:
df_sports.loc[X_val.index,'new_split'] = 'val'
df_sports.loc[X_test.index,'new_split'] = 'test'

df_sports[df_sports['category'] == 'basketball']['new_split'].value_counts()

new_split
train    4524
test      855
val       854
Name: count, dtype: int64

## Football

In [73]:
df_sports[(df_sports['category'] == 'football') & (df_sports['split'] == 'train')]['action'].value_counts()

action
short pass      1762
trap            1504
press            496
dribble          447
tackle           329
steal            242
clearance        209
long pass        182
shoot            174
aerial duels     156
cross            153
block            125
diving           124
through pass     117
throw             63
Name: count, dtype: int64

In [74]:
df_sports[(df_sports['category'] == 'football') & (df_sports['split'] == 'test')]['action'].value_counts()

action
short pass      643
trap            522
press           151
dribble         140
tackle          138
steal           106
clearance        72
aerial duels     65
cross            65
long pass        59
shoot            51
through pass     41
diving           39
block            31
throw            18
Name: count, dtype: int64

In [75]:
X = df_sports[(df_sports['category'] == 'football') & (df_sports['split'] == 'test')].iloc[:,:2]
y = df_sports[(df_sports['category'] == 'football') & (df_sports['split'] == 'test')].iloc[:,1]

X_val, X_test, y_val, y_test = train_test_split(
    X,y, test_size=0.5, stratify=y, random_state=42)

print(f"val:{len(X_val.index)}")
print(f"test:{len(X_test.index)}")

val:1070
test:1071


In [76]:
df_sports.loc[X_val.index,'new_split'] = 'val'
df_sports.loc[X_test.index,'new_split'] = 'test'

df_sports[df_sports['category'] == 'football']['new_split'].value_counts()

new_split
train    6083
test     1071
val      1070
Name: count, dtype: int64

## Volleyball

In [77]:
df_sports[(df_sports['category'] == 'volleyball') & (df_sports['split'] == 'train')]['action'].value_counts()

action
block                  1065
spike                   493
second pass             486
serve                   395
defend                  390
first pass              370
adjust                  115
dink                     85
protect                  50
no offensive attack      49
save                     35
second attack            13
Name: count, dtype: int64

In [78]:
df_sports[(df_sports['category'] == 'volleyball') & (df_sports['split'] == 'test')]['action'].value_counts()

action
block                  397
spike                  181
second pass            175
defend                 155
serve                  125
first pass             119
adjust                  42
dink                    29
protect                 21
no offensive attack     17
save                    17
second attack           13
Name: count, dtype: int64

In [79]:
X = df_sports[(df_sports['category'] == 'volleyball') & (df_sports['split'] == 'test')].iloc[:,:2]
y = df_sports[(df_sports['category'] == 'volleyball') & (df_sports['split'] == 'test')].iloc[:,1]

X_val, X_test, y_val, y_test = train_test_split(
    X,y, test_size=0.5, stratify=y, random_state=42)

print(f"val:{len(X_val.index)}")
print(f"test:{len(X_test.index)}")

val:645
test:646


In [80]:
df_sports.loc[X_val.index,'new_split'] = 'val'
df_sports.loc[X_test.index,'new_split'] = 'test'

df_sports[df_sports['category'] == 'volleyball']['new_split'].value_counts()

new_split
train    3546
test      646
val       645
Name: count, dtype: int64

## Aerobic_gymnastics

In [81]:
df_sports[(df_sports['category'] == 'aerobic_gymnastics') & (df_sports['split'] == 'train')]['action'].value_counts()

action
bent leg(s) jump      492
support               437
illusion              409
straddle jump         404
scissors leap         366
explosive push up     329
pike jump             294
helicopter            278
turn                  272
straight jump         250
split jump            196
explosive support     160
leg circle             83
push up                74
split                  68
v support              36
horizontal support     21
balance turn           11
off axis jump           6
butterfly jump          3
kick jump               3
Name: count, dtype: int64

In [82]:
df_sports[(df_sports['category'] == 'aerobic_gymnastics') & (df_sports['split'] == 'test')]['action'].value_counts()

action
straddle jump         153
bent leg(s) jump      150
illusion              142
support               140
scissors leap         118
helicopter            105
turn                  103
split jump             97
explosive push up      92
pike jump              81
straight jump          66
explosive support      49
push up                32
split                  28
v support              20
leg circle             19
horizontal support     10
butterfly jump          3
off axis jump           3
balance turn            3
Name: count, dtype: int64

In [83]:
X = df_sports[(df_sports['category'] == 'aerobic_gymnastics') & (df_sports['split'] == 'test')].iloc[:,:2]
y = df_sports[(df_sports['category'] == 'aerobic_gymnastics') & (df_sports['split'] == 'test')].iloc[:,1]

X_val, X_test, y_val, y_test = train_test_split(
    X,y, test_size=0.5, stratify=y, random_state=42)

print(f"val:{len(X_val.index)}")
print(f"test:{len(X_test.index)}")

val:707
test:707


In [84]:
df_sports.loc[X_val.index,'new_split'] = 'val'
df_sports.loc[X_test.index,'new_split'] = 'test'

df_sports[df_sports['category'] == 'aerobic_gymnastics']['new_split'].value_counts()

new_split
train    4192
val       707
test      707
Name: count, dtype: int64

In [85]:
df_sports.to_csv('../metadata_sports.csv',index=False)

In [86]:
del df_sports

In [87]:
df_sports = pd.read_csv("../metadata_sports.csv")

df_sports

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split,action,new_split
0,01_0001_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up,train
1,01_0002_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,31,True,v_-hyYa8ijq-8_c001.mp4,train,explosive push up,train
2,01_0003_v_-hyYa8ijq-8_c001.mp4,1,aerobic_gymnastics,32,False,v_-hyYa8ijq-8_c001.mp4,train,explosive push up,train
3,05_0001_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,89,False,v_-hyYa8ijq-8_c001.mp4,train,support,train
4,05_0002_v_-hyYa8ijq-8_c001.mp4,5,aerobic_gymnastics,87,False,v_-hyYa8ijq-8_c001.mp4,train,support,train
...,...,...,...,...,...,...,...,...,...
24895,22_0001_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block,train
24896,22_0002_v_zvVg4RxkekU_c020.mp4,22,volleyball,15,False,v_zvVg4RxkekU_c020.mp4,train,block,train
24897,23_0001_v_zvVg4RxkekU_c020.mp4,23,volleyball,17,False,v_zvVg4RxkekU_c020.mp4,train,first pass,train
24898,26_0001_v_zvVg4RxkekU_c020.mp4,26,volleyball,24,False,v_zvVg4RxkekU_c020.mp4,train,second pass,train


In [88]:
for cat in df_sports['category'].unique():
    print(cat)
    print(len(df_sports[df_sports['category'] == cat]['source_video'].drop_duplicates()))
    print(df_sports[df_sports['category'] == cat][['source_video','split']].drop_duplicates()['split'].value_counts())

aerobic_gymnastics
537
split
train    391
test     146
Name: count, dtype: int64
basketball
526
split
train    379
test     147
Name: count, dtype: int64
football
534
split
train    402
test     132
Name: count, dtype: int64
volleyball
531
split
train    402
test     129
Name: count, dtype: int64


In [89]:
for cat in df_sports['category'].unique():
    print(cat)
    print(len(df_sports[df_sports['category'] == cat]['source_video'].drop_duplicates()))
    print(df_sports[df_sports['category'] == cat][['source_video','split']].drop_duplicates()['split'].value_counts())

aerobic_gymnastics
537
split
train    391
test     146
Name: count, dtype: int64
basketball
526
split
train    379
test     147
Name: count, dtype: int64
football
534
split
train    402
test     132
Name: count, dtype: int64
volleyball
531
split
train    402
test     129
Name: count, dtype: int64


In [90]:
df_label[['category','action']].groupby('category').count()

Unnamed: 0_level_0,action
category,Unnamed: 1_level_1
aerobic_gymnastics,21
basketball,18
football,15
volleyball,12


In [91]:
for cat in df_sports['category'].unique():
    print(cat)
    print(len(df_sports[df_sports['category'] == cat]))
    print(df_sports[df_sports['category'] == cat][['video_name','split']]['split'].value_counts())
    print(df_sports[df_sports['category'] == cat][['video_name','split']]['split'].value_counts() / len(df_sports[df_sports['category'] == cat]))
    print(df_sports[df_sports['category'] == cat]['frames'].max())
    print(df_sports[df_sports['category'] == cat]['frames'].min())
    print(df_sports[df_sports['category'] == cat]['frames'].mean())
    print(df_sports[df_sports['category'] == cat]['resize'].value_counts() / len(df_sports[df_sports['category'] == cat]))
    print()
    

aerobic_gymnastics
5606
split
train    4192
test     1414
Name: count, dtype: int64
split
train    0.74777
test     0.25223
Name: count, dtype: float64
151
5
37.25579735997146
resize
True     0.546736
False    0.453264
Name: count, dtype: float64

basketball
6233
split
train    4524
test     1709
Name: count, dtype: int64
split
train    0.725814
test     0.274186
Name: count, dtype: float64
438
2
23.64495427562971
resize
False    0.811808
True     0.188192
Name: count, dtype: float64

football
8224
split
train    6083
test     2141
Name: count, dtype: int64
split
train    0.739664
test     0.260336
Name: count, dtype: float64
223
3
18.47227626459144
resize
False    0.929961
True     0.070039
Name: count, dtype: float64

volleyball
4837
split
train    3546
test     1291
Name: count, dtype: int64
split
train    0.733099
test     0.266901
Name: count, dtype: float64
61
3
18.18958031837916
resize
False    0.91441
True     0.08559
Name: count, dtype: float64



In [92]:
df_sports[df_sports['category'] == 'basketball']
print(df_sports[df_sports['category'] == 'basketball'][['video_name','split']]['split'].value_counts())
print(df_sports[df_sports['category'] == 'basketball'][['video_name','split']]['split'].value_counts() / len(df_sports[df_sports['category'] == 'basketball']))
print(df_sports[df_sports['category'] == 'basketball']['frames'].max())
print(df_sports[df_sports['category'] == 'basketball']['frames'].min())
print(df_sports[df_sports['category'] == 'basketball']['frames'].mean())
print(df_sports[df_sports['category'] == 'basketball']['resize'].value_counts() / len(df_sports[df_sports['category'] == 'basketball']))

split
train    4524
test     1709
Name: count, dtype: int64
split
train    0.725814
test     0.274186
Name: count, dtype: float64
438
2
23.64495427562971
resize
False    0.811808
True     0.188192
Name: count, dtype: float64


In [93]:
df_sports[df_sports['category'] == 'basketball']

Unnamed: 0,video_name,action_idx,category,frames,resize,source_video,split,action,new_split
5606,48_0001_v_-6Os86HzwCs_c001.mp4,48,basketball,6,False,v_-6Os86HzwCs_c001.mp4,test,pass,val
5607,48_0002_v_-6Os86HzwCs_c001.mp4,48,basketball,4,False,v_-6Os86HzwCs_c001.mp4,test,pass,test
5608,48_0003_v_-6Os86HzwCs_c001.mp4,48,basketball,5,False,v_-6Os86HzwCs_c001.mp4,test,pass,val
5609,48_0004_v_-6Os86HzwCs_c001.mp4,48,basketball,6,False,v_-6Os86HzwCs_c001.mp4,test,pass,val
5610,50_0001_v_-6Os86HzwCs_c001.mp4,50,basketball,36,False,v_-6Os86HzwCs_c001.mp4,test,dribble,test
...,...,...,...,...,...,...,...,...,...
11834,57_0001_v_V4tIg2zSbWU_c010.mp4,57,basketball,6,False,v_V4tIg2zSbWU_c010.mp4,train,pass steal,train
11835,60_0001_v_V4tIg2zSbWU_c010.mp4,60,basketball,16,False,v_V4tIg2zSbWU_c010.mp4,train,pick-and-roll defensive,train
11836,60_0002_v_V4tIg2zSbWU_c010.mp4,60,basketball,11,False,v_V4tIg2zSbWU_c010.mp4,train,pick-and-roll defensive,train
11837,52_0001_v_V4tIg2zSbWU_c010.mp4,52,basketball,12,True,v_V4tIg2zSbWU_c010.mp4,train,2-point shot,train


In [94]:
print(f"Liczba klas: {len(df_sports[df_sports['category'] == 'basketball']['action'].unique())}")

df_sports[df_sports['category'] == 'basketball']['action'].value_counts()

Liczba klas: 18


action
pass                       1901
dribble                    1387
interfere shot              488
2-point shot                477
drive                       324
screen                      313
pick-and-roll defensive     285
defensive rebound           226
sag                         225
3-point shot                210
pass-inbound                158
offensive rebound            62
free throw                   50
pass steal                   45
block                        30
dribble steal                29
jump ball                    18
save                          5
Name: count, dtype: int64

In [95]:
df_sports[df_sports['category'] == 'basketball']['action'].value_counts() / len(df_sports[df_sports['category'] == 'basketball']['action']) * 100

action
pass                       30.498957
dribble                    22.252527
interfere shot              7.829296
2-point shot                7.652816
drive                       5.198139
screen                      5.021659
pick-and-roll defensive     4.572437
defensive rebound           3.625862
sag                         3.609819
3-point shot                3.369164
pass-inbound                2.534895
offensive rebound           0.994706
free throw                  0.802182
pass steal                  0.721964
block                       0.481309
dribble steal               0.465266
jump ball                   0.288785
save                        0.080218
Name: count, dtype: float64

In [96]:
df_sports[(df_sports['category'] == 'basketball') 
          & (df_sports['split'] == 'train')]['action'].value_counts() 

action
pass                       1363
dribble                    1008
interfere shot              362
2-point shot                327
screen                      239
drive                       236
pick-and-roll defensive     221
3-point shot                169
sag                         168
defensive rebound           163
pass-inbound                114
offensive rebound            46
free throw                   34
pass steal                   29
block                        16
dribble steal                16
jump ball                    10
save                          3
Name: count, dtype: int64

In [97]:
df_sports[(df_sports['category'] == 'basketball') 
          & (df_sports['split'] == 'test')]['action'].value_counts() 

action
pass                       538
dribble                    379
2-point shot               150
interfere shot             126
drive                       88
screen                      74
pick-and-roll defensive     64
defensive rebound           63
sag                         57
pass-inbound                44
3-point shot                41
free throw                  16
pass steal                  16
offensive rebound           16
block                       14
dribble steal               13
jump ball                    8
save                         2
Name: count, dtype: int64

In [98]:
df_sports[(df_sports['category'] == 'basketball') 
          & (df_sports['split'] == 'train')]['action'].value_counts() / len(df_sports[(df_sports['category'] == 'basketball') & 
                                                                                      (df_sports['split'] == 'train')]['action']) * 100

action
pass                       30.128205
dribble                    22.281167
interfere shot              8.001768
2-point shot                7.228117
screen                      5.282935
drive                       5.216622
pick-and-roll defensive     4.885057
3-point shot                3.735632
sag                         3.713528
defensive rebound           3.603006
pass-inbound                2.519894
offensive rebound           1.016799
free throw                  0.751547
pass steal                  0.641026
block                       0.353669
dribble steal               0.353669
jump ball                   0.221043
save                        0.066313
Name: count, dtype: float64

In [99]:
df_sports[df_sports['category'] == 'basketball']['action'].value_counts() / len(df_sports[df_sports['category'] == 'basketball']['action']) * 100

action
pass                       30.498957
dribble                    22.252527
interfere shot              7.829296
2-point shot                7.652816
drive                       5.198139
screen                      5.021659
pick-and-roll defensive     4.572437
defensive rebound           3.625862
sag                         3.609819
3-point shot                3.369164
pass-inbound                2.534895
offensive rebound           0.994706
free throw                  0.802182
pass steal                  0.721964
block                       0.481309
dribble steal               0.465266
jump ball                   0.288785
save                        0.080218
Name: count, dtype: float64

In [100]:
df_sports[(df_sports['category'] == 'basketball') 
          & (df_sports['new_split'] == 'val')]['action'].value_counts() 

action
pass                       269
dribble                    189
2-point shot                75
interfere shot              63
drive                       44
screen                      37
pick-and-roll defensive     32
defensive rebound           31
sag                         28
pass-inbound                22
3-point shot                21
offensive rebound            8
free throw                   8
pass steal                   8
dribble steal                7
block                        7
jump ball                    4
save                         1
Name: count, dtype: int64

In [101]:
df_sports[(df_sports['category'] == 'basketball') 
          & (df_sports['new_split'] == 'test')]['action'].value_counts() 

action
pass                       269
dribble                    190
2-point shot                75
interfere shot              63
drive                       44
screen                      37
defensive rebound           32
pick-and-roll defensive     32
sag                         29
pass-inbound                22
3-point shot                20
free throw                   8
offensive rebound            8
pass steal                   8
block                        7
dribble steal                6
jump ball                    4
save                         1
Name: count, dtype: int64