#### Kaggle

In [60]:
from tqdm import tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import shutil

In [5]:
metadata_dir = '/kaggle/input/asl-citizen/ASL_Citizen/splits'
videos = '/kaggle/input/asl-citizen/ASL_Citizen/videos'
val_data, train_data, test_data = [os.path.join(metadata_dir, d) for d in os.listdir(metadata_dir)]

In [7]:
train_df = pd.read_csv(train_data)
train_df.shape

(40154, 4)

In [8]:
train_df.head()

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code
0,P1,15890366051589533-APPLE.mp4,APPLE,A_03_054
1,P1,35618482303951104-IMPOSSIBLE.mp4,IMPOSSIBLE,B_01_032
2,P1,6958143575951994-PARK.mp4,PARK,E_03_028
3,P1,8006032738002744-SOCCER 2.mp4,SOCCER2,F_03_032
4,P1,37542279833186454-STINK.mp4,STINK,H_01_064


In [10]:
train_df.Gloss.sort_values().unique()

array(['1DOLLAR', '5DOLLARS', '8HOUR', ..., 'ZOO', 'ZOOMIN', 'ZOOMOFF'],
      dtype=object)

In [13]:
frequency = train_df.groupby('Gloss').size().reset_index(name = 'num_videos').sort_values(by='num_videos', ascending = False)
frequency

Unnamed: 0,Gloss,num_videos
670,DOG1,24
1198,HURDLE/TRIP1,22
209,BITE1,21
283,BREAKFAST1,21
609,DEMAND1,21
...,...,...
752,EDIT2,10
2523,TWINS2,10
1466,MECHANIC2,9
2372,TAKEOFF2,9


Since the split is not proper we'll create a new split that has equal number of videos for val and test and give more videos to train. 

In [14]:
val_df = pd.read_csv(val_data)
test_df = pd.read_csv(test_data)

In [15]:
val_df.shape

(10304, 4)

In [16]:
test_df.shape

(32941, 4)

In [18]:
df = pd.concat([train_df, test_df, val_df])
df.shape

(83399, 4)

In [19]:
df.head()

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code
0,P1,15890366051589533-APPLE.mp4,APPLE,A_03_054
1,P1,35618482303951104-IMPOSSIBLE.mp4,IMPOSSIBLE,B_01_032
2,P1,6958143575951994-PARK.mp4,PARK,E_03_028
3,P1,8006032738002744-SOCCER 2.mp4,SOCCER2,F_03_032
4,P1,37542279833186454-STINK.mp4,STINK,H_01_064


In [22]:
col_name = 'num_videos'
frequency = df.groupby('Gloss').size().reset_index(name=col_name).sort_values(col_name, ascending = False)
frequency

Unnamed: 0,Gloss,num_videos
670,DOG1,45
156,BASKETBALL1,44
2643,WHATFOR1,43
189,BELT1,40
582,DARK1,39
...,...,...
1058,GUESS2,22
2530,TYPE2,22
468,CLOUD2,22
1363,LETTUCE2,21


In [30]:
num_of_signs = 100
list_to_str = lambda li, join_by = ', ': join_by.join(li)
top_signs = frequency.head(num_of_signs).sort_values(by='Gloss')
list_to_str(top_signs.Gloss.to_list())

'AXE1, BACKPACK1, BASKETBALL1, BEE1, BELIEVE1, BELT1, BITE1, BLOOD, BORROW, BOTTLE, BOWL, BOXING, BOY, BRAIDS, BRAVE, BREAKFAST1, CALENDAR1, CANCEL1, CANCER1, CASTLE4, CATEGORY, CATHOLIC, CEMETERY, CHAIN, CHANNEL, CHARACTER, CHASE, CHEEK, CHEESEGRATER, CHOCOLATE, CHRISTMAS1, CIGARETTE, CITY1, CLEAR, CLOSE, CLOUD1, CONFUSED1, DARK1, DEAF1, DECIDE1, DEMAND1, DEVELOP1, DINNER1, DOG1, DOWNSIZE1, DRAG1, EAT1, EDIT1, ELEVATOR1, FINE1, FOREIGNER1, GIFT, GREECE, GREEN, GUESS1, HALLOWEEN1, HAMMER, HOSPITAL1, HURDLE/TRIP1, LETTUCE1, LOCK1, LUNCH1, MECHANIC1, MICROSCOPE1, MOVIE1, NIGHT1, NOON1, PARTY1, PATIENT2, RECENT1, RESEARCH1, RIVER1, ROCKINGCHAIR1, SAME2, SANDWICH2, SCARED, SERVE1, SEW, SHAVE1, SHINY, SHOP2, SINK, SKATE, SPECIAL1, STEAL, STICKY, STOMACH, STRANGE, SURPRISE, SUSPECT, TAKEOFF1, TEAM, TEMPTATION, TEXT, THAT, THEY1, THIRD1, TWINS1, TYPE1, WHATFOR1'

In [42]:
strnums = list(map(str, range(0,10)))
clean_gloss = lambda x: x[:-1] if x[-1] in strnums else x
top_100_signs_cleaned = [clean_gloss(g) for g in top_signs.Gloss.to_list()]
top_100_signs = top_signs.Gloss.to_list()
print(list_to_str(top_100_signs))
print()
print(list_to_str(top_100_signs_cleaned))

AXE1, BACKPACK1, BASKETBALL1, BEE1, BELIEVE1, BELT1, BITE1, BLOOD, BORROW, BOTTLE, BOWL, BOXING, BOY, BRAIDS, BRAVE, BREAKFAST1, CALENDAR1, CANCEL1, CANCER1, CASTLE4, CATEGORY, CATHOLIC, CEMETERY, CHAIN, CHANNEL, CHARACTER, CHASE, CHEEK, CHEESEGRATER, CHOCOLATE, CHRISTMAS1, CIGARETTE, CITY1, CLEAR, CLOSE, CLOUD1, CONFUSED1, DARK1, DEAF1, DECIDE1, DEMAND1, DEVELOP1, DINNER1, DOG1, DOWNSIZE1, DRAG1, EAT1, EDIT1, ELEVATOR1, FINE1, FOREIGNER1, GIFT, GREECE, GREEN, GUESS1, HALLOWEEN1, HAMMER, HOSPITAL1, HURDLE/TRIP1, LETTUCE1, LOCK1, LUNCH1, MECHANIC1, MICROSCOPE1, MOVIE1, NIGHT1, NOON1, PARTY1, PATIENT2, RECENT1, RESEARCH1, RIVER1, ROCKINGCHAIR1, SAME2, SANDWICH2, SCARED, SERVE1, SEW, SHAVE1, SHINY, SHOP2, SINK, SKATE, SPECIAL1, STEAL, STICKY, STOMACH, STRANGE, SURPRISE, SUSPECT, TAKEOFF1, TEAM, TEMPTATION, TEXT, THAT, THEY1, THIRD1, TWINS1, TYPE1, WHATFOR1

AXE, BACKPACK, BASKETBALL, BEE, BELIEVE, BELT, BITE, BLOOD, BORROW, BOTTLE, BOWL, BOXING, BOY, BRAIDS, BRAVE, BREAKFAST, CALENDAR, CA

In [43]:
top_signs_df = df[df.Gloss.isin(top_100_signs)]
top_signs_df

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code
18,P1,4520498201410337-BACKPACK.mp4,BACKPACK1,G_03_091
33,P1,44939873429204336-CEMETERY.mp4,CEMETERY,H_01_037
77,P1,7809503445047001-BREAKFAST.mp4,BREAKFAST1,J_01_053
95,P1,49288220098651747-DOG.mp4,DOG1,A_01_056
120,P40,8875256912597131-BACKPACK.mp4,BACKPACK1,G_03_091
...,...,...,...,...
10221,P26,2632949274164049-CHEESE GRATER.mp4,CHEESEGRATER,F_03_064
10230,P26,4672262692116116-CHEEK.mp4,CHEEK,G_03_052
10232,P26,5658562793078117-CLOSE.mp4,CLOSE,G_03_084
10252,P26,9474917526716478-BASKETBALL.mp4,BASKETBALL1,A_03_045


In [48]:
print('on average :' , top_signs_df.shape[0] / 100, 'videos per Gloss.')

on average : 35.44 videos per Gloss.


In [51]:
top_signs_df.groupby('Participant ID').size().sort_values(ascending = False)

Participant ID
P40    185
P35    178
P6     177
P37    173
P42    170
P9     133
P18    121
P47    121
P15    115
P48    114
P49    110
P31    110
P11    110
P50    109
P33    108
P22    107
P39    106
P17    105
P27    105
P21    105
P26    103
P52    100
P29     98
P30     83
P16     77
P14     76
P7      76
P51     72
P12     55
P43     43
P36     34
P5      28
P23     21
P20     20
P34     17
P4      17
P46     12
P28     11
P25      8
P38      6
P2       5
P1       4
P3       4
P24      4
P32      4
P10      3
P41      1
dtype: int64

In [55]:
top_signs_df.to_csv('top_100_signs.csv', index = False)

In [57]:
vid_fname = 'videos'
os.makedirs(vid_fname)

In [61]:
for _, row in tqdm(top_signs_df.iterrows(), desc = 'Copying files from input folder to output folder'):
    vname = row['Video file']
    current_vpath = os.path.join(videos, vname)
    save_path = os.path.join(vid_fname, vname)
    if os.path.exists(current_vpath):
        shutil.copy(current_vpath, save_path)

Copying files from input folder to output folder: 3544it [01:14, 47.44it/s]


In [62]:
shutil.make_archive('top100_videos', 'zip', vid_fname)

'/kaggle/working/top100_videos.zip'

##### Local 

In [1]:
import os 
import pandas as pd 

In [4]:
dataset_root = os.path.join('datasets', 'ASL-Citizen')
video_root = os.path.join(dataset_root, 'top100_videos')
df_path = os.path.join(dataset_root, 'top_100_signs.csv')
df = pd.read_csv(df_path)
df.head()

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code
0,P1,4520498201410337-BACKPACK.mp4,BACKPACK1,G_03_091
1,P1,44939873429204336-CEMETERY.mp4,CEMETERY,H_01_037
2,P1,7809503445047001-BREAKFAST.mp4,BREAKFAST1,J_01_053
3,P1,49288220098651747-DOG.mp4,DOG1,A_01_056
4,P40,8875256912597131-BACKPACK.mp4,BACKPACK1,G_03_091


In [5]:
strnums = list(map(str, range(0,10)))
clean_gloss = lambda x: x[:-1] if x[-1] in strnums else x
df['Gloss'] = df['Gloss'].apply(clean_gloss)
df.head()

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code
0,P1,4520498201410337-BACKPACK.mp4,BACKPACK,G_03_091
1,P1,44939873429204336-CEMETERY.mp4,CEMETERY,H_01_037
2,P1,7809503445047001-BREAKFAST.mp4,BREAKFAST,J_01_053
3,P1,49288220098651747-DOG.mp4,DOG,A_01_056
4,P40,8875256912597131-BACKPACK.mp4,BACKPACK,G_03_091


In [6]:
df.to_csv('top_100_signs_cleaned.csv', index = False)