# RepCount dataset processing

This notebook cleans the RepCount annotations and writes the merged result to a new CSV file named `annotation.csv` in the dataset root (`../datasets/RepCount/`).

Advice: Don't trust the original annotations.

Known errors:
1. Video missing frames. (One video in bench_pressing)
2. Annotation of repetition is incorrect. (Many rows in bench_pressing)
3. Column `type` has typos.
4. Repetition unit is incorrect: `stu6_68.mp4` is annotated in seconds rather than frame indices.

In [1]:
import pandas as pd
import numpy as np
import os
from os.path import join as osj
from pathlib import Path
# Root directory of the RepCount dataset, relative to this notebook.
base = os.path.join('../datasets/RepCount/')

In [2]:
# Paths of the three split files that live under <base>/annotation/.
train_csv, test_csv, val_csv = (
    os.path.join(base, f'annotation/{split_name}.csv')
    for split_name in ('train', 'test', 'val')
)

# The first CSV column is the row index.
train_df, test_df, val_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv, test_csv, val_csv)
)

split_sizes = [len(train_df), len(test_df), len(val_df)]
print(*split_sizes)
print('total:', sum(split_sizes))


# Only the training split is filtered for rows without a repetition count.
train_df.dropna(subset=['count'], inplace=True)
train_df.head()


758 152 131
total: 1041


Unnamed: 0,type,name,count,L1,L2,L3,L4,L5,L6,L7,...,L293,L294,L295,L296,L297,L298,L299,L300,L301,L302
0,frontraise,train951.mp4,4.0,6.0,72.0,72.0,132.0,132.0,204.0,204.0,...,,,,,,,,,,
1,frontraise,train952.mp4,10.0,13.0,62.0,62.0,103.0,103.0,126.0,126.0,...,,,,,,,,,,
2,pullups,test1463.mp4,7.0,21.0,60.0,60.0,98.0,98.0,132.0,132.0,...,,,,,,,,,,
3,squant,test2340.mp4,4.0,0.0,86.0,86.0,155.0,155.0,218.0,219.0,...,,,,,,,,,,
4,front_raise,stu5_11.mp4,6.0,104.0,179.0,179.0,255.0,255.0,330.0,426.0,...,,,,,,,,,,


## CLEAN

In [3]:
# stu2_48.mp4 is mislabelled in the original annotation; it is a situp video.
is_stu2_48 = train_df['name'] == 'stu2_48.mp4'
train_df.loc[is_stu2_48, 'type'] = 'situp'

In [4]:
def get_reps(df, name):
    """Return the annotated repetition boundaries of one video.

    Args:
        df: Annotation frame whose first three positional columns are
            ``type``, ``name`` and ``count``; every later column holds one
            repetition boundary.
        name: Video file name to look up in the ``name`` column.

    Returns:
        list: The non-null boundary values of the first matching row, in
        column order.

    Raises:
        IndexError: If no row in ``df`` has the given ``name``.
    """
    matches = df[df['name'] == name]
    if len(matches) == 0:
        # Same exception type as plain positional indexing, but with a
        # message that says which video was missing.
        raise IndexError(f'no annotation row named {name!r}')
    # Positions 0-2 are type/name/count; boundaries start at position 3.
    return [r for r in matches.values[0][3:] if pd.notnull(r)]

# Spot-check one video: number of boundaries first, then the boundaries.
reps = get_reps(test_df, 'train1615.mp4')
print(len(reps), reps, sep='\n')

20
[0.0, 25.0, 27.0, 57.0, 58.0, 85.0, 87.0, 117.0, 118.0, 148.0, 149.0, 179.0, 180.0, 208.0, 208.0, 239.0, 239.0, 267.0, 268.0, 297.0]


In [5]:
# stu1_10.mp4: the annotation runs past the frames that are actually
# available (1841 frames expected, only 1681 in the rawframe dir) because
# the video file is broken. Exclude it from the training split.
keep_mask = ~train_df['name'].eq('stu1_10.mp4')
train_df = train_df[keep_mask]

In [6]:
# Show the first annotated boundaries of stu6_5.mp4 before removing it.
# NOTE(review): presumably dropped because of the inconsistent boundaries
# visible in the printed row - confirm.
print(train_df[train_df['name'] == 'stu6_5.mp4'].values[0][3:28])

# Values in `name` carry the '.mp4' extension, so the previous comparison
# against the bare 'stu4_3' never matched and that video was silently kept.
# NOTE(review): if stu4_3.mp4 is in the training split, this removes one
# more row than before and the counts printed further down will change.
train_df = train_df[~train_df['name'].isin(['stu6_5.mp4', 'stu4_3.mp4'])]

[63.0 163.0 163.0 306.0 534.0 598.0 598.0 675.0 675.0 762.0 856.0 957.0
 1020.0 1105.0 1105.0 1186.0 1186.0 1186.0 1186.0 1303.0 1303.0 1398.0
 1399.0 1550.0 nan]


In [7]:
# stu4_5.mp4: the annotation repeats many identical frame indices, and the
# clip shows situps although it is labelled bench press. Print a slice of
# the row for the record, then drop the video from the test split.
is_stu4_5 = test_df['name'] == 'stu4_5.mp4'
print(test_df[is_stu4_5].values[0][30:50])
test_df = test_df[~is_stu4_5]

[342.0 343.0 362.0 362.0 379.0 379.0 403.0 403.0 418.0 418.0 446.0 446.0
 464.0 464.0 482.0 482.0 504.0 505.0 526.0 527.0]


In [8]:
# stu6_68.mp4 is annotated in seconds instead of frame indices. It is
# dropped here rather than converted to frames.
stu6_68_rows = test_df[test_df['name'] == 'stu6_68.mp4']
print(stu6_68_rows.values[0][0:20])
test_df = test_df.drop(index=stu6_68_rows.index)

['squat' 'stu6_68.mp4' 6 4.0 7.0 8.0 11.0 11.0 14.0 14.0 18.0 18.0 21.0
 23.0 26.0 nan nan nan nan nan]


## Original YouTube data

In [9]:
# Spreadsheet that maps the renamed student clips to their source videos.
original_data = Path(base) / 'original_data' / 'filename_mapping.xlsx'
ori = pd.read_excel(original_data, index_col=0)
print(len(ori))
# Example: every clip cut from one source video.
ori.loc[ori['vid'] == 'REjznoKN8Q8'].head(10)

800


Unnamed: 0,class,vid_st,vid,stuid,rename
693,squat,REjznoKN8Q8_143.mp4,REjznoKN8Q8,8,stu8_69
694,squat,REjznoKN8Q8_188.mp4,REjznoKN8Q8,3,stu3_67
779,squat,REjznoKN8Q8_188.mp4,REjznoKN8Q8,8,stu8_75


In [10]:
def min2sec(t):
    """Convert a ``m:s`` or ``h:m:s`` timestamp string to whole seconds.

    Args:
        t: Timestamp such as ``'1:30'`` or ``'1:02:03'``.

    Returns:
        int: Total number of seconds.

    Raises:
        ValueError: If ``t`` does not have two or three ``:``-separated
            integer fields.
    """
    parts = t.strip().split(':')
    if len(parts) not in (2, 3):
        raise ValueError(f'expected m:s or h:m:s, got {t!r}')
    seconds = 0
    # Each field is worth 60 times the one to its right.
    for part in parts:
        seconds = seconds * 60 + int(part)
    return seconds

# Maps a clip file name '<vid>_<start_sec>.mp4' to its end time in seconds.
d_ = dict()
download_link_dir = Path(base, 'original_data/DownloadLink')
# os.listdir returns entries in arbitrary order. Sorting makes the result
# reproducible when the same clip is listed in more than one link file
# (the entry from the last file in sorted order wins).
for fname in sorted(os.listdir(download_link_dir)):
    with open(Path(download_link_dir, fname), 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            # Each line is '<vid> <start> <end>'; times are converted by min2sec.
            vid, st, et = fields
            st = min2sec(st)
            et = min2sec(et)
            vid_st = f'{vid}_{st}.mp4'
            d_[vid_st] = et

# Attach start/end seconds to every mapping row whose clip file is known.
for row in ori.itertuples():
    vid_st = row.vid_st
    if vid_st in d_:
        ori.at[row.Index, 'et'] = int(d_[vid_st])
        # The start second is the text between the last '_' and '.mp4'.
        # Splitting on the last '_' does not depend on the id length.
        ori.at[row.Index, 'st'] = int(vid_st[:-4].rsplit('_', 1)[1])

In [11]:
# Build the `name` join key (file name with extension) from `rename`,
# then remove the `class` column from the mapping.
ori['name'] = [stem + '.mp4' for stem in ori['rename']]
del ori['class']
# Preview the rows that received an end time.
ori.loc[ori['et'].notna()].head()

Unnamed: 0,vid_st,vid,stuid,rename,et,st,name
0,1pzpaYrhdOo_13.mp4,1pzpaYrhdOo,1,stu1_0,22.0,13.0,stu1_0.mp4
1,1pzpaYrhdOo_93.mp4,1pzpaYrhdOo,4,stu4_0,105.0,93.0,stu4_0.mp4
2,7wDx6mZDxA8_0.mp4,7wDx6mZDxA8,3,stu3_0,12.0,0.0,stu3_0.mp4
3,DcRj7jtKhk4_52.mp4,DcRj7jtKhk4,10,stu10_0,62.0,52.0,stu10_0.mp4
4,FyGBSs1X1qM_105.mp4,FyGBSs1X1qM,3,stu3_1,114.0,105.0,stu3_1.mp4


In [12]:
# reps to one column

def collapse_reps(df, n_meta_cols=3):
    """Fold the per-boundary columns of ``df`` into one ``reps`` string column.

    The first ``n_meta_cols`` columns are kept unchanged. In every row, the
    non-null values of the remaining columns are cast to ``int`` and joined
    with single spaces; those columns are then replaced by the new ``reps``
    column. A frame that already has a ``reps`` column is returned as is,
    so this cell can be re-run without raising.

    Args:
        df: Annotation frame with the metadata columns first.
        n_meta_cols: Number of leading columns that are not boundaries.

    Returns:
        pandas.DataFrame: The metadata columns followed by ``reps``.
    """
    if 'reps' in df.columns:
        return df
    rep_cols = df.columns[n_meta_cols:]
    reps = df.apply(
        lambda row: ' '.join(str(int(v)) for v in row.values[n_meta_cols:] if pd.notna(v)),
        axis=1,
    )
    return df.drop(columns=rep_cols).assign(reps=reps)


train_df = collapse_reps(train_df)
val_df = collapse_reps(val_df)
test_df = collapse_reps(test_df)

# Sanity check: show the collapsed boundaries of the second training row.
train_df.at[train_df.index[1], 'reps']

'13 62 62 103 103 126 126 146 146 171 171 194 194 215 215 239 239 260 260 281'

In [13]:
# add YouTube ID and start time to annotation
def func(name):
    """Look up the source video id and start second for a renamed clip.

    Reads the notebook-level ``ori`` mapping frame. ``name`` is cut at its
    first ``.`` and compared with the ``rename`` column.

    Returns:
        ``None`` when no row matches or any matching row has a null ``vid``.
        Otherwise a ``(vid, start)`` tuple taken from the first match's
        ``vid_st`` value: its first 11 characters, and the integer between
        position 12 and the 4-character extension.
    """
    stem = name.split('.')[0]
    matches = ori.loc[ori['rename'] == stem]
    if matches.empty or matches['vid'].isna().any():
        return None

    clip_file = matches['vid_st'].values[0]
    youtube_id, start_text = clip_file[:11], clip_file[12:-4]
    return youtube_id, int(start_text)

# Left-join the mapping onto each split by `name`; rows without a mapping
# entry keep NaN in the mapping columns.
train_df, val_df, test_df = (
    split_df.merge(ori, how='left', on='name', suffixes=[None, None])
    for split_df in (train_df, val_df, test_df)
)

In [14]:
# Rows per split (train, val, test) that received a `vid` from the mapping.
print(*(len(split_df[split_df['vid'].notna()]) for split_df in (train_df, val_df, test_df)))

506 91 107


## Fixing the data
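The `type` column uses several spellings for the same exercise (e.g. `pushups` vs. `push_up`).
The next cell normalizes them to a single name per class.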

In [15]:
# Snapshots of the training rows under each push-up / pull-up spelling,
# taken before the labels are normalized.
pushups, pullups, push_up, pull_up = (
    train_df[train_df['type'] == spelling]
    for spelling in ('pushups', 'pullups', 'push_up', 'pull_up')
)

# Misspelled or alternative class label -> canonical class label.
type_fixes = {
    'pushups': 'push_up',
    'pullups': 'pull_up',
    'squant': 'squat',
    'jumpjacks': 'jump_jack',
    'benchpressing': 'bench_pressing',
    'frontraise': 'front_raise',
}

# Apply the same normalization to train, val and test.
for split_df in (train_df, val_df, test_df):
    split_df['type'] = split_df['type'].replace(type_fixes)



In [16]:
classes = train_df['type'].unique()


def _count_per_class(frame, column_name):
    """Count non-null ``count`` entries per ``type`` as a one-column frame."""
    return frame.groupby('type')[['count']].count().rename(columns={'count': column_name})


df1 = _count_per_class(train_df, 'count_train')
df2 = _count_per_class(val_df, 'count_val')
df3 = _count_per_class(test_df, 'count_test')

# One row per class, one column per split. A class missing from a split
# shows up as NaN after the concat and is reported as 0.
df = (
    pd.concat([df1, df2, df3], axis=1)
    .rename_axis('type')
    .fillna(0)
    .astype(int)
    .reset_index()
)
df.sort_values('count_train', ascending=False)

Unnamed: 0,type,count_train,count_val,count_test
9,squat,101,16,17
6,pull_up,94,14,19
8,situp,94,18,20
2,front_raise,93,19,18
1,bench_pressing,91,13,18
7,push_up,87,18,16
3,jump_jack,76,15,26
5,pommelhorse,69,15,15
4,others,37,0,1
0,battle_rope,13,3,0


### Save all to new csv

Remove the classes `battle_rope`, `others`, and `pommelhorse`.

In [17]:
# Tag every row with the split it came from, then stack the three frames.
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    split_df['split'] = split_name

# Important: reset_index gives every row its own label, so later
# label-based operations cannot hit rows from another split.
all_df = pd.concat([train_df, val_df, test_df], axis=0).reset_index()

In [18]:
# Remove the classes that are excluded from the final annotation file.
excluded_types = ['battle_rope', 'others', 'pommelhorse']
to_drop = all_df.index[all_df['type'].isin(excluded_types)]
all_df = all_df.drop(index=to_drop)
all_df = all_df.rename(columns={'type': 'class_', 'st': 'start', 'et': 'end'})
# Final column selection and order of annotation.csv.
keep_cols = ['class_', 'split', 'name', 'vid', 'start', 'end', 'count', 'reps']

all_df = all_df[keep_cols]

all_df.loc[all_df['split'] == 'train']

Unnamed: 0,class_,split,name,vid,start,end,count,reps
0,front_raise,train,train951.mp4,,,,4.0,6 72 72 132 132 204 204 271
1,front_raise,train,train952.mp4,,,,10.0,13 62 62 103 103 126 126 146 146 171 171 194 1...
2,pull_up,train,test1463.mp4,,,,7.0,21 60 60 98 98 132 132 172 172 210 210 246 246...
3,squat,train,test2340.mp4,,,,4.0,0 86 86 155 155 218 219 285
4,front_raise,train,stu5_11.mp4,0_b43RbenY8,34.0,70.0,6.0,104 179 179 255 255 330 426 501 703 789 790 853
...,...,...,...,...,...,...,...,...
648,push_up,train,stu1_44.mp4,3Gz_k31lazA,85.0,115.0,6.0,172 236 236 303 303 396 396 478 478 557 557 638
649,squat,train,stu9_66.mp4,Eir-KRad0zw,120.0,180.0,35.0,9 55 55 103 103 150 150 197 197 248 248 297 29...
650,squat,train,train3912.mp4,,,,4.0,5 87 87 137 137 202 203 249
651,push_up,train,stu6_40.mp4,h5gWnuYEceY,32.0,85.0,8.0,59 151 155 262 265 364 364 463 891 978 981 108...


In [19]:
# Number of rows left in the validation and test splits.
for split_name in ('val', 'test'):
    print(split_name, len(all_df[all_df['split'] == split_name]))

val 113
test 134


In [20]:
# Write the cleaned annotation without the row index, then report the
# row count and the last row.
annotation_path = Path(base, 'annotation.csv')
all_df.to_csv(annotation_path, index=False)
print(len(all_df))
print(all_df.tail(1))

883
       class_ split         name          vid  start    end  count  \
1020  pull_up  test  stu1_36.mp4  ok0F_-opRFo  104.0  140.0    9.0   

                                                   reps  
1020  52 109 109 165 165 245 245 315 840 880 880 935...  


In [21]:
# Read the file back to confirm it holds the same number of rows.
load_df = pd.read_csv(Path(base) / 'annotation.csv')
print(len(load_df))

883


In [22]:
# Boolean mask of the training rows.
all_df['split'].eq('train')

0        True
1        True
2        True
3        True
4        True
        ...  
1016    False
1017    False
1018    False
1019    False
1020    False
Name: split, Length: 883, dtype: bool

## Add FPS column

In [24]:
from torchvision.io import VideoReader

# NOTE(review): the videos are read from a home-directory path that is
# independent of `base` - confirm this is the intended location.
data_root = os.path.expanduser('~/data/RepCount/videos')
fps_dict = {}
for row in all_df.itertuples():
    # Videos are looked up as <data_root>/<split>/<name>.
    path = os.path.join(data_root, row.split, row.name)
    vid = VideoReader(path)
    meta = vid.get_metadata()
    # The metadata holds a list of fps values for 'video'; take the first.
    fps = meta['video']['fps'][0]
    fps_dict[row.name] = fps

all_df['fps'] = all_df['name'].map(fps_dict)
# index=False matches the first save of annotation.csv. Without it the
# row index is written as an extra unnamed leading column, so the final
# file would have a different layout from the one written and re-read above.
all_df.to_csv(Path(base, 'annotation.csv'), index=False)

print(all_df.head(1))

        class_  split          name  vid  start  end  count  \
0  front_raise  train  train951.mp4  NaN    NaN  NaN    4.0   

                          reps   fps  
0  6 72 72 132 132 204 204 271  30.0  
