In [1]:
import os
import pandas as pd 
import cv2

In [38]:
def get_label_gloss_mappings(df):
    """
    Creates two dictionaries:
    - label_to_gloss: maps label index to gloss name
    - gloss_to_label: maps gloss name to label index

    Args:
        df (pd.DataFrame): DataFrame with 'gloss' and 'label' columns.

    Returns:
        dict, dict: (label_to_gloss, gloss_to_label)
    """
    label_to_gloss = dict(df[['label', 'Gloss']].drop_duplicates().sort_values('label').values)
    gloss_to_label = dict(df[['Gloss', 'label']].drop_duplicates().sort_values('label').values)
    return label_to_gloss, gloss_to_label

In [48]:
dataset_root = os.path.join('datasets', 'ASL-Citizen')
ttv = ['train.csv', 'test.csv', 'val.csv']
train, test, val = [pd.read_csv(os.path.join(dataset_root, i)) for i in ttv]

In [49]:
test.head()

Unnamed: 0,Participant ID,Video file,Gloss,ASL-LEX Code,label,fpath,prep_fpath,kpts_fpath
0,P48,46041911247233713-HAMMER.mp4,HAMMER,C_02_021,56,datasets\ASL-Citizen\top100_videos\46041911247233713-HAMMER.mp4,datasets\ASL-Citizen\preprocess_videos\46041911247233713-HAMMER.npy,datasets\ASL-Citizen\keypoints\46041911247233713-HAMMER.json
1,P7,014770041748456197-MOVIE.mp4,MOVIE,B_03_077,64,datasets\ASL-Citizen\top100_videos\014770041748456197-MOVIE.mp4,datasets\ASL-Citizen\preprocess_videos\014770041748456197-MOVIE.npy,datasets\ASL-Citizen\keypoints\014770041748456197-MOVIE.json
2,P42,8372181221997397-SERVE 1.mp4,SERVE,E_02_004,76,datasets\ASL-Citizen\top100_videos\8372181221997397-SERVE 1.mp4,datasets\ASL-Citizen\preprocess_videos\8372181221997397-SERVE 1.npy,datasets\ASL-Citizen\keypoints\8372181221997397-SERVE 1.json
3,P29,6422461467297818-CIGARETTE.mp4,CIGARETTE,C_01_059,31,datasets\ASL-Citizen\top100_videos\6422461467297818-CIGARETTE.mp4,datasets\ASL-Citizen\preprocess_videos\6422461467297818-CIGARETTE.npy,datasets\ASL-Citizen\keypoints\6422461467297818-CIGARETTE.json
4,P18,04600046798432267-THAT.mp4,THAT,C_03_059,94,datasets\ASL-Citizen\top100_videos\04600046798432267-THAT.mp4,datasets\ASL-Citizen\preprocess_videos\04600046798432267-THAT.npy,datasets\ASL-Citizen\keypoints\04600046798432267-THAT.json


In [50]:
from data_loader import MSASLKeypointsDataset16Frames

In [51]:
get_fname = lambda x: os.path.basename(x).split('.')[0]
add_ext = lambda x, ext = '.json': x + ext

train_fnames = train['fpath'].apply(get_fname).apply(add_ext)
val_fnames = val['fpath'].apply(get_fname).apply(add_ext)
test_fnames = test['fpath'].apply(get_fname).apply(add_ext)
root_dir = os.path.join(dataset_root, 'top100_videos_16frames_kpts')

In [52]:
train_fnames[0]

'15252109051698337-NOON.json'

In [53]:
train_ds = MSASLKeypointsDataset16Frames(
    root_dir,
    train_fnames,
    train.label.to_list()
    )

valid_ds = MSASLKeypointsDataset16Frames(
    root_dir,
    val_fnames,
    val.label.to_list()
)

test_ds = MSASLKeypointsDataset16Frames(
    root_dir,
    test_fnames,
    test.label.to_list()
)

In [7]:
train_ds[0][0]

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.2589e-01,
          1.1696e+00, -1.9779e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  2.8571e-01,
          1.0402e+00, -2.1250e+00],
        [ 2.8571e-01,  9.6429e-01,  7.6848e-08,  ...,  2.9911e-01,
          9.0625e-01, -1.7657e+00],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.4821e-01,
          9.4196e-01, -2.1277e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.4375e-01,
          9.5536e-01, -2.1906e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  3.3036e-01,
          9.5089e-01, -2.1455e+00]])

In [8]:
from fastai.data.core import DataLoaders
from fastai.callback.tensorboard import TensorBoardCallback
from fastai.callback.tracker import SaveModelCallback

In [9]:
import torch
import torch.nn as nn
from fastai.vision.all import *

class BiGRUWrapper(nn.Module):
    def __init__(self, input_size=144, hidden_size=256, num_layers=1, num_classes=100, bidirectional=True, dropout=0.3):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if num_layers > 1 else 0
        )
        direction_factor = 2 if bidirectional else 1
        self.head = nn.Sequential(
            nn.Linear(hidden_size * direction_factor, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):  # x: (B, T, F) = (batch, frames, keypoints*3)
        out, _ = self.gru(x)
        return self.head(out[:, -1, :])


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
logs_dir = os.path.join('logs')
model_name = 'gru_bi'
os.makedirs('models', exist_ok=True)
os.makedirs(logs_dir, exist_ok=True)
tb_logger = TensorBoardCallback(log_dir=logs_dir, trace_model=False, log_preds=False)
save_model = SaveModelCallback(monitor='valid_loss', fname= model_name, with_opt=True)

In [18]:
model = BiGRUWrapper(input_size=144, num_classes=100).cuda()
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=64, shuffle=True).cuda()
learn = Learner(
    dls,  
    model,
    loss_func=CrossEntropyLossFlat(),
    metrics=accuracy,
    cbs=[tb_logger, save_model],
).to_fp16()

get_available_device_name = lambda learn : next(learn.model.parameters()).device
get_available_dtype = lambda learn : next(learn.model.parameters()).dtype

torch.backends.cuda.matmul.allow_tf32 = True
learn.create_opt()
# learn.summary()

In [19]:
print(get_available_device_name(learn))
print(get_available_dtype(learn))  # Should show torch.float32
print(learn.opt)  # Should show "MixedPrecision" in callback list
print(learn.dls.device)
print(dls.one_batch()[0].device)

cuda:0
torch.float32
<fastai.optimizer.Optimizer object at 0x00000218939FD3D0>
cuda:0
cuda:0


In [20]:
num_epoch = 100
lr_max = 1e-3
torch.cuda.empty_cache()
learn.fit_one_cycle(num_epoch, lr_max)

epoch,train_loss,valid_loss,accuracy,time
0,4.607244,4.60699,0.014706,00:02
1,4.607155,4.606916,0.014706,00:02
2,4.606907,4.606805,0.009804,00:02
3,4.607162,4.606573,0.009804,00:02
4,4.606741,4.606383,0.014706,00:02
5,4.606306,4.605914,0.014706,00:02
6,4.605752,4.604316,0.014706,00:02
7,4.593834,4.536,0.02451,00:02
8,4.484616,4.318338,0.02451,00:02
9,4.354177,4.212749,0.02451,00:02


Better model found at epoch 0 with valid_loss value: 4.606989860534668.
Better model found at epoch 1 with valid_loss value: 4.606916427612305.
Better model found at epoch 2 with valid_loss value: 4.606805324554443.
Better model found at epoch 3 with valid_loss value: 4.60657262802124.
Better model found at epoch 4 with valid_loss value: 4.606383323669434.
Better model found at epoch 5 with valid_loss value: 4.605914115905762.
Better model found at epoch 6 with valid_loss value: 4.604316234588623.
Better model found at epoch 7 with valid_loss value: 4.5360002517700195.
Better model found at epoch 8 with valid_loss value: 4.318337917327881.
Better model found at epoch 9 with valid_loss value: 4.212749004364014.
Better model found at epoch 10 with valid_loss value: 4.203655242919922.
Better model found at epoch 11 with valid_loss value: 4.197585582733154.
Better model found at epoch 12 with valid_loss value: 4.178530216217041.
Better model found at epoch 13 with valid_loss value: 4.16988

The baseline model gave the highest accuracy of 48%

In [None]:
lbl_to_gloss,_ = get_label_gloss_mappings(train)

Validation

In [29]:
preds, lbl = learn.get_preds()
p, t = torch.argmax(preds, dim=1).tolist(), lbl.tolist()
inference = pd.DataFrame({'pred': p, 'true': t})
accuracy(preds, lbl)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


TensorBase(0.3775)

In [45]:
top_preds = inference[inference.pred == inference.true].groupby('true').size().reset_index(name = 'count').sort_values('count', ascending=False).head(10)
top_preds['gloss'] = top_preds['true'].apply(lambda x: lbl_to_gloss[x])
top_preds

Unnamed: 0,true,count,gloss
1,2,2,BASKETBALL
5,10,2,BOWL
4,5,2,BELT
6,12,2,BOY
21,38,2,DEAF
19,36,2,CONFUSED
28,49,2,FINE
18,35,2,CLOUD
58,95,2,THEY
57,94,2,THAT


Test

In [57]:
test_dl = learn.dls.test_dl(test_ds, shuffle=False)
preds, lbl = learn.get_preds(dl = test_dl)
p, t = torch.argmax(preds, dim=1).tolist(), lbl.tolist()
inference = pd.DataFrame({'pred': p, 'true': t})
accuracy(preds, lbl)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


TensorBase(0.3971)

In [60]:
top_preds = inference[inference.pred == inference.true].groupby('true').size().reset_index(name = 'count').sort_values('count', ascending=False).head(30)
top_preds['gloss'] = top_preds['true'].apply(lambda x: lbl_to_gloss[x])
top_preds

Unnamed: 0,true,count,gloss
1,2,2,BASKETBALL
4,7,2,BLOOD
3,5,2,BELT
12,22,2,CEMETERY
11,20,2,CATEGORY
9,18,2,CANCER
21,37,2,DARK
19,33,2,CLEAR
13,24,2,CHANNEL
44,74,2,SANDWICH


In [61]:
inference[inference.true == 2 ]

Unnamed: 0,pred,true
44,2,2
80,2,2


Train

In [63]:
preds, lbl = learn.get_preds(dl = dls)
p, t = torch.argmax(preds, dim=1).tolist(), lbl.tolist()
inference = pd.DataFrame({'pred': p, 'true': t})
accuracy(preds, lbl)

  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()
  self.autocast,self.learn.scaler,self.scales = autocast(dtype=dtype),GradScaler(**self.kwargs),L()


TensorBase(0.6218)

In [64]:
top_preds = inference[inference.pred == inference.true].groupby('true').size().reset_index(name = 'count').sort_values('count', ascending=False).head(30)
top_preds['gloss'] = top_preds['true'].apply(lambda x: lbl_to_gloss[x])
top_preds

Unnamed: 0,true,count,gloss
30,30,32,CHRISTMAS
5,5,32,BELT
44,44,32,DOWNSIZE
0,0,31,AXE
39,39,31,DECIDE
43,43,31,DOG
37,37,31,DARK
99,99,31,WHATFOR
35,35,31,CLOUD
14,14,28,BRAVE


In [67]:
27 / inference[inference.true == 2 ].shape[0]

0.6923076923076923

In [69]:
32 / inference[inference.true == 30 ].shape[0], inference[inference.true == 30 ].shape[0]

(0.9142857142857143, 35)

In [75]:
inference[(inference.pred == 30) & (inference.true != 30)].shape[0] 

19

In [76]:
inference[(inference.pred == 30) & (inference.true != 30)].shape[0] 

19

In [77]:
inference[(inference.pred == 2) & (inference.true != 2)].shape[0] 

5