In [1]:
# Jupiter  MacOS
BASE_DIR = "/Users/johnhanratty/ASLtest/asl-signs"  #"/Users/johnhanratty/ASLtest/asl-signs"
WORKING_DIR = "/Users/johnhanratty/ASLtest"
ARCHIVE_DIR = "/Users/johnhanratty/ASLtest"
MODEL_DIR = "/Users/johnhanratty/ASLtest/models"

# !pip install nb_black --quiet
# %load_ext lab_black

# Colab
# BASE_DIR = "/content/asl-signs"   #"/content/drive/MyDrive/GaggleSignLang/asl-signs"
# WORKING_DIR = "/content/asl-work"
# ARCHIVE_DIR = "/content/drive/MyDrive/GaggleSignLang"
# MODEL_DIR = "/content/drive/MyDrive/GaggleSignLang"
# !pip install nb_black --quiet
# print('-----ok')
# %load_ext nb_black

# KAGGLE
# BASE_DIR = "/kaggle/input/asl-signs"
# WORKING_DIR = "/kaggle/working"
# ARCHIVE_DIR = "/kaggle/working"
# MODEL_DIR  = "/kaggle/working"
# !pip install nb_black --quiet --root-user-action=ignore
# %load_ext lab_black

import os
import gc
import shutil
import time

import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import pickle
from random import seed, sample

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize

import warnings
warnings.filterwarnings(action='ignore')

LANDMARK_FILES_DIR = f'{BASE_DIR}/train_landmark_files'
TRAIN_FILE = f"{BASE_DIR}/train.csv"

FRAMES_OUT = 32 # 16
PTS_IN_FRAME = 345
DIMC = [0,1,2]
DIMS = len(DIMC)
WORKERS = 0   # dataoader work var  0 for MAC, 4 for online

print('done')

done


In [2]:
ROWS_PER_FRAME = 543  # number of landmarks per frame

def load_relevant_data_subset(pq_path):
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    n_frames = int(len(data) / ROWS_PER_FRAME)
    data = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return data.astype(np.float32)

In [7]:
# CNN TORCH FEATUREGEN MODEL 
ROWS_PER_FRAME = 543  # combined face, lefth, pose, righth
PR_PTS = [40, 44, 48, 52, 56, 60, 43, 46, 50, 54, 58]
SC_PTS = [40, 98, 102, 106, 110, 114, 97, 102, 106, 110, 114]
PO_PTS = [60, 73, 80, 81, 76, 77, 68, 69, 70, 71, 75, 74]
LI_PTS = [5, 0, 4, 8, 12, 16, 20, 24, 28, 32, 36]
PR_LEN = len(PR_PTS) * DIMS * 5  # 5 = number of aggregations e.g. max,min
SC_LEN = len(SC_PTS) * DIMS * 5
PO_LEN = len(PO_PTS) * DIMS * 5
LI_LEN = len(LI_PTS) * DIMS * 5
CNN_FEAT_LEN = PR_LEN + SC_LEN + PO_LEN + LI_LEN 

print("Feature Len", CNN_FEAT_LEN, PR_LEN, SC_LEN, PO_LEN, LI_LEN)

# FILTER FEATURES IN EACH FRAME  - FACE, POSE & HANDs
class FeatureGen(nn.Module):
    def __init__(self):
        super(FeatureGen, self).__init__()
        pass
    
    def forward(self, x):
        x = torch.tensor(x)
        
        # FILTER TO SPECIFIED FRAMES (FRAMES_OUT)
        seed(24)
        n_frames = x.size()[0]
        # Trim to # of frames to FRAMES_OUT
        if n_frames > FRAMES_OUT:
            idx = sorted((sample(range(0, n_frames), FRAMES_OUT)))
            x=x[idx]
        n_frames = x.size()[0]
        # FLATTENING ROWS BY TYPE and CONCATENATING TO ONE ROW PER FRAME 3D (XYZ)
        # INPUT NUMPY, TORCH OUTPUT

        # The Video contains n_frames each containing exactly ROWS_PER_FRAME (543)frames.
        # The frames in each are in order of feature type.
        # The rows conain x, y, z for a feature
        #   Video Format = [n_frames][543 frames][3 xyz coordinates]
        
        # Create views by data type (e.g. one point on hand) 
        # by selecting rows for each frame
        # face_x = x[:,:468,:].contiguous().view(-1, 468*3)
        lips_idx = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308, 95, 88, 178, 87, 14, 317, 402, 318, 324, 146, 91, 181, 84, 17, 314, 405, 321, 375]
        lips_x = x[:, lips_idx,:].contiguous().view(-1, len(lips_idx)*3)
        lefth_x = x[:,468:489,:].contiguous().view(-1, 21*3)
        pose_x = x[:,489:522,:].contiguous().view(-1, 33*3)
        righth_x = x[:,522:,:].contiguous().view(-1, 21*3)

        # Check for primary hand and if left hand swap and rotate hands
        if torch.isnan(lefth_x).sum() < torch.isnan(righth_x).sum():
            prime_x = lefth_x
            second_x = righth_x
        else:
            prime_x = righth_x.reshape(righth_x.size()[0], -1, DIMS)
            prime_x[:,:,0] = torch.from_numpy(np.subtract(np.nanmax(prime_x[:,:,0].numpy(), axis=1).reshape(-1,1),
                                    prime_x[:, :, 0].numpy()))
            prime_x = prime_x.reshape(prime_x.size()[0],-1)
            
            second_x = lefth_x.reshape(lefth_x.size()[0], -1, DIMS)
            second_x[:,:,0] = torch.from_numpy(np.subtract(np.nanmax(second_x[:,:,0].numpy(), axis=1).reshape(-1,1),
                                          second_x[:, :, 0].numpy()))
            second_x = second_x.reshape(second_x.size()[0],-1)
            
        
        # create video withfixed number of frames (FRAMES_OUT)
        # initialize with NoN so later operations can ignore them (e.g. nanmean()) 
        xfeat = torch.full([FRAMES_OUT, PTS_IN_FRAME], np.nan)
        
        # center frames
        offset = (FRAMES_OUT - n_frames) // 2  # center frames in output data in each frame in video
        
        # flatten types into one row per frame
        xfeat[offset:n_frames+offset,:] = torch.cat([lips_x, prime_x, pose_x, second_x], axis=1)  # concatenate types
        
        ############# CNN Specific
        ############# 
        
        def distDiff(ds, ref, pts):
            ds = ds.reshape(ds.shape[0],  -1, DIMS)
            d = torch.hstack([torch.from_numpy(np.nanmean(ds[:, pts, :].numpy(), axis=0)), 
                           torch.from_numpy( np.nanmedian(ds[:, pts, :].numpy(), axis=0)), 
                           torch.from_numpy(np.nanmax(ds[:, pts, :].numpy(), axis=0)), 
                           torch.from_numpy(np.nanmin(ds[:, pts, :].numpy(), axis=0)),
                           torch.from_numpy(np.nanvar(ds[:, pts, :].numpy(), axis=0))
                           ]) 
            d = d.reshape(1, -1) 
            # NORMALIZE
           # d = (d - np.nanmean(d, keepdims=True)) / np.nanstd(d, keepdims=True) # -1 to 1
            d = np.nan_to_num(d, copy=False)  # replace NaN after normalization
            return d
        
        d1 = distDiff(xfeat, 40, [40, 44, 48, 52, 56, 60, 43, 46, 50, 54, 58])
        d2 = distDiff(xfeat, 40, [40,98, 102, 106, 110, 114, 97, 102, 106, 110, 114])
        d3 = distDiff(xfeat, 60, [60, 73, 80, 81, 76, 77, 68, 69, 70, 71, 75, 74])
        d4 = distDiff(xfeat, 5,  [5,0, 4, 8, 12, 16, 20, 24, 28, 32, 36])
        
        return np.concatenate([d1,d2,d3,d4], axis=1)    


Feature Len 675 165 165 180 165


In [8]:
## PROCESS EACH ROW (ONE PARQUET PER ROW)
def convert_row(row):
    x = load_relevant_data_subset(os.path.join(BASE_DIR, row[1].path))
    x = feature_converter(torch.tensor(x))
    return x, row[1].label

## LOOP THROUGH PARQUET FILES LISTED IN TRAIN FILE
##  SAVE RESULTS 
def convert_and_save_data():
    label_map = json.load(open(f"{BASE_DIR}/sign_to_prediction_index_map.json", "r"))
    df = pd.read_csv(TRAIN_FILE)
    df['label'] = df['sign'].map(label_map)
    
    print("Convert&Save", df.shape)
    #### FOR TESTING #################
    #df = df[0:20]
    ##################################

    npdata = np.zeros((df.shape[0], CNN_FEAT_LEN))  

    nplabels = np.zeros(df.shape[0])
    
    results = map(convert_row, df.iterrows())
    for i, (x,y) in tqdm(enumerate(results), total=df.shape[0]):
            npdata[i,:] = x
            nplabels[i] = y
    return npdata, nplabels
 

feature_converter = FeatureGen()
datax, datay = convert_and_save_data()



Convert&Save (94477, 5)


100%|████████████████████████████████████| 94477/94477 [13:33<00:00, 116.08it/s]


In [None]:
# Save dataset
np.save(f"{ARCHIVE_DIR}/cnn_data{FRAMES_OUT}.npy", datax)
np.save(f"{ARCHIVE_DIR}/cnn_labels.npy", datay)
 

In [11]:
#MODEL
### NEW SEPARATED INPUTS
class ASLData(Dataset):
    def __init__(self,datax,datay):
        self.datax = datax
        self.datay = datay

    def __getitem__(self, index):
        return self.datax[index, :], self.datay[index]

    def __len__(self):
        return len(self.datay)

# https://towardsdatascience.com/pytorch-tabular-multiclass-classification-9f8211a123ab
class ASLModel(nn.Module):
    def __init__(self, p):
        super(ASLModel, self).__init__()
        
        # DATA in [1, CNN_FEAT_LEN] per video
        self.flatten = nn.Flatten()
        self.dropout = nn.Dropout(p)
        self.relu = nn.ReLU()

        L1OUT = 512  #1024 was ok
        L2OUT = 512

        self.layer_ph = nn.Linear(PR_LEN, L1OUT)
        self.batchnorm_ph = nn.BatchNorm1d(L1OUT)
        
        self.layer_sh = nn.Linear(SC_LEN, L1OUT)
        self.batchnorm_sh = nn.BatchNorm1d(L1OUT)
 
        self.layer_po = nn.Linear(PO_LEN, L1OUT)
        self.batchnorm_po = nn.BatchNorm1d(L1OUT)
 
        self.layer_li = nn.Linear(LI_LEN, L1OUT)
        self.batchnorm_li = nn.BatchNorm1d(L1OUT) 
 
        self.layer1 = nn.Linear(4*L1OUT, L2OUT)
        self.batchnorm1 = nn.BatchNorm1d(L2OUT)

        self.layerFC = nn.Linear(L2OUT, 250)
        self.softmax = nn.Softmax()
 
        
    def forward(self, x):
        phand = x[:,0:PR_LEN]            
        shand = x[:,PR_LEN:(PR_LEN+PR_LEN)]
        pose =  x[:,(PR_LEN+PR_LEN):(PR_LEN+PR_LEN+PO_LEN)]  
        lips =  x[:,(PR_LEN+PR_LEN+PO_LEN):CNN_FEAT_LEN]  
        
        ph = self.flatten(torch.tensor(phand).float()) 
        ph = self.layer_ph(ph)
        ph = self.batchnorm_ph(ph)
        ph = self.relu(ph)
        ph = self.dropout(ph)

        sh = self.flatten(torch.tensor(shand).float())       
        sh = self.layer_sh(sh)
        sh = self.batchnorm_sh(sh)
        sh = self.relu(sh)
        sh = self.dropout(sh)
       
        po = self.flatten(torch.tensor(pose).float())       
        po = self.layer_po(po)
        po = self.batchnorm_po(po)
        po = self.relu(po)
        po = self.dropout(po)
        
        li = self.flatten(torch.tensor(lips).float())       
        li = self.layer_li(li)
        li = self.batchnorm_li(li)
        li = self.relu(li)
        li = self.dropout(li)

        x = torch.cat((ph.view(ph.size(0), -1),
                       sh.view(sh.size(0), -1),
                       po.view(po.size(0), -1),
                       li.view(li.size(0), -1)), dim=1)
        # x = self.batchnorm0(x)
        x = self.layer1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = self.layerFC(x)
       # x = self.softmax(x)

        return x


In [12]:
## MULTI TRAINING
# !!! TRAINING DOES NOT RUN ON MAC OS - (cuda)
if torch.cuda.is_available():
  device = torch.device("cuda")
  print("++++using GPU++++")
else:
  device = torch.device("cpu")
  print("++++using CPU++++")

EPOCHS = 30
BATCH_SIZE = 64
start_time = time.perf_counter()

#datax = datax.reshape(datax.shape[0],datax.shape[1], -1) #.swapaxes(1,2)
#datax = torch.tensor(datax)  # Convert to Torch Tensor
datax = torch.tensor(datax)  # Convert to Torch Tensor

trainx, testx, trainy, testy = train_test_split(datax, datay, test_size=0.15, random_state=42)

# init list for saving predictions for ensemble processing
pred_list = pd.DataFrame(testy, columns=["truth"])

train_data = ASLData(trainx, trainy)
valid_data = ASLData(testx, testy)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, num_workers=WORKERS, shuffle=True)
val_loader = DataLoader(valid_data, batch_size=BATCH_SIZE, num_workers=WORKERS, shuffle=False)
model = ASLModel(0.2).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()
sched = torch.optim.lr_scheduler.StepLR(opt, step_size=300, gamma=0.95)
for i in range(EPOCHS):
    model.train()
    
    train_loss_sum = 0.
    train_correct = 0
    train_total = 0
    train_bar = train_loader
    for x,y in train_bar:
        x = torch.Tensor(x).float().to(device)
        y = torch.Tensor(y).long().to(device) 
        y_pred = model(x)
        loss = criterion(y_pred, y)
        loss.backward()
        opt.step()
        opt.zero_grad()
        
        train_loss_sum += loss.item()
        train_correct += np.sum((np.argmax(y_pred.detach().cpu().numpy(), axis=1) == y.cpu().numpy()))
        train_total += 1
        sched.step()
        
    val_loss_sum = 0.
    val_correct = 0
    val_total = 0
    model.eval()
    for x,y in val_loader:
        x = torch.Tensor(x).float().to(device)
        y = torch.Tensor(y).long().to(device)
        
        with torch.no_grad():
            y_pred = model(x)
            loss = criterion(y_pred, y)
            val_loss_sum += loss.item()
            val_correct += np.sum((np.argmax(y_pred.cpu().numpy(), axis=1) == y.cpu().numpy()))
            val_total += 1
    print(f"DIM={DIMS} FRAMES={FRAMES_OUT}, FEAT={PTS_IN_FRAME}")                          
    print(f"Epoch:{i} > Train Loss: {(train_loss_sum/train_total):.04f}, Train Acc: {train_correct/len(train_data):0.04f}")
    print(f"Epoch:{i} > Val Loss: {(val_loss_sum/val_total):.04f}, Val Acc: {val_correct/len(valid_data):0.04f}")
    print("="*50)

# Save the pytorch model
PATH = f"{ARCHIVE_DIR}/models/modelccn{FRAMES_OUT}.sd"
torch.save(model.state_dict(), PATH)

x = testx.detach().numpy()
prob_cnn = model(torch.tensor(x)).detach().numpy()

pred_list['cnn'] = np.argmax(prob_cnn, axis=1)
print(pred_list.head())

with open(f"/content/drive/MyDrive/GaggleSignLang/pred_cnn.pkl", 'wb') as f1:
       pickle.dump(pred_cnn, f1)
with open(f"/content/drive/MyDrive/GaggleSignLang/prob_cnn.pkl", 'wb') as f1:
       pickle.dump(prob_cnn, f1)
print(prob_cnn.shape)


print("Accuracy:", np.mean(pred_list.truth == pred_list.cnn))
print("#### ELAPSED TIME:", time.perf_counter()-start_time)



++++using CPU++++
DIM=3 FRAMES=32, FEAT=345
Epoch:0 > Train Loss: 3.4657, Train Acc: 0.2211
Epoch:0 > Val Loss: 2.7225, Val Acc: 0.3496
DIM=3 FRAMES=32, FEAT=345
Epoch:1 > Train Loss: 2.3572, Train Acc: 0.4167
Epoch:1 > Val Loss: 2.0977, Val Acc: 0.4899
DIM=3 FRAMES=32, FEAT=345
Epoch:2 > Train Loss: 2.0062, Train Acc: 0.4925
Epoch:2 > Val Loss: 1.8331, Val Acc: 0.5407
DIM=3 FRAMES=32, FEAT=345
Epoch:3 > Train Loss: 1.7816, Train Acc: 0.5415
Epoch:3 > Val Loss: 1.7013, Val Acc: 0.5789
DIM=3 FRAMES=32, FEAT=345
Epoch:4 > Train Loss: 1.6070, Train Acc: 0.5849
Epoch:4 > Val Loss: 1.6092, Val Acc: 0.5973
DIM=3 FRAMES=32, FEAT=345
Epoch:5 > Train Loss: 1.4838, Train Acc: 0.6126
Epoch:5 > Val Loss: 1.5411, Val Acc: 0.6128
DIM=3 FRAMES=32, FEAT=345
Epoch:6 > Train Loss: 1.3765, Train Acc: 0.6388
Epoch:6 > Val Loss: 1.4306, Val Acc: 0.6447
DIM=3 FRAMES=32, FEAT=345
Epoch:7 > Train Loss: 1.2991, Train Acc: 0.6574
Epoch:7 > Val Loss: 1.3641, Val Acc: 0.6636
DIM=3 FRAMES=32, FEAT=345
Epoch:8 > Tr

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/GaggleSignLang/pred_cnn.pkl'