# Pre-Processing Data Notebook
---

## Import Dependencies and Load Data

In [None]:
!pip install mediapipe

Library Requierment

In [1]:
import numpy as np
import pandas as pd
import mediapipe as mp
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

Google Mounting

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/capstone-project/asl-signs

Read CSV Data

In [3]:
df_expanded = pd.read_csv('train_expanded.csv')
df_expanded.head()

Unnamed: 0,path,participant_id,sequence_id,sign,start_frame,end_frame,total_frame
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow,20,42,23
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait,29,39,11
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,103,207,105
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,17,28,12
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie,22,39,18


## Function Helper and Global Variable

Getting list of lips landmark indexes

In [4]:
lips = set()
for elem in mp.solutions.face_mesh_connections.FACEMESH_LIPS:
    lips.add(elem[0])
    lips.add(elem[1])
np.array(lips)

array({0, 267, 269, 270, 13, 14, 17, 402, 146, 405, 409, 415, 291, 37, 39, 40, 178, 308, 181, 310, 311, 312, 185, 314, 317, 318, 61, 191, 321, 324, 78, 80, 81, 82, 84, 87, 88, 91, 95, 375},
      dtype=object)

### Global Variable

In [5]:
ROWS_PER_FRAME = 543 # Number of landmark each frame
SEQUENCE_LENGTH = 30 # 30 frame
NUM_LABELS = 250

lipsUpperOuter = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291]
lipsLowerOuter = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
lipsUpperInner = [78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308]
lipsLowerInner = [78, 95, 88, 178, 87, 14, 317, 402, 318, 324, 308]

# Landmark order each frame: IDX:0 | Lips -> Left Hand -> Pose -> Right Hand | IDX:543
lipsIDX = np.array(lipsUpperOuter + lipsLowerOuter + lipsUpperInner + lipsLowerInner)
lhIDX = 468
poseIDX = 489
rhIDX = 522

### Function Helper

* Generate Decoder and Encoder Labels

In [14]:
def extract_json(json_list):
  encode = {}
  decode = {}
  
  for id, word in enumerate(json_list):
    encode[word.lower()] = id
    decode[id] = word.lower()
  
  return encode, decode

* Landmark Extractor

In [6]:
def extract_keypoint(pq_path):
    data = pd.read_parquet(pq_path, columns=['x', 'y', 'z'])
    data.replace(np.nan, 0, inplace=True) # Imputing empty left/right hand landmark
    total_frame = int(len(data) / ROWS_PER_FRAME)
    
    landmarks = []
    sequences_interval = 1 if total_frame < SEQUENCE_LENGTH else total_frame // SEQUENCE_LENGTH
    
    for frame in range(0, SEQUENCE_LENGTH*sequences_interval, sequences_interval):
      try:
        boundary = ROWS_PER_FRAME * frame

        lips = np.array(data.iloc[lipsIDX + boundary]).flatten()
        lh = np.array(data.iloc[lhIDX+boundary : poseIDX+boundary]).flatten()
        pose = np.array(data.iloc[poseIDX+boundary : rhIDX+boundary]).flatten()
        rh = np.array(data.iloc[rhIDX+boundary : ROWS_PER_FRAME*(frame+1)]).flatten()

        landmarks.append(np.concatenate([lips,lh,pose,rh]))
      except:
        landmarks.append(np.zeros(354)) # Imputing empty frame

    return np.array(landmarks)

## Pre-Processing

In [7]:
# Test the function
extract_test = extract_keypoint(df_expanded['path'][0])
print(extract_test.shape)
del extract_test

(30, 354)


Create lables encoder and decoder

In [15]:
with open("sign_to_prediction_index_map.json", 'r') as json_file:
  json_data = json_file.read()
s2p_json = json.loads(json_data)

encode, decode = extract_json(s2p_json)
print(encode)
print(decode)

{'tv': 0, 'after': 1, 'airplane': 2, 'all': 3, 'alligator': 4, 'animal': 5, 'another': 6, 'any': 7, 'apple': 8, 'arm': 9, 'aunt': 10, 'awake': 11, 'backyard': 12, 'bad': 13, 'balloon': 14, 'bath': 15, 'because': 16, 'bed': 17, 'bedroom': 18, 'bee': 19, 'before': 20, 'beside': 21, 'better': 22, 'bird': 23, 'black': 24, 'blow': 25, 'blue': 26, 'boat': 27, 'book': 28, 'boy': 29, 'brother': 30, 'brown': 31, 'bug': 32, 'bye': 33, 'callonphone': 34, 'can': 35, 'car': 36, 'carrot': 37, 'cat': 38, 'cereal': 39, 'chair': 40, 'cheek': 41, 'child': 42, 'chin': 43, 'chocolate': 44, 'clean': 45, 'close': 46, 'closet': 47, 'cloud': 48, 'clown': 49, 'cow': 50, 'cowboy': 51, 'cry': 52, 'cut': 53, 'cute': 54, 'dad': 55, 'dance': 56, 'dirty': 57, 'dog': 58, 'doll': 59, 'donkey': 60, 'down': 61, 'drawer': 62, 'drink': 63, 'drop': 64, 'dry': 65, 'dryer': 66, 'duck': 67, 'ear': 68, 'elephant': 69, 'empty': 70, 'every': 71, 'eye': 72, 'face': 73, 'fall': 74, 'farm': 75, 'fast': 76, 'feet': 77, 'find': 78, '

### Spltting Data

**Splitting data composition:**
* 80% Train
* 10% Validation
* 10% Test

In [8]:
x = df_expanded['path']
x.to_frame()

Unnamed: 0,path
0,train_landmark_files/26734/1000035562.parquet
1,train_landmark_files/28656/1000106739.parquet
2,train_landmark_files/16069/100015657.parquet
3,train_landmark_files/25571/1000210073.parquet
4,train_landmark_files/62590/1000240708.parquet
...,...
94472,train_landmark_files/53618/999786174.parquet
94473,train_landmark_files/26734/999799849.parquet
94474,train_landmark_files/25571/999833418.parquet
94475,train_landmark_files/29302/999895257.parquet


In [21]:
# Encode lables to numerical
y = np.array([encode[sign.lower()] for sign in df_expanded['sign']])
y

array([ 25, 232,  48, ...,  86, 188, 105])

In [27]:
# One hot encode the numerical encoded lables
y_encode = to_categorical(y, num_classes=250)
print(y_encode.shape)
y_encode[0]

(94477, 250)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [41]:
x_train_path, x_rem, y_train, y_rem = train_test_split(x, y_encode, train_size=0.8, random_state=42, stratify=y_encode)

In [42]:
test_size = 0.5
x_val_path, x_test_path, y_val, y_test = train_test_split(x_rem, y_rem, test_size=0.5, random_state=42)

Checking the results

In [43]:
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

(75581, 250)
(9448, 250)
(9448, 250)


Save the split arrangement

In [44]:
np.save('x_train_path', x_train_path)
np.save('y_train', y_train)
np.save('x_val_path', x_val_path)
np.save('y_val', y_val)
np.save('x_test_path', x_test_path)
np.save('y_test', y_test)

# To free up memory
del x_train_path
del y_train
del x_val_path
del y_val
del x_test_path
del y_test

### Extract and Collect Keypoints

#### Validation Data

* Extract and collect keypoints

In [45]:
x_val = []
tmp = np.load('x_val_path.npy', allow_pickle=True)
size = len(tmp)
for i,path in enumerate(tmp):
  print(f"[{i+1}/{size}] {path}")
  x_val.append(extract_keypoint(path))

[1/9448] train_landmark_files/55372/4195908437.parquet
[2/9448] train_landmark_files/37055/1346688350.parquet
[3/9448] train_landmark_files/22343/1687373581.parquet
[4/9448] train_landmark_files/4718/2687745952.parquet
[5/9448] train_landmark_files/36257/1323568793.parquet
[6/9448] train_landmark_files/16069/1824773605.parquet
[7/9448] train_landmark_files/49445/2343988216.parquet
[8/9448] train_landmark_files/37055/2593747933.parquet
[9/9448] train_landmark_files/37055/3854077830.parquet
[10/9448] train_landmark_files/61333/3941332296.parquet
[11/9448] train_landmark_files/61333/32588190.parquet
[12/9448] train_landmark_files/25571/521674435.parquet
[13/9448] train_landmark_files/37779/2331604245.parquet
[14/9448] train_landmark_files/2044/174668283.parquet
[15/9448] train_landmark_files/26734/384588684.parquet
[16/9448] train_landmark_files/28656/3520660348.parquet
[17/9448] train_landmark_files/34503/4293790339.parquet
[18/9448] train_landmark_files/29302/2266628202.parquet
[19/9448

* Checking the results

In [None]:
np.array(x_val).shape

* Save the data

In [47]:
np.save('x_val.npy', x_val)
del x_val # To free up memory

#### Test Data

* Extract and collect keypoints

In [49]:
x_test = []
tmp = np.load('x_test_path.npy', allow_pickle=True)
size = len(tmp)
for i,path in enumerate(tmp):
  print(f"[{i+1}/{size}] {path}")
  x_test.append(extract_keypoint(path))

[1/9448] train_landmark_files/28656/3465245561.parquet
[2/9448] train_landmark_files/61333/2939056849.parquet
[3/9448] train_landmark_files/36257/953106424.parquet
[4/9448] train_landmark_files/16069/1785977275.parquet
[5/9448] train_landmark_files/34503/3740014621.parquet
[6/9448] train_landmark_files/49445/1482796882.parquet
[7/9448] train_landmark_files/26734/3994815984.parquet
[8/9448] train_landmark_files/37779/2479144697.parquet
[9/9448] train_landmark_files/26734/3301276626.parquet
[10/9448] train_landmark_files/28656/2862377573.parquet
[11/9448] train_landmark_files/28656/2472628787.parquet
[12/9448] train_landmark_files/22343/2480696946.parquet
[13/9448] train_landmark_files/55372/3857607017.parquet
[14/9448] train_landmark_files/37779/1970990625.parquet
[15/9448] train_landmark_files/36257/2565063756.parquet
[16/9448] train_landmark_files/2044/1747100059.parquet
[17/9448] train_landmark_files/28656/1625669037.parquet
[18/9448] train_landmark_files/28656/4091960937.parquet
[19

* Checking the results

In [50]:
np.array(x_test).shape

(9448, 30, 354)

* Save the data

In [51]:
np.save('x_test.npy', x_test)
del x_test # To free up memory

#### Train Data

In [53]:
tmp = np.load('x_train_path.npy', allow_pickle=True)
len(tmp)//2

37790

* **[Batch-1]** Extract and collect keypoints

In [54]:
x_train = []
size = len(tmp[:37790])
for i,path in enumerate(tmp[:37790]):
  print(f"[{i+1}/{size}] {path}")
  x_train.append(extract_keypoint(path))

[1/37790] train_landmark_files/27610/3998013432.parquet
[2/37790] train_landmark_files/61333/1893835731.parquet
[3/37790] train_landmark_files/62590/2923904125.parquet
[4/37790] train_landmark_files/37779/119691585.parquet
[5/37790] train_landmark_files/53618/331680263.parquet
[6/37790] train_landmark_files/61333/1394160429.parquet
[7/37790] train_landmark_files/61333/2483697876.parquet
[8/37790] train_landmark_files/27610/3690245245.parquet
[9/37790] train_landmark_files/16069/434673647.parquet
[10/37790] train_landmark_files/22343/749938182.parquet
[11/37790] train_landmark_files/2044/879367654.parquet
[12/37790] train_landmark_files/2044/3029179048.parquet
[13/37790] train_landmark_files/34503/2592345909.parquet
[14/37790] train_landmark_files/37779/3432058329.parquet
[15/37790] train_landmark_files/4718/3285494096.parquet
[16/37790] train_landmark_files/37779/4156073026.parquet
[17/37790] train_landmark_files/30680/3250337272.parquet
[18/37790] train_landmark_files/37779/2706330041

* **[Batch-1]** Checking the results

In [55]:
np.array(x_train).shape

(37790, 30, 354)

* **[Batch-1]** Save the data

In [56]:
np.save('x_train_1.npy', x_train)
del x_train

* **[Batch-2]** Extract and collect keypoint

In [57]:
x_train = []
size = len(tmp[37790:])
for i,path in enumerate(tmp[37790:]):
  print(f"[{i+1}/{size}] {path}")
  x_train.append(extract_keypoint(path))

[1/37791] train_landmark_files/29302/3110143557.parquet
[2/37791] train_landmark_files/2044/3636864402.parquet
[3/37791] train_landmark_files/53618/2688016421.parquet
[4/37791] train_landmark_files/53618/4260843258.parquet
[5/37791] train_landmark_files/25571/2544524466.parquet
[6/37791] train_landmark_files/53618/1568116885.parquet
[7/37791] train_landmark_files/55372/1115418609.parquet
[8/37791] train_landmark_files/61333/1416279173.parquet
[9/37791] train_landmark_files/32319/1089236343.parquet
[10/37791] train_landmark_files/16069/1716273074.parquet
[11/37791] train_landmark_files/37055/2190217137.parquet
[12/37791] train_landmark_files/36257/3853859789.parquet
[13/37791] train_landmark_files/18796/1096725717.parquet
[14/37791] train_landmark_files/61333/4293771291.parquet
[15/37791] train_landmark_files/28656/2673900188.parquet
[16/37791] train_landmark_files/55372/2014747562.parquet
[17/37791] train_landmark_files/55372/1049333044.parquet
[18/37791] train_landmark_files/55372/852

* **[Batch-2]** Checking the results

In [58]:
np.array(x_train).shape

(37791, 30, 354)

* **[Batch-2]** Checking the results

In [59]:
np.save('x_train_2.npy', x_train)
del x_train