## **This module is for parsing the `WLASL_v03.json` file and downloading the .mp4 files for each word. Also, the .mp4 files are converted into `np arrays`**

In [1]:
import numpy as np
import pandas as pd
import json
import os

## Loading the json file as an `pd` DataFrame

In [10]:
main_path = 'dataset_folder'
file_name = 'WLASL_v0.3.json'
file_path = os.path.join(main_path, file_name)
wlasl_df = pd.read_json(file_path)

print("json shpe: "+ str(wlasl_df.shape))
wlasl_df.head()

json shpe: (2000, 2)


Unnamed: 0,gloss,instances
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra..."
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f..."
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_..."


## Fetching available videos list from the dataset

function to check if the video id is available in the dataset and return the viedos ids of the current instance
    
    input: instance json list
    output: list of videos_ids

In [26]:
def get_videos_ids(json_list):
    videos_list = []    
    for ins in json_list:
        video_id = ins['video_id']
        videos_list.append(video_id)
    return videos_list

function to check if the video id is available in the dataset and return the viedos ids and url or any other featrue of the current instance
    
    input: instance json list
    output: list of videos_ids

In [28]:
def get_json_features(json_list):
    
    videos_ids = []
    videos_urls = []
    for ins in json_list:
        video_id = ins['video_id']
        video_url = ins['url']
        videos_ids.append(video_id)
        videos_urls.append(video_url)
    return videos_ids, videos_urls

In [14]:
with open(f'{main_path}/{file_name}', 'r') as data_file:
    json_data = data_file.read()

instance_json = json.loads(json_data)

In [20]:
instance_json[0]['instances']

[{'bbox': [385, 37, 885, 720],
  'fps': 25,
  'frame_end': -1,
  'frame_start': 1,
  'instance_id': 0,
  'signer_id': 118,
  'source': 'aslbrick',
  'split': 'train',
  'url': 'http://aslbricks.org/New/ASL-Videos/book.mp4',
  'variation_id': 0,
  'video_id': '69241'},
 {'bbox': [190, 25, 489, 370],
  'fps': 25,
  'frame_end': -1,
  'frame_start': 1,
  'instance_id': 1,
  'signer_id': 90,
  'source': 'aslsignbank',
  'split': 'train',
  'url': 'https://aslsignbank.haskins.yale.edu/dictionary/protected_media/glossvideo/ASL/BO/BOOK-418.mp4',
  'variation_id': 0,
  'video_id': '65225'},
 {'bbox': [262, 1, 652, 480],
  'fps': 25,
  'frame_end': -1,
  'frame_start': 1,
  'instance_id': 2,
  'signer_id': 110,
  'source': 'valencia-asl',
  'split': 'train',
  'url': 'https://www.youtube.com/watch?v=0UsjUE-TXns',
  'variation_id': 0,
  'video_id': '68011'},
 {'bbox': [123, 19, 516, 358],
  'fps': 25,
  'frame_end': 60,
  'frame_start': 1,
  'instance_id': 3,
  'signer_id': 113,
  'source': 'lil

In [27]:
get_videos_ids(instance_json[0]['instances'])[0]

'69241'

In [31]:
wlasl_df['videos_ids'] = wlasl_df['instances'].apply(get_videos_ids)

In [35]:
features_df = pd.DataFrame(columns=['gloss', 'video_id', 'url'])

for row in wlasl_df.iterrows():
    ids, urls = get_json_features(row[1][1])
    word = [row[1][0]] * len(ids)
    df = pd.DataFrame(list(zip(word, ids, urls)), columns = features_df.columns)
    features_df = features_df.append(df, ignore_index=True)

  features_df = features_df.append(df, ignore_index=True)


From the json file, only 3 items are useful for now, therefore extracring them in a `features_df` dataframe

In [37]:
features_df.index.name = 'index'
features_df

Unnamed: 0_level_0,gloss,video_id,url
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,book,69241,http://aslbricks.org/New/ASL-Videos/book.mp4
1,book,65225,https://aslsignbank.haskins.yale.edu/dictionar...
2,book,68011,https://www.youtube.com/watch?v=0UsjUE-TXns
3,book,68208,https://www.youtube.com/watch?v=1QOYOZ3g-aY
4,book,68012,https://www.youtube.com/watch?v=aGtIHKEdCds
...,...,...,...
21078,whistle,63186,https://media.spreadthesign.com/video/mp4/13/9...
21079,whistle,63187,https://www.handspeak.com/word/w/whistle.mp4
21080,whistle,63188,https://www.signingsavvy.com/signs/mp4/9/9961.mp4
21081,whistle,63189,http://www.aslpro.com/main/w/whistle.swf


In [38]:
features_df.to_csv(f'{main_path}/features_df.csv', index=False)

In [71]:
all_words = features_df['gloss'].unique()
all_words[:10]

array(['book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes',
       'who', 'candy', 'cousin'], dtype=object)

Good! Now that the `df` is loaded, let's explore the classes

## Data exploration

In [41]:
wlasl_df['samples_num'] = wlasl_df['videos_ids'].apply(len)

In [42]:
wlasl_df.head()

Unnamed: 0,gloss,instances,videos_ids,samples_num
0,book,"[{'bbox': [385, 37, 885, 720], 'fps': 25, 'fra...","[69241, 65225, 68011, 68208, 68012, 70212, 702...",40
1,drink,"[{'bbox': [551, 68, 1350, 1080], 'fps': 25, 'f...","[69302, 65539, 70173, 68538, 68042, 68660, 680...",35
2,computer,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[12306, 68028, 69054, 12328, 12329, 12330, 123...",30
3,before,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[05724, 70348, 68007, 05744, 05746, 05728, 057...",26
4,chair,"[{'bbox': [0, 0, 360, 240], 'fps': 25, 'frame_...","[09847, 70230, 68580, 70263, 68019, 09865, 098...",26


In [50]:
print("MIN samples per word:", wlasl_df['samples_num'].min())
print("MAX samples per word:", wlasl_df['samples_num'].max())

MIN samples per word: 6
MAX samples per word: 40


In [55]:
words_sample_counts = wlasl_df[['gloss', 'samples_num']].groupby('samples_num').agg({"gloss":['count', ', '.join]})
words_sample_counts

Unnamed: 0_level_0,gloss,gloss
Unnamed: 0_level_1,count,join
samples_num,Unnamed: 1_level_2,Unnamed: 2_level_2
6,1,caterpillar
7,402,"complete, shoot, united states, accent, act, a..."
8,317,"responsibility, a, a lot, abdomen, able, accou..."
9,244,"cost, diarrhea, ocean, thermometer, above, acc..."
10,233,"river, across, actor, agree, alarm, allergy, a..."
11,173,"exchange, add, airplane, already, also, analyz..."
12,159,"accept, adult, after, ago, allow, america, ang..."
13,121,"afternoon, age, alone, appointment, australia,..."
14,94,"always, animal, argue, baby, back, bake, bath,..."
15,77,"example, about, approve, arrive, balance, bana..."


## Converting Video files into np.arrays of features

**Pipeline:**

    1. Extracting frames from the video files using `OpenCV`
    2. Converting the frames into mp.hollistic keypoints
    3. Storing each set of keypoints for each frame in a designated video

Logic  for getting frames for each video `i` in `features_df['url'][0]`

In [58]:
import cv2

# Open the video file
video = cv2.VideoCapture(features_df['url'][0])

# Get the frame rate of the video
frame_rate = int(round(video.get(cv2.CAP_PROP_FPS)))

# Set the desired frame rate (in this case, 10 fps)
desired_frame_rate = 10

# Set the frame interval to achieve the desired frame rate
frame_interval = frame_rate // desired_frame_rate

# Initialize variables for the loop
success, image = video.read()
count = 0

# Loop through the video frames and extract frames at the desired frame rate
while success:
    # for each frame after this particular interval
    if count % frame_interval == 0:
        cv2.imwrite("frame%d.jpg" % count, image)
    success, image = video.read()
    count += 1

# Release the video object
video.release()

In [75]:
actions = np.array(all_words[0], dtype='object')
actions

array('book', dtype=object)

In [79]:
import mediapipe as mp
import cv2

mp_holistic = mp.solutions.holistic
mp_rendering = mp.solutions.drawing_utils

model = mp_holistic.Holistic(min_detection_confidence=0.7, min_tracking_confidence=0.5)

In [114]:
def extract_keypoints(results):
    pose = np.array([[res.x, res.y, res.z, res.visibility] for res in results.pose_landmarks.landmark]).flatten() if results.pose_landmarks else np.zeros(33*4)
    face = np.array([[res.x, res.y, res.z] for res in results.face_landmarks.landmark]).flatten() if results.face_landmarks else np.zeros(468*3)
    lh = np.array([[res.x, res.y, res.z] for res in results.left_hand_landmarks.landmark]).flatten() if results.left_hand_landmarks else np.zeros(21*3)
    rh = np.array([[res.x, res.y, res.z] for res in results.right_hand_landmarks.landmark]).flatten() if results.right_hand_landmarks else np.zeros(21*3)
    return np.concatenate([pose, face, lh, rh])

def mediapipe_detection(IMAGE, MODEL):
    # image = cv2.cvtColor(IMAGE, cv2.COLOR_BGR2RGB)
    image.flags.writeable = False
    results = MODEL.process(image)
    image.flags.writeable = True
    # image = cv2.cvtColor(IMAGE, cv2.COLOR_RGB2BGR)
    return image, results

def render_landmarks(image, results):
    mp_rendering.draw_landmarks(image, results.face_landmarks, mp_holistic.FACEMESH_TESSELATION , 
                             mp_rendering.DrawingSpec(color=(80,110,10), thickness=1, circle_radius=1), 
                             mp_rendering.DrawingSpec(color=(80,256,121), thickness=1, circle_radius=1)
                             ) 
    mp_rendering.draw_landmarks(image, results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                             mp_rendering.DrawingSpec(color=(80,22,10), thickness=1, circle_radius=4), 
                             mp_rendering.DrawingSpec(color=(80,44,121), thickness=1, circle_radius=2)
                             ) 
    mp_rendering.draw_landmarks(image, results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_rendering.DrawingSpec(color=(121,22,76), thickness=1, circle_radius=4), 
                             mp_rendering.DrawingSpec(color=(121,44,250), thickness=1, circle_radius=2)
                             ) 
    mp_rendering.draw_landmarks(image, results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS, 
                             mp_rendering.DrawingSpec(color=(245,117,66), thickness=1, circle_radius=4), 
                             mp_rendering.DrawingSpec(color=(245,66,230), thickness=1, circle_radius=2)
                             ) 

In [87]:
# for 1st url, FOR NOW
cap = cv2.VideoCapture(features_df['url'][0])
# dir for all images
image_dir = os.path.join('data', 'Images')

action = 'book'
# |__>delete with below line
# for action in actions:
for i in range(1):
#    render frame from video
    success, frame = cap.read()
#    replace frame with hollistic image
    image, results = mediapipe_detection(frame, model)
    
#     makedir for the word
    os.makedirs(os.path.join(image_dir, str(action)), exist_ok=True)
    images_actions_dir = os.path.join(image_dir, str(action))
    count = 0    
    # Loop through the video frames and extract frames at the desired frame rate
    while success:
        # for each frame after this particular interval
        if count % frame_interval == 0:
            cv2.imwrite(f"{images_actions_dir}/frame{count}.jpg", image)
        success, frame = cap.read()
        # image, results = mediapipe_detection(frame, model)
        count += 1
    

    # # Keypoint extraction and saving
    # keypoints = extract_keypoints(results)
    # npy_path = os.path.join(path, action, str(sequence), str(frame_num))
    # np.save(npy_path, keypoints)

cap.release()
cv2.destroyAllWindows()

Saves all the frames in designated foler (without keypoints) 

Downloading 1st hundred words

In [142]:
sum = 0
for i in range(100):
    sum += wlasl_df['samples_num'][i]
samples_of_1st_hundred_words = sum
print(samples_of_1st_hundred_words)

2038


In [137]:
%%timeit
import cv2

last_gloss = 'book'
image_dir = os.path.join('data', 'Images')
video_subfile_counter = 0

for i in range(samples_of_1st_hundred_words):
    
    action = features_df['gloss'][i]
    if action == last_gloss:
        video_subfile_counter +=1
    else:
        last_gloss = action
        video_subfile_counter = 1
    # Open the video file
    video = cv2.VideoCapture(features_df['url'][i])

    # Initialize variables for the loop
    success, image = video.read()
    count = 0

    os.makedirs(os.path.join(image_dir, str(action)), exist_ok=True)
    images_actions_dir = os.path.join(image_dir, str(action))
    os.makedirs(os.path.join(images_actions_dir, f'video{video_subfile_counter}'), exist_ok=True)


    # Loop through the video frames and extract frames at the desired frame rate
    while success:
    #     my addition    
        if count % 3 == 0:
            image, results = mediapipe_detection(frame, model)
            # render_landmarks(image, results)
            # plt.imshow(image)
            keypoints = extract_keypoints(results)
            npy_path = os.path.join(images_actions_dir, str(f'video{video_subfile_counter}'), str(count))
            np.save(npy_path, keypoints)
        success, image = video.read()
        count += 1

    #     add 1 image in each folder 'video i ' for confirmation
        # cv2.imwrite(f"{os.path.join(images_actions_dir, str('video1'))}/frame%d.jpg" % count, image)
    # Release the video object
    video.release()

KeyboardInterrupt: 

In [110]:
np.load(os.path.join(images_actions_dir, 'video1', "{}.npy".format(6))).shape

(1662,)

In [100]:
action

'book'

In [121]:
features_df['gloss'][40]

'drink'

In [130]:
len(all_words)


2000

Collecting all the actions into a variable 

In [139]:
all_words

array(['book', 'drink', 'computer', ..., 'weigh', 'wheelchair', 'whistle'],
      dtype=object)

In [140]:
actions = np.array(all_words[:100])
actions

array(['book', 'drink', 'computer', 'before', 'chair', 'go', 'clothes',
       'who', 'candy', 'cousin', 'deaf', 'fine', 'help', 'no', 'thin',
       'walk', 'year', 'yes', 'all', 'black', 'cool', 'finish', 'hot',
       'like', 'many', 'mother', 'now', 'orange', 'table', 'thanksgiving',
       'what', 'woman', 'bed', 'blue', 'bowling', 'can', 'dog', 'family',
       'fish', 'graduate', 'hat', 'hearing', 'kiss', 'language', 'later',
       'man', 'shirt', 'study', 'tall', 'white', 'wrong', 'accident',
       'apple', 'bird', 'change', 'color', 'corn', 'cow', 'dance', 'dark',
       'doctor', 'eat', 'enjoy', 'forget', 'give', 'last', 'meet', 'pink',
       'pizza', 'play', 'school', 'secretary', 'short', 'time', 'want',
       'work', 'africa', 'basketball', 'birthday', 'brown', 'but',
       'cheat', 'city', 'cook', 'decide', 'full', 'how', 'jacket',
       'letter', 'medicine', 'need', 'paint', 'paper', 'pull', 'purple',
       'right', 'same', 'son', 'tell', 'thursday'], dtype=object

In [141]:
%store actions

Stored 'actions' (ndarray)
