In [None]:
import pandas as pd
from tqdm.notebook import tqdm
from functools import cmp_to_key

import multiprocessing
from multiprocessing import cpu_count

import json

import tensorflow as tf
import numpy as np
import os
import re
import random
import itertools as it
from datetime import datetime

In [None]:
## config
# Root of the competition input: train.csv, train_landmarks/ and the
# character-to-index JSON are all read from this directory.
DATA_DIR = '/kaggle/input/asl-fingerspelling/'
# Parent directory of the staging folder the dataset files are written to.
WORK_DIR = '/tmp'
# When True, Kaggle dataset metadata is initialised inside the staging folder.
CREATE_KAGGLE_DATASET = True
# Number of folds the manifest is split into.
N_FOLDS = 5
# Manifest rows per chunk; each chunk is written as one TFRecord file.
CHUNK_SIZE = 256
# Second dimension of the (n_frames, N_FEATURES, 3) landmark tensor.
N_FEATURES = 543
# Number of chunks handed to writer processes at a time.
N_WORKERS = cpu_count()

In [None]:
# Character -> integer label mapping shipped with the competition data.
# Read inside a `with` block so the file handle is closed deterministically.
with open(os.path.join(DATA_DIR, 'character_to_prediction_index.json')) as word_dict_file:
    WORD_DICT = json.load(word_dict_file)
# Reverse lookup: integer label -> character.
WORD_DICT_REV = {v:k for k, v in WORD_DICT.items()}


def match_current_files(root_data_dir: str) -> pd.DataFrame:
    """Return the rows of ``train.csv`` whose landmark parquet file exists on disk.

    Args:
        root_data_dir: Directory containing ``train.csv`` and a
            ``train_landmarks`` sub-directory of ``<file_id>.parquet`` files.

    Returns:
        The subset of ``train.csv`` whose ``file_id`` has a matching
        ``<file_id>.parquet`` file in ``train_landmarks``.
    """
    train = pd.read_csv(os.path.join(root_data_dir, 'train.csv'))

    def parquet_match(x):
        # fullmatch: a name such as '123.parquet.tmp' must not count as file 123
        match = re.fullmatch(r'(\d+)\.parquet', x)
        return int(match.group(1)) if match else None

    # create the set of file ids that we can parse, ignoring unrelated files
    landmark_dir = os.path.join(root_data_dir, 'train_landmarks')
    file_ids = {
        file_id
        for file_id in map(parquet_match, os.listdir(landmark_dir))
        if file_id is not None
    }

    return train[train.file_id.isin(file_ids)]

# Keep only the rows whose parquet file is present, then shuffle the rows
# inside each file_id group and renumber the index from zero.
manifest = (
    match_current_files(DATA_DIR)
    .groupby('file_id')
    .sample(frac=1)
    .reset_index(drop=True)
)

In [None]:
DATASET_NAME = f'ASLF-{N_FOLDS}fold'
DATASET_DIR = os.path.join(WORK_DIR, DATASET_NAME)

os.makedirs(DATASET_DIR, exist_ok=True)


if CREATE_KAGGLE_DATASET:
    # Every path below derives from DATASET_DIR, so changing WORK_DIR in the
    # config cell moves the whole staging area instead of silently still
    # writing to /tmp.
    metadata_path = os.path.join(DATASET_DIR, 'dataset-metadata.json')
    metadata_copy_path = os.path.join(DATASET_DIR, 'meta.json')

    # The Kaggle CLI reads its credentials from these environment variables.
    with open('/kaggle/input/kaggleapi/kaggle.json') as f:
        kaggle_creds = json.load(f)

    os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
    os.environ['KAGGLE_KEY'] = kaggle_creds['key']

    # Generates a template dataset-metadata.json inside the staging folder.
    os.system(f'kaggle datasets init -p {DATASET_DIR}')

    with open(metadata_path) as f:
        dataset_meta = json.load(f)

    # Fill in the template with this dataset's id and title.
    dataset_meta['id'] = f'jonathanpick/{DATASET_NAME}'
    dataset_meta['title'] = DATASET_NAME

    with open(metadata_path, 'w') as output:
        json.dump(dataset_meta, output)
    print(dataset_meta)

    os.system(f'cp {metadata_path} {metadata_copy_path}')
    os.system(f'ls {DATASET_DIR}')

In [None]:
# encode and write w multiprocessing
def encode_example(sequence: np.ndarray, frame: np.ndarray):
    """Serialize one training example into a ``tf.train.Example`` byte string.

    Both arrays are stored as raw bytes, which carry no dtype information.
    ``decode_example`` reads them back as int64 (``sequence``) and float32
    (``frame``), so the arrays are cast to exactly those dtypes here; an array
    in any other dtype would otherwise decode to garbage.

    Args:
        sequence: 1-D array of integer character labels.
        frame: Landmark coordinates for the example.

    Returns:
        The serialized ``tf.train.Example`` as ``bytes``.
    """
    sequence_bytes = np.asarray(sequence, dtype=np.int64).tobytes()
    frame_bytes = np.asarray(frame, dtype=np.float32).tobytes()
    feature = {
        'sequence': tf.train.Feature(bytes_list=tf.train.BytesList(value=[sequence_bytes])),
        'frame': tf.train.Feature(bytes_list=tf.train.BytesList(value=[frame_bytes]))
    }
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()

def decode_example(b):
    """Parse one serialized example back into its tensors.

    Args:
        b: A serialized ``tf.train.Example`` holding byte-string features
            ``'frame'`` and ``'sequence'``.

    Returns:
        A dict with ``'frame'`` decoded as float32 and reshaped to
        ``(-1, N_FEATURES, 3)``, and ``'sequence'`` decoded as int64.
    """
    schema = {
        name: tf.io.FixedLenFeature([], dtype=tf.dtypes.string)
        for name in ('frame', 'sequence')
    }
    decoded = tf.io.parse_single_example(b, schema)

    # The raw bytes come back as a flat vector; restore frame x feature x axis.
    flat_frame = tf.io.decode_raw(decoded['frame'], tf.dtypes.float32)
    decoded['frame'] = tf.reshape(flat_frame, (-1, N_FEATURES, 3))

    decoded['sequence'] = tf.io.decode_raw(decoded['sequence'], tf.dtypes.int64)
    return decoded

class ParquetReader():
    """Sequential reader over a list of parquet files with one-file read-ahead.

    Indexing the reader with an id returns that id's rows as a
    ``(n_frames, N_FEATURES, 3)`` array. When the id is not in the file that
    is currently loaded, the reader advances to the next file, so ids must be
    requested in the order in which their files are consumed. Files are taken
    from the END of ``file_names``.
    """
    def __init__(self, file_names):
        # Work on a copy: files are consumed with pop(), which would otherwise
        # empty the list owned by the caller.
        self.file_names = list(file_names)
        self.current, self.next, self.index = pd.DataFrame([]), pd.DataFrame([]), pd.Index([])
        self.current_path, self.next_path = None, None
        self.progress = None

    def _swap_parquet(self):
        """Make the next parquet file current and prefetch the one after it.

        Does nothing when there is no further file to advance to.
        """
        if self.current.empty:
            if not self.file_names:
                # nothing left to load; the caller reports the missing id
                return
            self.current_path = self.file_names.pop()
            self.current = pd.read_parquet(self.current_path)
        elif not self.next.empty:
            self.current = self.next
            self.current_path = self.next_path
        else:
            return

        self.index = self.current.index.unique()
        if self.file_names:
            self.next_path = self.file_names.pop()
            self.next = pd.read_parquet(self.next_path) # TODO: eventually another process
        else:
            # Keep an empty DataFrame (not None) so the `.empty` checks above
            # remain valid on the following call.
            self.next_path = None
            self.next = pd.DataFrame([])

        # one progress bar per file; close the previous one before replacing it
        if self.progress is not None:
            self.progress.close()
        self.progress = tqdm(desc=f'reader {self.current_path}', total=len(self.index))

    def __getitem__(self, idx):
        """Return the rows stored under ``idx`` as an array, or ``None``.

        Returns:
            An array of shape ``(n_frames, N_FEATURES, 3)``, or ``None`` when
            ``.loc`` yields a single row (a Series rather than a DataFrame).

        Raises:
            AssertionError: ``idx`` is in neither the current file nor the
                file loaded after advancing once.
        """
        if idx not in self:
            prev_path = self.current_path
            self._swap_parquet()
            # it should always be the case that idx is now in the index
            if idx not in self:
                print(f'Failed to find example {idx}')
                print(prev_path, self.current_path)
                raise AssertionError(f'example {idx} not found in {self.current_path}')

        frame = self.current.loc[idx]
        self.progress.update(1)

        if not isinstance(frame, pd.DataFrame):
            return None
        frame = frame.drop('frame', axis=1)

        # group the columns by their feature in three coordinates (x,y,z)
        # NOTE(review): the key is the name minus its two-character prefix, so
        # features are ordered lexicographically ('face_10' before 'face_2');
        # ChunkIter orders them numerically via cmp -- confirm which is wanted.
        frame = frame[sorted(frame.columns, key=lambda x: x[2:])]
        n_frames = len(frame)

        return frame.values.reshape(n_frames, N_FEATURES, 3) # we shape this into a frame x feature x axis tensor

    def __contains__(self, idx):
        """Whether ``idx`` is an index value of the currently loaded file."""
        if self.index.empty:
            return False
        return idx in self.index

    def __len__(self):
        """Number of distinct index values in the currently loaded file."""
        # NOTE(review): assumed intent; confirm that the per-file count is what
        # callers of len() expect.
        return len(self.index)
    
def cmp(lh, rh):
    """Three-way comparison of two landmark column names.

    Names have the form ``<axis>_<feature>_<number>`` (e.g. ``x_face_12``) and
    are ordered by:

    1. feature (hand, pose, etc.), compared as strings
    2. feature number, compared as integers
    3. coordinate (x, y, z), compared only when the number strings are identical

    Returns -1, 0 or 1, suitable for ``functools.cmp_to_key``.
    """
    pattern = r'^([xyz])_([a-z_]+)_([0-9]+)$'
    left = re.search(pattern, lh)
    right = re.search(pattern, rh)

    def sign(a, b):
        # -1 / 0 / 1 depending on how a relates to b
        return (a > b) - (a < b)

    left_feature, right_feature = left.group(2), right.group(2)
    if left_feature != right_feature:
        return sign(left_feature, right_feature)

    left_number, right_number = left.group(3), right.group(3)
    if left_number != right_number:
        # differing digit strings are ranked by numeric value only
        return sign(int(left_number), int(right_number))

    return sign(left.group(1), right.group(1))

class ChunkIter():
    """Iterate through a list of chunk examples and produce coordinate matrices.

    Each element of ``chunk_list`` is a DataFrame of manifest rows; the
    columns read here are ``phrase``, ``sequence_id``, ``file_id``, ``path``
    and ``fold``. Only one parquet file is held in memory at a time and it is
    reloaded whenever ``file_id`` changes between consecutive rows.
    """
    def __init__(self, chunk_list, read_dir):
        self.chunk_list = chunk_list
        self.read_dir = read_dir

        self.current_parquet, self.file_id = None, None
        self.progress = None

    def __iter__(self):
        """Yield ``((frames, sequences), fold)`` for every chunk.

        ``frames`` and ``sequences`` are parallel lists. Rows for which
        ``_read_parquet`` returns no array are skipped.
        """
        for chunk in self.chunk_list:
            chunk_seqs = []
            chunk_frames = []
            # the fold of the chunk's first row labels the whole chunk
            fold = chunk.iloc[0].fold
            for idx in chunk.index:
                row = chunk.loc[idx]
                # int64 is the dtype decode_example reads the labels back as
                seq = np.array([WORD_DICT[char] for char in row.phrase], dtype=np.int64)
                frame = self._read_parquet(row)

                if not isinstance(frame, np.ndarray):
                    continue
                chunk_frames.append(frame)
                chunk_seqs.append(seq)
            yield (chunk_frames, chunk_seqs), fold

    def _swap_parquet(self):
        """Implement if we find it necessary to do parquet background process loading. Currently not implemented"""
        pass

    def _read_parquet(self, row):
        """Return the landmark array for one manifest row.

        Args:
            row: Manifest row providing ``sequence_id``, ``file_id`` and
                ``path`` (joined onto ``read_dir``).

        Returns:
            An array of shape ``(n_frames, N_FEATURES, 3)``, or ``None`` when
            ``.loc`` yields a single row (a Series rather than a DataFrame).
        """
        idx = row.sequence_id
        file_id = row.file_id

        if file_id != self.file_id:
            # Release the previous file before reading the next one, and reset
            # file_id so a failed read cannot leave a stale id cached.
            self.current_parquet = None
            self.file_id = None
            if self.progress is not None:
                self.progress.close()
            self.current_parquet = pd.read_parquet(os.path.join(self.read_dir, row.path))
            self.file_id = file_id
            self.progress = tqdm(desc=f'reader for parquet {self.file_id}', total=len(self.current_parquet.index.unique()))

        frame = self.current_parquet.loc[idx]
        self.progress.update(1)
        if not isinstance(frame, pd.DataFrame):
            return None
        frame = frame.drop('frame', axis=1)
        # order columns by feature, then feature number, then coordinate
        frame = frame[sorted(frame.columns, key=cmp_to_key(cmp))]
        n_frames = len(frame)

        return frame.values.reshape(n_frames, N_FEATURES, 3) # we shape this into a frame x feature x axis tensor
    
class ProcessChunkIter():
    """Group the items of an iterable into lists of at most ``n_workers`` items."""
    def __init__(self, chunk_iter, n_workers):
        self.chunk_iter = chunk_iter
        self.chunk_length = n_workers

    def __iter__(self):
        """Yield consecutive batches in order; only the last may be shorter."""
        batch = []
        for position, item in enumerate(self.chunk_iter):
            # a batch is complete at every non-zero multiple of the batch size
            batch_is_full = position % self.chunk_length == 0 and position != 0
            if batch_is_full:
                yield batch
                batch = []
            batch.append(item)

        # flush whatever is left over
        if batch:
            yield batch
        

def write_chunk(data, write_dir, chunk_num):
    """Write one chunk of examples to a GZIP-compressed TFRecord file.

    Args:
        data: ``((frames, seqs), fold)`` where ``frames`` and ``seqs`` are
            parallel lists of arrays.
        write_dir: Directory the file is created in.
        chunk_num: Number embedded in the file name.

    The file is named ``fold{fold}-{chunk_num}-{n}.tfrecord`` where ``n`` is
    the number of frames in the chunk.
    """
    (frames, seqs), fold = data
    chunk_size = len(frames)
    filename = os.path.join(write_dir, f'fold{fold}-{chunk_num}-{chunk_size}.tfrecord')
    options = tf.io.TFRecordOptions(compression_type='GZIP')
    # The context manager closes the writer even when encoding or writing an
    # example raises.
    with tf.io.TFRecordWriter(filename, options=options) as writer:
        for frame, sequence in zip(frames, seqs):
            writer.write(encode_example(sequence, frame))

class ParallelFoldWriter():
    """A processing class that reads and writes tfrecord files in parallel.

    It is a little bit tedious because we don't have 1:1 correspondence
    between parquet files and training examples.

    Args:
        manifest: Rows to write; must contain a ``file_id`` column.
        read_dir: Directory that manifest paths are resolved against.
        write_dir: Directory the TFRecord files are written to.
        n_folds: Number of folds the rows are split into.
        chunk_size: Manifest rows per chunk (one TFRecord file per chunk).
        workers: Number of writer processes run at the same time.
    """
    def __init__(self, manifest: pd.DataFrame, read_dir: str, write_dir: str, n_folds: int, chunk_size: int, workers: int):
        # Shuffle whole file_id groups, keeping the rows of each file
        # contiguous so consecutive rows come from the same parquet file.
        file_groups = [df for _, df in manifest.groupby('file_id')]
        random.shuffle(file_groups)
        self.manifest = pd.concat(file_groups).reset_index()

        # At least one row per fold: with fewer rows than folds the floor
        # division below would otherwise divide by zero.
        fold_size = max(1, len(self.manifest) // n_folds)
        self.manifest['fold'] = self.manifest.index // fold_size
        # we will probably have a hanging portion; it joins the last fold
        self.manifest['fold'] = self.manifest['fold'].clip(upper=n_folds - 1)

        # NOTE(review): chunks are cut by row position only, so a chunk can
        # straddle a fold boundary -- confirm this is acceptable.
        self.chunks = [c for _, c in self.manifest.groupby(lambda x: x // chunk_size)]
        self.read_dir = read_dir
        self.write_dir = write_dir
        self.workers = workers

    def do_writes(self):
        """Write every chunk, running at most ``workers`` processes at a time.

        Raises:
            RuntimeError: A writer process exited with a non-zero code.
        """
        chunk_iter = ChunkIter(self.chunks, self.read_dir)
        process_chunks = ProcessChunkIter(chunk_iter, self.workers)
        for i, batch in enumerate(process_chunks):
            assert len(batch) <= self.workers
            pool = []
            for j, chunk in enumerate(batch):
                chunk_num = i*self.workers + j
                process = multiprocessing.Process(target=write_chunk, args=(chunk, self.write_dir, chunk_num))
                # start() returns None, so keep the Process object itself;
                # otherwise there is nothing to join.
                process.start()
                pool.append((chunk_num, process))

            # Wait for the whole batch before reading more data; this bounds
            # concurrency and guarantees the files are complete on return.
            for _, process in pool:
                process.join()

            failed = [chunk_num for chunk_num, process in pool if process.exitcode != 0]
            if failed:
                raise RuntimeError(f'writer process failed for chunk(s) {failed}')

In [None]:
# Build the fold/chunk plan from the manifest, then write every chunk to DATASET_DIR.
writer = ParallelFoldWriter(
    manifest=manifest,
    read_dir=DATA_DIR,
    write_dir=DATASET_DIR,
    n_folds=N_FOLDS,
    chunk_size=CHUNK_SIZE,
    workers=N_WORKERS,
)
writer.do_writes()

In [None]:
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
bucket_name = 'gs://' + DATASET_NAME.lower() + "-" + version_name
!mkdir /tmp/{DATASET_NAME}
!echo "test-text" > /tmp/{DATASET_NAME}/test

!gcloud auth activate-service-account --key-file /kaggle/input/gcloud/optical-genre-394019-e8f0bc3608c7.json
!gcloud config set project optical-genre-394019 -q
!gcloud storage buckets create {bucket_name} --location us-west1 --pap
!gcloud storage cp /tmp/{DATASET_NAME}/* {bucket_name}