In [18]:
import pandas as pd
from tqdm.notebook import tqdm

import multiprocessing
from multiprocessing import cpu_count

import json

import tensorflow as tf
import numpy as np
import os
import re
import random
import itertools as it
cpu_count()

10

In [19]:
## config stuff
DATA_DIR = '.'
WORK_DIR = '.'
CREATE_KAGGLE_DATASET = False
N_FOLDS = 5
CHUNK_SIZE = 256

N_WORKERS = cpu_count()

In [20]:
WORD_DICT = json.load(open(os.path.join(DATA_DIR, 'character_to_prediction_index.json')))


def match_current_files(root_data_dir: str) -> pd.DataFrame:
    """matches examples in training to the files in the training directory"""
    train = pd.read_csv(os.path.join(root_data_dir, 'train.csv'))

    # create the list of files that we can parse
    def parquet_match(x):
        match = re.match(r'(\d+)\.parquet', x)
        if match:
            return int(match.group(1))
    file_ids = set(map(parquet_match, os.listdir(os.path.join(root_data_dir, 'train_landmarks'))))

    return train[train.file_id.isin(file_ids)]

manifest = match_current_files(DATA_DIR)
manifest = manifest.groupby('file_id').sample(frac=1).reset_index(drop=True)

In [21]:
DATASET_NAME = f'ASLF-{N_FOLDS}fold'
DATASET_DIR = os.path.join(WORK_DIR, DATASET_NAME)

os.makedirs(DATASET_DIR, exist_ok=True)


if CREATE_KAGGLE_DATASET:
    with open('/kaggle/input/kaggleapi/kaggle.json') as f:
        kaggle_creds = json.load(f)
        
    os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
    os.environ['KAGGLE_KEY'] = kaggle_creds['key']
    
    !kaggle datasets init -p /tmp/{DATASET_NAME}
    
    with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
        dataset_meta = json.load(f)
    
    dataset_meta['id'] = f'jonathanpick/{DATASET_NAME}'
    dataset_meta['title'] = DATASET_NAME
    
    with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', 'w') as output:
        json.dump(dataset_meta, output)
    print(dataset_meta)
    
    !cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
    !ls /tmp/{DATASET_NAME}

In [33]:
# encode and write w multiprocessing
N_FEATURES = 543

def encode_example(sequence: np.ndarray, frame: np.ndarray, user_id: int):
    feature = {
        'user_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[user_id])),
        'sequence': tf.train.Feature(bytes_list=tf.train.BytesList(value=[sequence.tobytes()])),
        'frame': tf.train.Feature(bytes_list=tf.train.BytesList(value=[frame.tobytes()]))
    }
    return tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()    

def decode_example(b):
    features = {
    'frame': tf.io.FixedLenFeature([], dtype=tf.dtypes.string),
    'sequence': tf.io.FixedLenFeature([], dtype=tf.dtypes.string)
    }
    decoded = tf.io.parse_single_example(b, features)
    frame = np.frombuffer(decoded['frame'].numpy()).reshape(-1, 543, 3)
    sequence = np.frombuffer(decoded['sequence'].numpy(), dtype=np.int64)
    return (sequence, frame)

class ParquetReader():
    """Parallel-processing reader for lists of parquet files"""
    def __init__(self, file_names):
        self.file_names = file_names
        self.current, self.next, self.index = pd.DataFrame([]), pd.DataFrame([]), pd.Index([])
        self.current_path, self.next_path = None, None
        
    def _swap_parquet(self):
        if self.current.empty:
            self.current_path = self.file_names.pop()
            self.current = pd.read_parquet(self.current_path)
            self.index = self.current.index.unique()
            if self.file_names:
                self.next_path = self.file_names.pop()
                self.next = pd.read_parquet(self.next_path) # TODO: eventually another process
        elif not self.next.empty:
            del self.current
            self.current = self.next
            self.index = self.current.index.unique()
            self.current_path = self.next_path
            if self.file_names:
                self.next_path = self.file_names.pop()
                self.next = pd.read_parquet(self.next_path)
            else:
                self.next = None
        else:
            return
        self.progress = tqdm(desc=f'reader {self.current_path}', total=len(self.index))
        
        
    def __getitem__(self, idx):
        if idx not in self:
            self._swap_parquet()
            assert idx in self # it should always be the case that idx is now in the index
        
        frame = self.current.loc[idx]
        self.progress.update(1)
        
        if type(frame) != pd.DataFrame:
            return None
        frame = frame.drop('frame', axis=1)
        
        frame = frame[sorted(frame.columns, key=lambda x: x[2:])] # group the columns by their feature in three coordinates (x,y,z)
        n_frames = len(frame)
        
        return frame.values.reshape(n_frames, N_FEATURES, 3) # we shape this into a frame x feature x axis tensor
    
    def __contains__(self, idx):
        if self.index.empty:
            return False
        else: return idx in self.index
    
    def __len__(self):
        return len(self.examples)

class ChunkIter():
    """Iterate through a list of chunk examples and produce coordinate matrices"""
    def __init__(self, chunk_list, parquet_reader):
        self.chunk_list = chunk_list
        self.parquet_reader = parquet_reader

    def __iter__(self):
        for chunk in self.chunk_list:
            chunk_seqs = []
            chunk_frames = []
            fold = chunk.iloc[0].fold
            for idx in chunk.index:
                example = chunk.loc[idx]
                seq_id = example.sequence_id
                seq = np.array(list(map(lambda x: WORD_DICT[x], example.phrase)))
                frame = self.parquet_reader[seq_id]

                chunk_frames += frame,
                chunk_seqs += seq,
            yield (chunk_frames, chunk_seqs), fold

class ParallelFoldWriter():
    """A processing class that reads and writes tfrecord files in parallel. It is a little bit tedious because we don't have 1:1 correspondence between parquet files and training examples.
    Yeah this might be a bit over-engineered. And yes, I know that it should use parallell processing instead of coroutines. That will be addressed when I am done with the project and have time to worry about these things."""
    def __init__(self, manifest: pd.DataFrame, read_dir: str, write_dir: str, n_folds: int, chunk_size: int, workers: int):
        # generate fold groups
        manifest = [df for _, df in manifest.groupby('file_id')]
        random.shuffle(manifest)
        manifest = pd.concat(manifest).reset_index()
        fold_size = len(manifest) // n_folds
        manifest['fold'] = manifest.index // chunk_size
        file_groups = list(map(lambda x: os.path.join(read_dir, x), manifest['path'].unique()))[::-1]

        # we will probably have a hanging portion in the last fold
        if len(manifest) % n_folds:
            manifest.loc[n_folds*fold_size:,'fold'] = n_folds - 1

        self.chunks = [c for _, c in manifest.groupby(lambda x: x // chunk_size)]
        self.write_dir = write_dir
        self.reader = ParquetReader(file_groups)
        self.workers = workers

    def _write_chunk(self, data):
        print('wrote a chunk')
        chunk, fold = data
        chunk_size = len(chunk)
        filename = os.path.join(self.write_dir, f'fold{fold}-{chunk_num}-{chunk_size}.tfrecord')
        options=tf.io.TFRecordOptions(compression_type='GZIP')
        writer = tf.io.TFRecordWriter(filename, options=options)
        for sequence, frame in chunk:
            encoded_bytes = encode_example(sequence, frame)
            writer.write(encoded_bytes)
        writer.close()
        
        
    def do_writes(self):
        chunk_iter = ChunkIter(self.chunks, self.reader)
        process_chunks = map(lambda x: it.islice(chunk_iter, x, x+self.workers), range(0, len(self.chunks), self.workers))
        for c in process_chunks:
            for chunk in c:
                p = multiprocessing.Process(target=self._write_chunk, args=chunk)
                p.start()
                break
                
            # process_pool = [multiprocessing.Process(target=self._write_chunk, args=(chunk, )).start() for chunk in c]
            # for process in process_pool:
                # process.join()





In [34]:
writer = ParallelFoldWriter(manifest, DATA_DIR, DATASET_DIR, N_FOLDS, CHUNK_SIZE, N_WORKERS)
writer.do_writes()

reader ./train_landmarks/1019715464.parquet:   0%|          | 0/998 [00:00<?, ?it/s]

TypeError: cannot pickle '_hashlib.HMAC' object

In [28]:
import gc
gc.collect()

0

In [None]:
!rm -rf /tmp/fold*

In [None]:
### TODO: move this to the colab notebook

# # from IPython.display import HTML
# import matplotlib.animation as animation
# from matplotlib.animation import FuncAnimation

# fig, ax = plt.subplots()

# def filter_nans(frames):
#     return 

# def plot_frame(frame, edges=[], indices=[]):
#     frame[np.isnan(frame)] = 0
#     x = list(frame[...,0])
#     y = list(frame[...,1])
#     if len(indices) == 0:
#         indices = list(range(len(x)))
#     ax.clear()
#     ax.scatter(x, y, color='blue')
#     for i in range(len(x)):
#         ax.text(x[i], y[i], indices[i])
    
#     ax.set_xticks([])
#     ax.set_yticks([])
#     ax.set_xticklabels([])
#     ax.set_yticklabels([])
    
# def animate_frames(frames, edges=[], indices=[]):
#         anim = FuncAnimation(fig, lambda frame: plot_frame(frame, edges, indices), frames=frames, interval=100)
#         return HTML(anim.to_jshtml())

In [None]:
DATASET_NAME = f'ASLF-{n_splits}fold'

os.makedirs(f'/tmp/{DATASET_NAME}', exist_ok=True)

with open('/kaggle/input/kaggleapi/kaggle.json') as f:
    kaggle_creds = json.load(f)
    
os.environ['KAGGLE_USERNAME'] = kaggle_creds['username']
os.environ['KAGGLE_KEY'] = kaggle_creds['key']

!kaggle datasets init -p /tmp/{DATASET_NAME}

with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json') as f:
    dataset_meta = json.load(f)

dataset_meta['id'] = f'jonathanpick/{DATASET_NAME}'
dataset_meta['title'] = DATASET_NAME

with open(f'/tmp/{DATASET_NAME}/dataset-metadata.json', 'w') as output:
    json.dump(dataset_meta, output)
print(dataset_meta)

!cp /tmp/{DATASET_NAME}/dataset-metadata.json /tmp/{DATASET_NAME}/meta.json
!ls /tmp/{DATASET_NAME}

In [None]:
writeGroup = ParallelFoldWriterGroup(manifest, f'/tmp/{DATASET_NAME}', 5, 256, 10)
await writeGroup.main_write_loop()

In [None]:
from datetime import datetime
version_name = datetime.now().strftime("%Y%m%d-%H%M%S")
print(version_name)

In [None]:
import boto3

settings = {
    'id': 'id',
    'secret': 'secret'
}

session = boto3.Session(
    aws_access_key_id=settings['id'],
    aws_secret_access_key=settings['secret'],
    region_name='us-west-1'
)

s3 = session.resource('s3')

In [None]:
bucket_name = DATASET_NAME.lower() + "-" + version_name
bucket = s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={
                'LocationConstraint': 'us-west-1'
            }
        )

In [None]:
def upload_directory(bucket, directory):
    for root, dirs, files in os.walk(directory):
        for file in files:
            bucket.upload_file(os.path.join(root, file), file)
        

In [None]:
upload_directory(bucket, f'/tmp/{DATASET_NAME}')