In [37]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import ast
import os
import datetime as dt
from tqdm import tqdm
import pandas as pd
import numpy as np
from joblib import Parallel, delayed 
import time 

In [38]:
def f2cat(filename: str) -> str:
    """Return the part of ``filename`` that precedes its first ``.``.

    When the name contains no dot, the whole string is returned unchanged.
    """
    stem, _sep, _rest = filename.partition('.')
    return stem

class Simplified():
    """Reader for the per-category CSV files under ``<input_path>/train_simplified``."""

    def __init__(self, input_path='./input'):
        # Root directory that contains the ``train_simplified`` folder.
        self.input_path = input_path

    def list_all_categories(self):
        """Return the category names found in ``train_simplified``.

        Every directory entry is converted with ``f2cat`` and the resulting
        names are sorted case-insensitively.
        """
        files = os.listdir(os.path.join(self.input_path, 'train_simplified'))
        return sorted([f2cat(f) for f in files], key=str.lower)

    @staticmethod
    def _selects_column(usecols, column):
        """Return False only when ``usecols`` demonstrably excludes ``column``."""
        if usecols is None:
            return True
        if callable(usecols):
            return bool(usecols(column))
        names = [c for c in usecols if isinstance(c, str)]
        if not names:
            # Positional (or empty) selections cannot be resolved by name here,
            # so keep the original behaviour and let pandas decide.
            return True
        return column in names

    def read_training_csv(self, category, nrows=None, usecols=None, drawing_transform=False):
        """Load ``<input_path>/train_simplified/<category>.csv`` into a DataFrame.

        Args:
            category: File name without the ``.csv`` suffix.
            nrows: Forwarded to ``pd.read_csv``; ``None`` reads every row.
            usecols: Forwarded to ``pd.read_csv``; ``None`` reads every column.
            drawing_transform: When true, each ``drawing`` cell is converted
                from its string form with ``ast.literal_eval``.

        Returns:
            The loaded DataFrame. ``timestamp`` is parsed as a date whenever
            the column is part of the selection.

        Raises:
            KeyError: If ``drawing_transform`` is requested but the ``drawing``
                column was not loaded.
        """
        csv_path = os.path.join(self.input_path, 'train_simplified', category + '.csv')
        read_kwargs = {'nrows': nrows, 'usecols': usecols}
        # Asking pandas to parse a column that ``usecols`` filtered out makes
        # read_csv fail, so only request date parsing when it can succeed.
        if self._selects_column(usecols, 'timestamp'):
            read_kwargs['parse_dates'] = ['timestamp']
        df = pd.read_csv(csv_path, **read_kwargs)
        if drawing_transform:
            df['drawing'] = df['drawing'].apply(ast.literal_eval)
        return df

In [39]:
# Wall-clock reference for the timing report in the last cell.
start = dt.datetime.now()
# Dataset root. Set the DOODLE_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell; the original
# location stays the default.
DATA_DIR = os.environ.get('DOODLE_DATA_DIR') or '/home/doodle/data/'
s = Simplified(DATA_DIR)
# Number of train_k{k}.csv shards the training rows are split into.
NCSVS = 100
categories = s.list_all_categories()
print(len(categories))

340


In [40]:
def write_training_file(y, cat):
    """Distribute the rows of one category over the ``full_train`` shard files.

    The category CSV is read through the module-level ``s`` reader, labelled
    with ``y``, and each row is assigned to shard ``(key_id // 10**7) % NCSVS``.
    The ``key_id`` column is dropped from the output; ``y`` and ``cv`` are kept.

    Args:
        y: Integer label stored in the ``y`` column. ``y == 0`` (re)creates
            every shard file with a header row; any other value appends
            without a header, so the ``y == 0`` call must run first.
        cat: Category name passed to ``s.read_training_csv``.
    """
    # A fresh checkout has no output directory yet; create it rather than
    # failing on the first to_csv call.
    os.makedirs('full_train', exist_ok=True)

    df = s.read_training_csv(cat)
    df['y'] = y
    df['cv'] = (df.key_id // 10 ** 7) % NCSVS
    # key_id is only needed to derive the shard id, so drop it once here
    # instead of once per shard.
    df = df.drop(['key_id'], axis=1)

    is_first_category = (y == 0)
    for k in range(NCSVS):
        filename = 'full_train/train_k{}.csv'.format(k)
        chunk = df[df.cv == k]
        if is_first_category:
            # Written even when the chunk is empty so every shard gets its header.
            chunk.to_csv(filename, index=False)
        else:
            chunk.to_csv(filename, mode='a', header=False, index=False)
# Wrap the list (not the enumerate object) so tqdm knows the total and can
# show a percentage and an ETA.
# The categories are processed sequentially on purpose: write_training_file
# creates each shard and its header on the y == 0 call and appends on every
# later call, so the calls cannot be reordered or run concurrently.
for y, cat in enumerate(tqdm(categories)):
    write_training_file(y, cat)

340it [21:52,  4.00s/it]


In [41]:
def write_actual_file(k, seed=None):
    """Shuffle shard ``k`` and replace it with a gzip-compressed copy.

    Reads ``full_train/train_k{k}.csv``, writes its rows in random order to
    ``full_train/train_k{k}.csv.gz`` and then deletes the uncompressed file.
    Nothing happens when the uncompressed file does not exist.

    Args:
        k: Shard number used to build the file name.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the global ``np.random`` state is used, as before.
    """
    filename = 'full_train/train_k{}.csv'.format(k)
    if not os.path.exists(filename):
        return

    df = pd.read_csv(filename)
    rng = np.random if seed is None else np.random.RandomState(seed)
    # Sorting one uniform draw per row yields a random permutation. Using the
    # positions directly avoids a temporary column that could overwrite (and
    # then drop) a real column with the same name.
    order = np.argsort(rng.rand(len(df)))
    df.iloc[order].to_csv(filename + '.gz', compression='gzip', index=False)
    os.remove(filename)
# Shuffle and compress the shards one after another, with a progress bar.
shard_progress = tqdm(range(NCSVS))
for k in shard_progress:
    write_actual_file(k)

100%|██████████| 100/100 [4:01:10<00:00, 144.08s/it] 


In [16]:
end = dt.datetime.now()
# timedelta.seconds only holds the sub-day remainder; total_seconds() also
# counts whole days, so a run longer than 24 hours is reported correctly.
elapsed_seconds = int((end - start).total_seconds())
print('Latest run {}.\nTotal time {}s'.format(end, elapsed_seconds))

Latest run 2018-11-06 05:55:26.263650.
Total time 256s
