In [None]:
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import os
import ast
import datetime as dt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 10]
plt.rcParams['font.size'] = 14
import seaborn as sns
import cv2
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation
from tensorflow.keras.metrics import categorical_accuracy, top_k_categorical_accuracy, categorical_crossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications import NASNetMobile, MobileNet
from tensorflow.keras.applications.mobilenet import preprocess_input
from tensorflow.keras.models import load_model
from joblib import Parallel, delayed
from functools import partial
from os import listdir 
import time
import pickle 
import numpy as np
from tqdm import tqdm
from scipy import sparse

In [15]:
DP_DIR = "../../data/"
BASE_SIZE = 256
NCATS = 340

In [None]:
def draw_cv2(raw_strokes, size=256, lw=6, time_color=True):
    img = np.zeros((BASE_SIZE, BASE_SIZE), np.uint8)
    for t, stroke in enumerate(raw_strokes):
        for i in range(len(stroke[0]) - 1):
            color = 255 - min(t, 10) * 13 if time_color else 255
            _ = cv2.line(img, (stroke[0][i], stroke[1][i]),
                         (stroke[0][i + 1], stroke[1][i + 1]), color, lw)
    if size != BASE_SIZE:
        return cv2.resize(img, (size, size))
    else:
        return img

def get_input(df, size, lw, time_color):
    df['drawing'] = df['drawing'].apply(ast.literal_eval)
    x = np.zeros((len(df), size, size, 1))
    for i, raw_strokes in enumerate(df.drawing.values):
        x[i, :, :, 0] = draw_cv2(raw_strokes, size=size, lw=lw,
                                 time_color=time_color)
    x = preprocess_input(x).astype(np.float32)
    y = keras.utils.to_categorical(df.y, num_classes=NCATS)
    return x, y
    
def image_generator_xd(size, batchsize, ks, lw=6, time_color=True):
    partial_get_input = partial(get_input, size=size, lw=lw, time_color=time_color)
    while True:
        for k in np.random.permutation(ks):
            filename = os.path.join(DP_DIR, 'train_k{}.csv.gz'.format(k))
            tmp_chunks = []
            for chunk in pd.read_csv(filename, chunksize=batchsize, nrows=batchsize*8):
                tmp_chunks.append(chunk)
                if len(tmp_chunks) == 8:
                    inputs = Parallel(n_jobs=1)(delayed(partial_get_input)(chunk) for chunk in tmp_chunks)
                    tmp_chunks = [] 
                    for inp in inputs:
                        yield inp
            inputs = Parallel(n_jobs=1)(delayed(partial_get_input)(chunk) for chunk in tmp_chunks)
            for inp in inputs:
                yield inp




In [None]:
train_datagen = image_generator_xd(size=128, batchsize=10000, ks=99)

In [66]:
start = time.time()
for i in tqdm(range(100)):
    a = next(train_datagen)
    with open("partition_"+str(i)+".npy", 'wb') as of:
        pickle.dump(a, of)
        

print(time.time()-start)


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:25<41:53, 25.39s/it][A
  2%|▏         | 2/100 [00:26<29:28, 18.05s/it][A
  3%|▎         | 3/100 [00:27<20:51, 12.90s/it][A
  4%|▍         | 4/100 [00:28<14:53,  9.30s/it][A
  5%|▌         | 5/100 [00:29<10:45,  6.80s/it][A
  6%|▌         | 6/100 [00:29<07:53,  5.04s/it][A
  7%|▋         | 7/100 [00:30<05:54,  3.81s/it][A
  8%|▊         | 8/100 [00:31<04:30,  2.94s/it][A
  9%|▉         | 9/100 [00:57<14:49,  9.78s/it][A
 10%|█         | 10/100 [00:58<10:44,  7.16s/it][A
 11%|█         | 11/100 [00:59<07:51,  5.30s/it][A
 12%|█▏        | 12/100 [01:00<05:50,  3.98s/it][A
 13%|█▎        | 13/100 [01:01<04:25,  3.06s/it][A
 14%|█▍        | 14/100 [01:02<03:27,  2.41s/it][A
 15%|█▌        | 15/100 [01:03<02:52,  2.03s/it][A
 16%|█▌        | 16/100 [01:04<02:22,  1.70s/it][A
 17%|█▋        | 17/100 [01:32<13:22,  9.67s/it][A
 18%|█▊        | 18/100 [01:33<09:37,  7.04s/it][A
 19%|█▉        | 19/100 [01:3

608.15407371521


In [14]:
start = time.time()
get_input(df[:1000], 128, 6, False)
print(time.time()-start)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(array([[[[-1.        ],
          [-1.        ],
          [-1.        ],
          ...,
          [-1.        ],
          [-1.        ],
          [-1.        ]],
 
         [[-1.        ],
          [-0.49803922],
          [ 1.        ],
          ...,
          [-1.        ],
          [-1.        ],
          [-1.        ]],
 
         [[-1.        ],
          [ 1.        ],
          [ 1.        ],
          ...,
          [-1.        ],
          [-1.        ],
          [-1.        ]],
 
         ...,
 
         [[-1.        ],
          [-1.        ],
          [-1.        ],
          ...,
          [-1.        ],
          [-1.        ],
          [-1.        ]],
 
         [[-1.        ],
          [-1.        ],
          [-1.        ],
          ...,
          [-1.        ],
          [-1.        ],
          [-1.        ]],
 
         [[-1.        ],
          [-1.        ],
          [-1.        ],
          ...,
          [-1.        ],
          [-1.        ],
    

0.4958078861236572


In [None]:
50/2

In [None]:
with open("test_file", 'wb') as of:
    pickle.dump(df, of)
    #np.save(of, df, allow_pickle=True, fix_imports=True)

In [None]:
start = time.time()
a = pickle.load(open("test_file", 'rb'))
print(time.time()-start)

In [None]:
len(a[0])