In [1]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [2]:
import os
# Default to GPU 6, but keep any CUDA_VISIBLE_DEVICES value already supplied by
# the caller so the notebook can run on machines with a different GPU layout.
# This is set ahead of the torch / rmm / cudf imports below.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "6")
from time import time
import re
import glob
import warnings

# tools for data preproc/loading
import torch
import rmm
import nvtabular as nvt
from nvtabular.ops import Normalize,  Categorify,  LogOp, FillMissing, Clip, get_embedding_sizes
from nvtabular.loader.torch import TorchAsyncItr, DLDataLoader
from nvtabular.utils import device_mem_size
import cudf

# tools for training
from fastai.basic_train import Learner
from fastai.basic_data import DataBunch
from fastai.tabular import TabularModel
from fastai.metrics import accuracy

In [3]:
# --- Data locations and loader settings (overridable through environment variables) ---
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR', '/raid/criteo/tests/crit_int_pq')
# where we'll save our processed data to
OUTPUT_DATA_DIR = os.environ.get('OUTPUT_DATA_DIR', '/raid/criteo/tests/test_dask')
BATCH_SIZE = int(os.environ.get('BATCH_SIZE', 800000))
PARTS_PER_CHUNK = int(os.environ.get('PARTS_PER_CHUNK', 10))
SHUFFLE = True
# number of days worth of data to use for training; the rest is used for validation
NUM_TRAIN_DAYS = 23

# --- Dataset schema: I1..I13 continuous, C1..C26 categorical, one label column ---
CONTINUOUS_COLUMNS = [f'I{idx}' for idx in range(1, 14)]
CATEGORICAL_COLUMNS = [f'C{idx}' for idx in range(1, 27)]
LABEL_COLUMNS = ['label']
COLUMNS = [*CONTINUOUS_COLUMNS, *CATEGORICAL_COLUMNS, *LABEL_COLUMNS]

In [4]:
# Train / validation sub-directories underneath OUTPUT_DATA_DIR.
output_train_dir, output_valid_dir = (
    os.path.join(OUTPUT_DATA_DIR, split_dir) for split_dir in ('train/', 'valid/')
)

In [5]:
# Re-create the RMM allocator as a pool. The initial pool size is 30% of the
# currently free device memory, rounded down to a multiple of 256.
rmm.reinitialize(
    pool_allocator=True,
    initial_pool_size=256 * int(0.3 * device_mem_size(kind='free') / 256),
)



In [6]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so sort
# before slicing: the same first four parquet files are then picked on every run.
train_paths = sorted(glob.glob(os.path.join(output_train_dir, "*.parquet")))[:4]
valid_paths = sorted(glob.glob(os.path.join(output_valid_dir, "*.parquet")))[:4]

In [7]:
# Both datasets are read as parquet with the same partition memory fraction
# (0.04 split across PARTS_PER_CHUNK); only the file list differs.
train_data, valid_data = (
    nvt.Dataset(file_list, engine="parquet", part_mem_fraction=0.04 / PARTS_PER_CHUNK)
    for file_list in (train_paths, valid_paths)
)

In [8]:
class WritePartChunk:
    """
    Callback that logs one partition before it has been collated into a chunk.

    For every frame that contains a single partition, one record made of the
    partition index and the frame's row count is appended to ``log_path``.

    Parameters
    ----------
    log_path : str, optional
        Text file the records are appended to. Defaults to
        ``"./rando/part_chunk.txt"``.
    """

    def __init__(self, log_path="./rando/part_chunk.txt"):
        self.log_path = log_path

    def _exec(self, gdf):
        """
        Append a record for ``gdf`` if only one partition is represented.

        ``gdf`` must expose a ``"part_idx"`` column supporting ``min``, ``max``
        and ``iloc``, plus a ``shape`` attribute. Frames whose ``part_idx``
        values are not all identical are skipped without writing anything.
        """
        part_ids = gdf["part_idx"]
        if part_ids.min() != part_ids.max():
            return

        # Positional access: the frame's index need not contain the label 0.
        part_idx = part_ids.iloc[0]

        # Make sure the target directory exists instead of failing on first use.
        log_dir = os.path.dirname(self.log_path)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        # Open per call inside a context manager so every record is flushed and
        # the handle released right away; no handle is kept on the instance, so
        # nothing needs closing when the object is destroyed.
        with open(self.log_path, "a+") as log_file:
            log_file.write(f"{part_idx} {gdf.shape[0]} \n")
        
class WriteChunkShuffle:
    """
    Callback that dumps one batch at a time, each made up of rows that may
    come from multiple partitions.

    Every call writes ``./rando/batch_<n>.parquet`` where ``n`` counts the
    calls made on this instance so far.
    """

    def __init__(self):
        self.pdf = None
        self.batch_count = 0

    def _exec(self, gdf):
        """
        Save the last column of ``gdf[1]`` as a ``part_idx`` parquet column.

        NOTE(review): despite its name, ``gdf`` is indexed like a tuple of
        2-D arrays here; presumably ``gdf[1]`` is the continuous-feature
        tensor with the partition index appended as its last column -- confirm
        against the loader.
        """
        part_ids = gdf[1][:, -1]
        out_path = f"./rando/batch_{self.batch_count}.parquet"

        frame = cudf.DataFrame()
        frame["part_idx"] = cudf.Series(part_ids)
        frame.to_parquet(out_path)

        # Only advance the counter once the file has been written.
        self.batch_count += 1

# Debug hooks keyed by the loader event they are attached to.
callbacks = {
    "PART_CHUNK": [WritePartChunk()],
    "BATCH_GET": [WriteChunkShuffle()],
}

# The train and validation iterators are configured identically; only the
# dataset differs.
# NOTE(review): both iterators receive the same callback objects, so any state
# kept on a callback instance is shared between them -- confirm this is intended.
train_data_itrs, valid_data_itrs = (
    TorchAsyncItr(
        dataset,
        batch_size=BATCH_SIZE,
        cats=CATEGORICAL_COLUMNS,
        conts=CONTINUOUS_COLUMNS,
        labels=LABEL_COLUMNS,
        parts_per_chunk=PARTS_PER_CHUNK,
        callbacks=callbacks,
        shuffle=SHUFFLE,
    )
    for dataset in (train_data, valid_data)
)

In [9]:
def gen_col(batch):
    """
    Collate function: turn a loader batch into ``((x0, x1), target)``.

    Only ``batch[0]`` is used. Its first element is passed through unchanged,
    the last column of its second element is dropped, and its third element is
    cast to ``long`` to serve as the target.

    NOTE(review): the dropped column is presumably the partition index that
    the debug callbacks read -- confirm against the loader.
    """
    sample = batch[0]
    features = (sample[0], sample[1][:, :-1])
    target = sample[2].long()
    return features, target

In [10]:
# Wrap each async iterator in a DLDataLoader that collates with gen_col, then
# bundle the two loaders into a fastai DataBunch.
train_dataloader, valid_dataloader = (
    DLDataLoader(data_itr, collate_fn=gen_col, pin_memory=False, num_workers=0)
    for data_itr in (train_data_itrs, valid_data_itrs)
)
databunch = DataBunch(
    train_dataloader,
    valid_dataloader,
    collate_fn=gen_col,
    device="cuda",
)

In [11]:
# Hard-coded embedding table specs handed to TabularModel as ``emb_szs``.
# There are 26 pairs, matching the 26 categorical columns; each pair is
# presumably (number of categories, embedding width) -- confirm.
embeddings = [
    (7599500, 16),
    (5345303, 16),
    (561810, 16),
    (242827, 16),
    (11, 6),
    (2209, 16),
    (10616, 16),
    (100, 16),
    (4, 3),
    (968, 16),
    (15, 7),
    (33521, 16),
    (7838519, 16),
    (2580502, 16),
    (6878028, 16),
    (298771, 16),
    (11951, 16),
    (97, 16),
    (35, 12),
    (17022, 16),
    (7339, 16),
    (20046, 16),
    (4, 3),
    (7068, 16),
    (1377, 16),
    (63, 16),
]

In [12]:
# Tabular network with two outputs and hidden layers of 512 and 256 units,
# fed by the embedding specs above and one input per continuous column.
model = TabularModel(
    emb_szs=embeddings,
    n_cont=len(CONTINUOUS_COLUMNS),
    out_sz=2,
    layers=[512, 256],
)
learn = Learner(databunch, model, metrics=[accuracy])
# Override the learner's loss function with cross-entropy.
learn.loss_func = torch.nn.CrossEntropyLoss()

In [13]:
# Single one-cycle training run; the elapsed wall-clock time in seconds is printed.
epochs, learning_rate = 1, 1.32e-2

start = time()
learn.fit_one_cycle(epochs, learning_rate)
t_final = time() - start
print(t_final)

epoch,train_loss,valid_loss,accuracy,time
0,0.170289,0.128761,0.96651,02:33


153.62917709350586


In [14]:
# Sanity check: read the shuffle flag from the training iterator's private
# `_buff` attribute and display it as the cell output.
train_data_itrs._buff.shuffle

True