![Andy Warhol's Marilyn series](https://www.artetrama.com/uploads/articles/andy-warhol-marilyn-series.jpeg)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sys, os, gc, types
import time
import math

In [2]:
# Candidate dataset directories, probed in order; the first one that exists wins.
root_paths = [
    "/data/kaggle-instacart/",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart/"
]
# `root` stays None when none of the candidates exists on this machine.
root = next(
    (candidate for candidate in root_paths if os.path.exists(candidate)),
    None,
)

In [3]:
def load_data(path_data):
    """Load the three Instacart order tables with compact dtypes.

    Args:
        path_data: Directory containing ``order_products__prior.csv``,
            ``order_products__train.csv`` and ``orders.csv``. A trailing
            path separator is optional.

    Returns:
        Tuple ``(priors, train, orders)`` of DataFrames.
    """
    # Both order-product files are read with the same schema, so it is declared once.
    order_product_dtypes = {
        'order_id': np.int32,
        'product_id': np.uint16,
        'add_to_cart_order': np.int16,
        'reordered': np.int8,
    }
    orders_dtypes = {
        'order_id': np.int32,
        'user_id': np.int64,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32,
    }

    # os.path.join accepts the directory with or without a trailing slash;
    # plain string concatenation silently required the slash.
    priors = pd.read_csv(os.path.join(path_data, 'order_products__prior.csv'),
                         dtype=order_product_dtypes)
    train = pd.read_csv(os.path.join(path_data, 'order_products__train.csv'),
                        dtype=order_product_dtypes)
    orders = pd.read_csv(os.path.join(path_data, 'orders.csv'),
                         dtype=orders_dtypes)
    return priors, train, orders

In [4]:
# Augmentation schedule: a synthetic user is emitted for orders whose
# order_number is >= da_start, stepping by da_step from there.
da_start, da_step = 4, 1
# Sharding: only users with user_id % n_shards == shard are processed in this run.
n_shards, shard = 4, 0

In [5]:
# Read the three raw tables from the first data directory that was found.
# `root` is None when no candidate path exists, in which case this call fails.
priors, train, orders = load_data(root)

In [6]:
# Keep only the users that belong to this shard.
orders = orders[orders.user_id % n_shards == shard]

# An order-product row belongs to the shard exactly when its order_id survived
# the filter above. Testing membership directly avoids merging user_id onto the
# large product tables only to drop it again, and avoids calling
# drop(inplace=True) on a filtered slice (SettingWithCopyWarning).
shard_order_ids = orders.order_id
priors = priors[priors.order_id.isin(shard_order_ids)]
train = train[train.order_id.isin(shard_order_ids)]

# Index priors by order_id (keeping the column too) so per-order rows can be
# fetched with priors.loc[order_id].
priors = priors.set_index('order_id', drop=False)
gc.collect()

7

In [7]:
def gen_id():
    """Yield an endless stream of fresh ids for this shard.

    The first id is ``int(1e8) + shard`` and each following id is
    ``n_shards`` larger, so every id is congruent to ``shard`` modulo
    ``n_shards``. Both values are read from module-level globals.
    """
    next_id = shard + int(1e8)
    while True:
        yield next_id
        next_id = next_id + n_shards
# Single shared id stream; both new user ids and new order ids are drawn from it.
gid = gen_id()

In [33]:
def copy_order_products(old_oid, new_oid, results):
    """Append copies of one order's product rows, relabelled with a new order id.

    Rows are read from the module-level ``priors`` frame, which is looked up
    by order_id through ``.loc``.

    Args:
        old_oid: order_id label to look up in ``priors``.
        new_oid: order_id written into every copied row.
        results: list that receives one dict per product row (mutated in place).

    Raises:
        KeyError: if ``old_oid`` is not a label in the ``priors`` index.
    """
    # A list selector always returns a DataFrame, even for an order with a
    # single product, so no Series special case is needed.
    rows = priors.loc[[old_oid]]
    # to_dict('records') builds the row dicts in one pass instead of
    # materialising a Series per row with iterrows().
    for record in rows.to_dict('records'):
        record['order_id'] = new_oid
        results.append(record)
        

In [44]:
class Timeit:
    """Context manager that prints how long the enclosed block took.

    Args:
        process_name: Label used in the printed messages.
        verbose: Truthy to print the start and elapsed-time messages,
            falsy to stay silent.
    """
    def __init__(self, process_name, verbose=1):
        self.process_name = process_name
        self.verbose = verbose
        self.begin_time = None

    def __enter__(self):
        if self.verbose:
            print(self.process_name + " starts...")
        # Record the start unconditionally so the attribute always exists.
        self.begin_time = time.time()
        # Return the instance so `with Timeit(...) as t` binds something useful.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.verbose:
            elapsed = time.time() - self.begin_time
            print('{}: {:.2f}s'.format(self.process_name, elapsed))
        # A falsy return lets any exception raised in the body propagate.
        return False

# Buffers of augmented rows (one dict per row); save_microshard() writes them
# out and resets them.
aug_orders, aug_priors, aug_train = [], [], []

# Index used in the file names of the next microshard to be written.
cur_microshard = 0

def save_microshard():
    """Write the buffered augmented rows to CSV and reset the buffers.

    Builds DataFrames from the module-level ``aug_priors``, ``aug_train`` and
    ``aug_orders`` lists and writes them under ``<root>/aug`` with file names
    tagged by ``shard`` and ``cur_microshard``. It then increments
    ``cur_microshard`` and rebinds the three buffers to fresh empty lists.
    """
    global aug_orders
    global aug_priors
    global aug_train
    global cur_microshard

    with Timeit('mk df'):
        priors_df = pd.DataFrame(aug_priors)
        train_df = pd.DataFrame(aug_train)
        orders_df = pd.DataFrame(aug_orders)

    out_dir = os.path.join(root, 'aug')
    # to_csv does not create missing directories, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    suffix = 'aug{}-{}.csv'.format(shard, cur_microshard)

    with Timeit('to csv'):
        priors_df.to_csv(os.path.join(out_dir, 'order_products__prior.' + suffix), index=False)
        train_df.to_csv(os.path.join(out_dir, 'order_products__train.' + suffix), index=False)
        orders_df.to_csv(os.path.join(out_dir, 'orders.' + suffix), index=False)

    cur_microshard += 1

    aug_orders = []
    aug_priors = []
    aug_train = []
    

# Main augmentation pass: for each user, selected orders are re-emitted as the
# 'train' order of a brand-new synthetic user whose history is a copy of all
# of the user's earlier orders.

# Progress bookkeeping for the ETA printout below.
cnt = 0
n_user = orders.user_id.nunique()
start_time = time.time()

for _, group in orders.groupby('user_id'):
    group = group.sort_values(by='order_number')
    # Orders of this user already visited (as dicts): the history that precedes
    # the row currently being processed.
    cur_priors = []
    for _, row in group.iterrows():
        # Emit a synthetic user when the order is at or after da_start, its
        # order_number is below the user's row count, and it falls on the
        # da_step stride. NOTE(review): the len(group) test presumably skips
        # the user's last order, assuming order_number runs 1..len(group) --
        # confirm against the data.
        if row.order_number >= da_start \
                and row.order_number < len(group) \
                and (row.order_number - da_start) % da_step == 0:
            # Ids are drawn from gid in this exact order: new user id, new
            # train order id, then one id per copied history order.
            user_id = next(gid)
            order_id = next(gid)
            
            # Copy every earlier order (and its product rows) under the new user.
            for order in cur_priors:
                od = dict(order)
                od['order_id'] = next(gid)
                od['user_id'] = user_id
                aug_orders.append(od)
                
                copy_order_products(order['order_id'], od['order_id'], aug_priors)
            
            # The current order becomes the synthetic user's 'train' order.
            od = row.to_dict()
            od['user_id'] = user_id
            od['order_id'] = order_id
            od['eval_set'] = 'train'
            aug_orders.append(od)
            
            # Its product rows are looked up in `priors` and stored as train rows.
            copy_order_products(row.order_id, order_id, aug_train)
            
        # Every visited order (emitted or not) joins the history for later rows.
        cur_priors.append(row.to_dict())
        
    cnt += 1
    if cnt % 100 == 0:
        # Linear extrapolation from the average time per user so far.
        remaining_time = (time.time() - start_time) / cnt * (n_user - cnt)
        print("{} users processed\t{:.2f}s remaining".format(cnt, remaining_time))
        
    # Flush the buffers to disk every 1000 users.
    if cnt % 1000 == 0:
        save_microshard()
        with Timeit('gc'):
            gc.collect()

# Flush whatever is left after the last full batch of 1000 users.
if cnt % 1000 != 0:
    save_microshard()
    gc.collect()
    
print("{} users processed\nDone".format(cnt))

----

# Explore

In [73]:
# Number of synthetic users (equivalently, new train orders) the augmentation
# produces: per user, the count of order_numbers in [da_start, n_orders - 1]
# on the da_step stride. Users with fewer than da_start orders contribute 0,
# so negative values are clamped instead of being subtracted from the total.
orders_per_user = orders.groupby('user_id').order_id.count()
da_nnew_order = int(
    np.ceil((orders_per_user - da_start) / da_step).clip(lower=0).sum()
)
da_nnew_order

2596247

In [59]:
# Number of orders in each eval_set.
orders.groupby('eval_set')['order_id'].count()

eval_set
prior    3214874
test       75000
train     131209
Name: order_id, dtype: int64

In [55]:
# Total number of orders, obtained by summing the per-user order counts.
orders.groupby('user_id').order_id.count().sum()

3421083

In [32]:
# Inspect the first train row as a plain dict.
train.iloc[0].to_dict()

{'add_to_cart_order': 1, 'order_id': 1, 'product_id': 49302, 'reordered': 1}

In [53]:
# Check that a single-label .loc lookup returns a Series. The first remaining
# index label is used because label 0 may have been removed by the shard filter.
isinstance(orders.loc[orders.index[0]], pd.Series)

True

In [7]:
# Number of distinct users in `orders`.
orders.user_id.nunique()

206209

In [32]:
# Highest order_number per user. The nested-dict renamer form of agg() was
# removed from pandas, so the series is aggregated and renamed explicitly.
uocnt = (
    orders.groupby('user_id').order_number.max()
    .rename('user_order_cnt')
    .reset_index()
)

# Number of product rows per prior order. The index is reset first because
# `priors` may carry order_id as both index and column, which makes
# groupby('order_id') ambiguous.
odf = (
    priors[['order_id', 'product_id']]
    .reset_index(drop=True)
    .groupby('order_id').product_id.count()
    .rename('prod_cnt')
    .reset_index()
)
# Attach each order's position in its user's history and that user's maximum
# order_number; user_id is only needed as the join key.
odf = odf.merge(orders[['order_id', 'user_id', 'order_number']], how='left', on='order_id')
odf = odf.merge(uocnt, how='left', on='user_id')
odf = odf.drop('user_id', axis=1)

In [33]:
# Preview only the first rows; the full frame has one row per prior order.
odf.head(10)

Unnamed: 0,order_id,prod_cnt,order_number,user_order_cnt
0,2,9,3,9
1,3,8,16,26
2,4,13,36,57
3,5,26,42,53
4,6,3,4,9
5,7,2,11,13
6,8,1,5,28
7,9,15,14,24
8,10,15,4,25
9,11,5,4,16


In [38]:
# Estimate of the product rows added to the augmented prior table: each order's
# product count is weighted by (user_order_cnt - order_number - 1).
# NOTE(review): presumably that factor is the number of synthetic users that
# include the order in their history; da_start / da_step are not taken into
# account here -- confirm this is meant as an upper bound.
copies_per_order = odf.user_order_cnt - odf.order_number - 1
n_new_priors = (copies_per_order * odf.prod_cnt).sum()
n_new_priors

516048640