![img](https://www.artetrama.com/uploads/articles/andy-warhol-marilyn-series.jpeg)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import sys, os, gc, types
import time
import math

In [2]:
root_paths = [
    "/data/kaggle-instacart/",
    "/Users/jiayou/Dropbox/珺珺的程序/Kaggle/Instacart/",
    "/Users/jiayou/Dropbox/Documents/珺珺的程序/Kaggle/Instacart/"
]
root = None
for p in root_paths:
    if os.path.exists(p):
        root = p
        break

In [3]:
def load_data(path_data):
    priors = pd.read_csv(path_data + 'order_products__prior.csv', 
                     dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    train = pd.read_csv(path_data + 'order_products__train.csv', 
                    dtype={
                            'order_id': np.int32,
                            'product_id': np.uint16,
                            'add_to_cart_order': np.int16,
                            'reordered': np.int8})
    orders = pd.read_csv(path_data + 'orders.csv', 
                         dtype={
                                'order_id': np.int32,
                                'user_id': np.int64,
                                'eval_set': 'category',
                                'order_number': np.int16,
                                'order_dow': np.int8,
                                'order_hour_of_day': np.int8,
                                'days_since_prior_order': np.float32})    
    return priors, train, orders

In [5]:
priors, train, orders = load_data(root)

In [6]:
priors_indexed = priors.set_index('order_id', drop=False)

In [10]:
da_start = 4
da_step = 1
n_shards = 4
shard = 0

In [4]:
def gen_id():
    n = int(1e8) + shard
    while True:
        yield n
        n += n_shards
gid = gen_id()

In [8]:
def copy_order_products(old_oid, new_oid, results):
    data = priors_indexed.loc[old_oid, :]
    if type(data) == pd.core.series.Series:
        r = data.copy()
        r.order_id = new_oid
        results.append(r)
    else:
        for index, row in data.iterrows():
            r = row.copy()
            r.order_id = new_oid
            results.append(r)
        

In [11]:
aug_orders = []
aug_priors = []
aug_train = []

cnt = 0
n_user = orders.user_id.nunique()
start_time = time.time()

for user_id, group in orders.groupby('user_id'):
    if user_id % n_shards != shard:
        continue
    group = group.sort_values(by='order_number')
    cur_priors = []
    for index, row in group.iterrows():
        if row.order_number >= da_start \
                and row.order_number < len(group) \
                and (row.order_number - da_start) % da_step == 0:
            user_id = next(gid)
            order_id = next(gid)
            
            for order in cur_priors:
                od = order.copy()
                od.order_id = next(gid)
                od.user_id = user_id
                aug_orders.append(od)
                
                copy_order_products(order.order_id, od.order_id, aug_priors)
            
            od = row.copy()
            od.user_id = user_id
            od.order_id = order_id
            od.eval_set = 'train'
            aug_orders.append(od)
            
            copy_order_products(row.order_id, order_id, aug_train)
            
        cur_priors.append(row)
        
    cnt += 1
    if cnt % 100 == 0:
        remaining_time = (time.time() - start_time) / cnt * (n_user - cnt)
        print("{} users processed\t{:.2f}s remaining".format(cnt, remaining_time))

print("{} users processed\nDone".format(cnt))

100 users processed	92082.82s remaining
200 users processed	90033.72s remaining
300 users processed	112954.30s remaining
400 users processed	115382.03s remaining


KeyboardInterrupt: 

In [None]:
priors_df = pd.DataFrame(aug_priors)
train_df = pd.DataFrame(aug_train)
orders_df = pd.DataFrame(aug_orders)

# priors_df = pd.concat([priors, priors_df])
# train_df = pd.concat([train, train_df])
# orders_df = pd.concat([orders, orders_df])

print('priors', priors_df.shape, priors.shape)
print('train', train_df.shape, train.shape)
print('orders', orders_df.shape, orders.shape)

In [None]:
priors_df.to_csv(os.path.join(root, 'order_products__prior.aug{}.csv'.format(shard)), index = False)
train_df.to_csv(os.path.join(root, 'order_products__train.aug{}.csv'.format(shard)), index = False)
orders_df.to_csv(os.path.join(root, 'orders.aug{}.csv'.format(shard)), index = False)

----

# Explore

In [73]:
import math
da_nnew_order = np.array([math.ceil((x - da_start) / da_step) for x in orders.groupby('user_id').order_id.count()]).sum()
da_nnew_order

2596247

In [59]:
orders.groupby('eval_set')['order_id'].count()

eval_set
prior    3214874
test       75000
train     131209
Name: order_id, dtype: int64

In [55]:
orders.groupby('user_id').order_id.count().sum()

3421083

In [53]:
type(orders.loc[0]) == pd.core.series.Series

True

In [7]:
orders.user_id.nunique()

206209

In [12]:
order_prod_cnt = priors.groupby('order_id').product_id.count()

In [13]:
order_prod_cnt

order_id
2           9
3           8
4          13
5          26
6           3
7           2
8           1
9          15
10         15
11          5
12         15
13         13
14         11
15          5
16          3
18         28
19          3
20          8
21          5
22         14
23         14
24          3
25         14
26          8
27         27
28         16
29          5
30          3
31         10
32          9
           ..
3421048     8
3421050    13
3421051    31
3421052     2
3421053     9
3421055    19
3421057     5
3421059     6
3421060    17
3421061    22
3421062     7
3421064     3
3421065     5
3421066     6
3421067     1
3421068    14
3421069    12
3421071     5
3421072    12
3421073     2
3421074     4
3421075     8
3421076     8
3421077     4
3421078     9
3421079     1
3421080     9
3421081     7
3421082     7
3421083    10
Name: product_id, dtype: int64