In [46]:
# Reference: https://www.kaggle.com/paulantoine/light-gbm-benchmark-0-3692

In [6]:
import numpy as np
import pandas as pd
import lightgbm as lgb
# Directory (with trailing slash) that the CSV paths below are built from.
IDIR = 'input/'

# Read the prior-order line items, requesting narrow integer dtypes for every
# column so the frame is stored compactly.
print('loading priors')
priors = pd.read_csv(
    f'{IDIR}order_products__prior.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8,
    ),
)

loading priors


In [7]:
# Read the train-order line items with the same narrow dtypes used for priors.
print('loading train')
train = pd.read_csv(
    f'{IDIR}order_products__train.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8,
    ),
)

# Read the order headers. `days_since_prior_order` is loaded as float32 and
# `eval_set` as a pandas categorical.
print('loading orders')
orders = pd.read_csv(
    f'{IDIR}orders.csv',
    dtype=dict(
        order_id=np.int32,
        user_id=np.int32,
        eval_set='category',
        order_number=np.int16,
        order_dow=np.int8,
        order_hour_of_day=np.int8,
        days_since_prior_order=np.float32,
    ),
)

# Read the product catalogue, keeping only the id columns listed in `usecols`.
# The dtype mapping only names columns that are actually loaded; the previous
# `order_id` entry referred to a column that `usecols` never selects.
print('loading products')
products = pd.read_csv(
    IDIR + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

# One summary line per loaded table: its name, its shape, and its column names.
print('\n'.join(
    f"{label} {frame.shape}: {', '.join(frame.columns)}"
    for label, frame in (('priors', priors), ('orders', orders), ('train', train))
))

loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [8]:
print(f'computing proudcts, len:{len(products)}')

# Per-product statistics over the prior orders:
#   orders       - number of prior line items for the product
#   reorders     - sum of the `reordered` flag over those line items
#   reorder_rate - reorders / orders
prods = pd.DataFrame({
    'orders': priors.groupby(priors.product_id).size().astype(np.int32),
    'reorders': priors['reordered'].groupby(priors.product_id).sum().astype(np.float32),
})
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

# Attach the statistics to the catalogue and index it by product_id while
# keeping product_id available as a regular column too.
products = products.join(prods, on="product_id").set_index('product_id', drop=False)

computing proudcts, len:49688


In [9]:
# Quick look at the first rows of both tables. Only the final expression of a
# notebook cell is rendered, so `products.head()` is evaluated but its result
# is discarded; the table shown below the cell is `priors.head()`.
# (The commented-out line is a leftover scratch computation of per-product
# reorder sums.)
#tmp = priors['reordered'].groupby(priors.product_id).sum()
products.head()
priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [10]:
print("add order info to priors")

# Index the order headers by order_id (kept as a column as well) so they can be
# joined onto the line items by key.
orders = orders.set_index('order_id', drop=False)

# Bring every order-level column onto each prior line item. The right-hand
# order_id duplicates the join key, gets the '_' suffix, and is dropped again.
priors = (
    priors
    .join(orders, on="order_id", rsuffix='_')
    .drop('order_id_', axis=1)
)

add order info to priors


In [11]:
# Per-user statistics computed from the order headers:
#   average_days_between_orders - mean of days_since_prior_order
#   nb_orders                   - number of rows in `orders` for the user
usr = pd.DataFrame({
    'average_days_between_orders':
        orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32),
    'nb_orders':
        orders.groupby('user_id').size().astype(np.int16),
})

In [12]:
# Per-user statistics computed from the prior line items:
#   total_items          - number of prior line items
#   all_products         - set of distinct product_ids the user has bought
#   total_distinct_items - size of that set (stored as float32)
users = pd.DataFrame({
    'total_items': priors.groupby('user_id').size().astype(np.int16),
    'all_products': priors.groupby('user_id')['product_id'].apply(set),
})
users['total_distinct_items'] = users['all_products'].map(len).astype(np.float32)

In [13]:
# Add the order-level user statistics to the item-level ones (index-on-index
# left join), then release the temporary frame.
users = users.join(usr, how='left')
del usr

In [14]:
# Items per order for each user: total_items divided by nb_orders.
users['average_basket'] = users['total_items'].div(users['nb_orders']).astype(np.float32)
print('users shape: {}'.format(users.shape))

users shape: (206209, 6)


In [15]:
print('compute userXproduct f -this is long')

# Composite (user, product) key: user_id * 100000 + product_id. product_id is
# read as uint16 (< 100000), so each pair maps to a distinct key.
# user_id is int32, and user_id * 100000 does not fit in 32 bits once
# user_id >= 21475, so both operands are widened to int64 before the
# arithmetic to avoid silent wrap-around.
priors['user_product'] = (
    priors.user_id.astype(np.int64) * 100000
    + priors.product_id.astype(np.int64)
)

compute userXproduct f -this is long


In [21]:
# Accumulate, for every user_product key, a 3-tuple:
#   [0] number of prior line items seen for the key
#   [1] the largest (order_number, order_id) pair seen for the key
#   [2] running sum of add_to_cart_order
d = {}
for row in priors.itertuples():
    z = row.user_product
    this_order = (row.order_number, row.order_id)
    previous = d.get(z)
    if previous is None:
        # First line item for this key.
        d[z] = (1, this_order, row.add_to_cart_order)
    else:
        seen, latest_order, cart_pos_total = previous
        d[z] = (seen + 1,
                max(latest_order, this_order),
                cart_pos_total + row.add_to_cart_order)

print("to dataframe (less memory)")
# One row per key, one column per tuple position; the dict is freed afterwards.
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d

to dataframe (less memory)


In [20]:
# Name the three positional columns produced from the accumulated tuples.
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']
userXproduct['nb_orders'] = userXproduct['nb_orders'].astype(np.int16)

# `last_order_id` still holds (order_number, order_id) pairs at this point;
# keep only the order_id element. The dtype must be `np.int32`: the previous
# `astype(np,int32)` passed the numpy module itself as the dtype and referenced
# an undefined name `int32`.
userXproduct['last_order_id'] = (
    userXproduct['last_order_id'].map(lambda x: x[1]).astype(np.int32)
)

20227933120     5
20227928985     5
20227909327     1
20227945918     5
20227930035     3
20227917794     7
20227940141     5
20227901819     2
20227943668     3
20597033754    17
20597024838    14
20597017704    13
20597021903    14
20597017668     6
20597046667    13
20597017461     4
20597032665     6
17852046842    17
17852026434     4
17852039758    19
17852027761    49
17852010054    29
17852021351    33
17852022598    12
17852034862    16
17852040285     4
17852017616    19
17852025146    21
17852032645     9
17852041276    46
               ..
12448526919     1
12448534270     1
5005031553      1
5005004302      1
5005010246      1
5005006999      1
5005026209      1
5005006473      1
5005020061      1
10351049187     1
10351020126     1
16718529066     1
16718549383     1
19322548101     1
10868730136     1
5272638061      1
11707638185     1
11707632299     1
11707603060     1
11707620539     1
11707635221     1
11707612861     1
17518512023     1
17518547941     1
2524707854