In [1]:
import pickle

import pandas as pd
import numpy as np
# from surprise import SVD
# from surprise import Dataset
# from surprise import Reader



import catboost as cb

In [2]:
ORDER_ID_COL = 'order_id'
USER_ID_COL = 'user_id'
ORDER_NUMBER_COL = 'order_number'
ORDER_DOW_COL = 'order_dow'
ORDER_HOD_COL = 'order_hour_of_day'
DAY_SINCE_PRIOR_ORDER_COL = 'days_since_prior_order'
PRODUCT_ID_COL = 'product_id'
ADD_TO_CART_ORDER_COL = 'add_to_cart_order'
REORDERED_COL = 'reordered'
AISLE_ID_COL = 'aisle_id'
DEPARTMENT_ID_COL = 'department_id'


TRANSACTIONS_DTYPES = {
    ORDER_ID_COL: np.int32,
    USER_ID_COL: np.int32,
    ORDER_NUMBER_COL: np.int8,
    ORDER_DOW_COL: np.int8,
    ORDER_HOD_COL: np.int8,
    DAY_SINCE_PRIOR_ORDER_COL: np.float16,
    PRODUCT_ID_COL: np.int32,
    ADD_TO_CART_ORDER_COL: np.int16,
    REORDERED_COL: np.int8,
}

PRODUCTS_DTYPES = {
    PRODUCT_ID_COL: np.int32,
    AISLE_ID_COL: np.int16,
    DEPARTMENT_ID_COL: np.int8,
}




def mean_angle(deg):
    np_rect = np.vectorize(lambda x: rect(1, x))
    return degrees(
        phase(
            np.nanmean(np_rect(np.radians(deg)))
        )
    )

def mean_hod(hours):
    hours2degrees = hours / 24 * 360
    mean_hour = mean_angle(hours2degrees)*24/360
    if mean_hour < 0:
        mean_hour = mean_hour + 24
    if mean_hour == 24:
        return 0.0
    return mean_hour

def mean_dow(dow):
    dow2degrees = dow / 7 * 360
    mean_dow = mean_angle(dow2degrees)*7/360
    if mean_dow < 0:
        mean_dow = mean_dow + 7
    if mean_dow == 7:
        return 0.0
    return mean_dow

def total_buy_n5(t=0):
    return (
        transactions
        [transactions['order_number']>t]
        [transactions['order_number']<=(t+5)]
        .groupby(['user_id', 'product_id'])
        .size()
        .rename(f'total_buy_n5_{t}')
    )

In [3]:
from functools import lru_cache
from math import degrees
from cmath import rect, phase


In [4]:
import numpy as np
import pandas as pd


class Features:

    def __init__(self, transactions, products):
        self.transactions = transactions
        self.products = products

    @property
    @lru_cache
    def user_max_orders(self):
        return self.__get_user_max_orders()

    @property
    @lru_cache
    def target(self):
        return self.__get_target()

    @property
    @lru_cache
    def user_product_features(self):
        return self.__get_user_product_features()

    @property
    @lru_cache
    def product_orders_number(self):
        return self.__get_product_orders_number()

    @property
    @lru_cache
    def user_features(self):
        return self.__get_user_features()

    @property
    @lru_cache
    def product_features(self):
        return self.__get_product_features()

    def get_features(self):

        return (
            pd.merge(
                pd.merge(
                    self.user_product_features,
                    self.user_features,
                    on=USER_ID_COL
                ),
                self.product_features,
                on=PRODUCT_ID_COL
            )
            .set_index([USER_ID_COL, PRODUCT_ID_COL])
            .reindex(self.target.index)
        )

    def __get_user_max_orders(self):
        return (
            self.transactions
            .groupby(by=USER_ID_COL)
            [ORDER_NUMBER_COL]
            .max()
            .astype(TRANSACTIONS_DTYPES[ORDER_NUMBER_COL])
        )

    def __get_product_orders_number(self):
        return (
            self.target
            .reset_index()
            .groupby(PRODUCT_ID_COL)
            ['total_reordered'].sum()
            .sort_values(ascending=False)
            .index
        )

    def __get_target(self):
        target = (
            self.transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [REORDERED_COL].sum()
            .reset_index()
            .rename(columns={REORDERED_COL: 'total_reordered'})
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                PRODUCT_ID_COL: TRANSACTIONS_DTYPES[PRODUCT_ID_COL]
            })
        )

        target['total_reordered'] = target['total_reordered'] / target[USER_ID_COL].map(self.user_max_orders)
        target = target.set_index([USER_ID_COL, PRODUCT_ID_COL]).squeeze()

        return target

    def __get_user_product_features(self):
        user_product = (
            self.transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [ORDER_NUMBER_COL].max()
            .rename('up_order_num')
            .reset_index()
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                PRODUCT_ID_COL: TRANSACTIONS_DTYPES[PRODUCT_ID_COL],
                'up_order_num': np.int8,
            })
        )

        user_product_days_since_prior_order_max = (
            self.transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [DAY_SINCE_PRIOR_ORDER_COL]
            .max()
            .reset_index()
            .rename(columns={DAY_SINCE_PRIOR_ORDER_COL: 'user_product_days_since_prior_order_max'})
        )

        user_product_max_orders = (
            self.transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [ORDER_ID_COL]
            .size()
            .reset_index()
            .rename(columns={ORDER_ID_COL: 'user_item_order_number'})
        )

        user_product = pd.merge(
            user_product,
            user_product_days_since_prior_order_max,
            on=[USER_ID_COL, PRODUCT_ID_COL]
        )
        user_product = pd.merge(
            user_product,
            user_product_max_orders,
            on=[USER_ID_COL, PRODUCT_ID_COL]
        )

        return user_product

    def __get_product_features(self):
        prod_reorder_mean = (
            self.transactions
            .groupby(by=PRODUCT_ID_COL)
             [REORDERED_COL].mean()
            .to_frame('prod_reorder_mean')
            .reset_index()
        )

        products_dt_features = (
            self.transactions
            .groupby(PRODUCT_ID_COL).agg({
                ORDER_DOW_COL: mean_dow,
                ORDER_HOD_COL: mean_hod,
                DAY_SINCE_PRIOR_ORDER_COL: np.mean
            })
            .rename(
                columns={
                    ORDER_DOW_COL: 'mean_order_dow',
                    ORDER_HOD_COL: 'mean_order_hour_of_day',
                    DAY_SINCE_PRIOR_ORDER_COL: 'mean_days_since_prior_order'
                }
            )
            .reset_index()
            .astype({
                'mean_order_dow': np.float32,
                'mean_order_hour_of_day': np.float32,
                'mean_days_since_prior_order': np.float32
            })
        )

        products_features = pd.merge(
            products_dt_features,
            prod_reorder_mean,
            on=PRODUCT_ID_COL
        )

        products_features = pd.merge(
            products_features,
            self.products.set_index(PRODUCT_ID_COL)[[AISLE_ID_COL, DEPARTMENT_ID_COL]],
            on=PRODUCT_ID_COL
        )

        return products_features

    def __get_user_features(self):
        return (
            self.transactions
            .groupby(by=USER_ID_COL)
            [ORDER_NUMBER_COL].max()
            .to_frame('max_orders')
            .reset_index()
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                'max_orders': np.int8
            })
        )


In [5]:
transactions = pd.read_csv(
    'transactions.csv.zip',
    dtype=TRANSACTIONS_DTYPES
)

products = pd.read_csv(
    'products.csv.zip',
    dtype=PRODUCTS_DTYPES
)

In [6]:
features = Features(transactions=transactions, products=products)

In [7]:
X_train = features.get_features()

In [9]:
def train(transactions, products, dump_path='dump_path.pickle', **kwargs):
    features = Features(transactions=transactions, products=products)
    model = cb.CatBoostRegressor(
        n_estimators=kwargs.get('n_estimators', 300),
        max_depth=kwargs.get('max_depth', 2),
        thread_count=kwargs.get('thread_count', 4),
        **kwargs
    )
    model.fit(features.get_features(), features.target)
    model_and_features = {
        'model': model,
        'features': features
    }
    with open(dump_path, 'wb') as file:
        pickle.dump(
            model_and_features,
            file
        )
    return model_and_features


def predict(user_id, k=10, model_and_features=None, dump_path='dump_path.pickle'):
    if model_and_features is None:
        with open(dump_path, 'rb') as file:
            model_and_features = pickle.load(file)
    user_product_features = model_and_features['features'].user_product_features[
        model_and_features['features'].user_product_features[USER_ID_COL].eq(user_id)
    ]
    X = (
            pd.merge(
                pd.merge(
                    user_product_features,
                    model_and_features['features'].user_features,
                    on=USER_ID_COL,
                    how='left'
                ),
                model_and_features['features'],
                on=PRODUCT_ID_COL,
                how='left'
            )
                .set_index([USER_ID_COL, PRODUCT_ID_COL])
        )
    prediction = pd.Series(model_and_features['model'].predict(X), index=X.index, name='prediction')
    prediction = prediction.sort_values(ascending=False).head(k)
    if len(prediction) < k:
        list(prediction) + list(model_and_features['features'].product_orders_number[:k-len(prediction)])
    return prediction



In [11]:
def train(dump_path='dump_path.pickle', **kwargs):
    
    model = cb.CatBoostRegressor(
        n_estimators=kwargs.get('n_estimators', 300),
        max_depth=kwargs.get('max_depth', 2),
        thread_count=kwargs.get('thread_count', 4),
        **kwargs
    )
    model.fit(features.get_features(), features.target)
    return model

model = train()

model_and_features = {
    'model': model,
    'features': features
}

Learning rate set to 0.462875
0:	learn: 0.0930573	total: 702ms	remaining: 3m 29s
1:	learn: 0.0733081	total: 1.11s	remaining: 2m 44s
2:	learn: 0.0633515	total: 1.51s	remaining: 2m 29s
3:	learn: 0.0577135	total: 1.92s	remaining: 2m 21s
4:	learn: 0.0499333	total: 2.34s	remaining: 2m 17s
5:	learn: 0.0470468	total: 2.73s	remaining: 2m 13s
6:	learn: 0.0423209	total: 3.13s	remaining: 2m 10s
7:	learn: 0.0407369	total: 3.51s	remaining: 2m 8s
8:	learn: 0.0374939	total: 3.94s	remaining: 2m 7s
9:	learn: 0.0364719	total: 4.32s	remaining: 2m 5s
10:	learn: 0.0359091	total: 4.7s	remaining: 2m 3s
11:	learn: 0.0333884	total: 5.11s	remaining: 2m 2s
12:	learn: 0.0319783	total: 5.52s	remaining: 2m 1s
13:	learn: 0.0316308	total: 5.9s	remaining: 2m
14:	learn: 0.0284054	total: 6.36s	remaining: 2m
15:	learn: 0.0280384	total: 6.76s	remaining: 2m
16:	learn: 0.0255476	total: 7.19s	remaining: 1m 59s
17:	learn: 0.0249163	total: 7.58s	remaining: 1m 58s
18:	learn: 0.0238534	total: 8.01s	remaining: 1m 58s
19:	learn: 0

158:	learn: 0.0085281	total: 1m 9s	remaining: 1m 1s
159:	learn: 0.0084928	total: 1m 9s	remaining: 1m
160:	learn: 0.0084741	total: 1m 10s	remaining: 1m
161:	learn: 0.0084674	total: 1m 10s	remaining: 1m
162:	learn: 0.0084508	total: 1m 10s	remaining: 59.6s
163:	learn: 0.0084170	total: 1m 11s	remaining: 59.2s
164:	learn: 0.0084049	total: 1m 11s	remaining: 58.7s
165:	learn: 0.0083964	total: 1m 12s	remaining: 58.3s
166:	learn: 0.0083763	total: 1m 12s	remaining: 57.9s
167:	learn: 0.0083687	total: 1m 13s	remaining: 57.4s
168:	learn: 0.0083516	total: 1m 13s	remaining: 57s
169:	learn: 0.0083319	total: 1m 13s	remaining: 56.6s
170:	learn: 0.0083141	total: 1m 14s	remaining: 56.2s
171:	learn: 0.0082888	total: 1m 15s	remaining: 55.8s
172:	learn: 0.0082708	total: 1m 15s	remaining: 55.4s
173:	learn: 0.0082510	total: 1m 15s	remaining: 54.9s
174:	learn: 0.0082336	total: 1m 16s	remaining: 54.5s
175:	learn: 0.0082285	total: 1m 16s	remaining: 54s
176:	learn: 0.0082211	total: 1m 17s	remaining: 53.5s
177:	lea

In [24]:

def predict(user_id, k=10, model_and_features=None, dump_path='dump_path.pickle'):
    if model_and_features is None:
        with open(dump_path, 'rb') as file:
            model_and_features = pickle.load(file)
    user_product_features = model_and_features['features'].user_product_features[
        model_and_features['features'].user_product_features[USER_ID_COL].eq(user_id)
    ]
    X = (
            pd.merge(
                pd.merge(
                    user_product_features,
                    model_and_features['features'].user_features,
                    on=USER_ID_COL,
                    how='left'
                ),
                model_and_features['features'].product_features,
                on=PRODUCT_ID_COL,
                how='left'
            )
                .set_index([USER_ID_COL, PRODUCT_ID_COL])
        )
    prediction = pd.Series(model_and_features['model'].predict(X), index=X.index, name='prediction')
    prediction = list(prediction.sort_values(ascending=False).head(k).reset_index()['product_id'])
    if len(prediction) < k:
        list(prediction) + list(model_and_features['features'].product_orders_number[:k-len(prediction)])
    return prediction


In [25]:
predict(1, 10, model_and_features)

[196, 12427, 10258, 25133, 46149, 13032, 13176, 26405, 26088, 49235]

In [79]:
user_product = (
    transactions
    .groupby([USER_ID_COL, PRODUCT_ID_COL])
    [ORDER_NUMBER_COL].max()
    .rename('up_order_num')
    .reset_index()
    .astype({
        USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
        PRODUCT_ID_COL: TRANSACTIONS_DTYPES[PRODUCT_ID_COL],
        'up_order_num': np.int8,
    })
)

In [13]:
features.target

user_id  product_id
1        196           0.900000
         10258         0.800000
         10326         0.000000
         12427         0.900000
         13032         0.200000
                         ...   
206209   43961         0.153846
         44325         0.000000
         48370         0.000000
         48697         0.000000
         48742         0.076923
Name: total_reordered, Length: 9459065, dtype: float64

In [2]:
ORDER_ID_COL = 'order_id'
USER_ID_COL = 'user_id'
ORDER_NUMBER_COL = 'order_number'
ORDER_DOW_COL = 'order_dow'
ORDER_HOD_COL = 'order_hour_of_day'
DAY_SINCE_PRIOR_ORDER_COL = 'days_since_prior_order'
PRODUCT_ID_COL = 'product_id'
ADD_TO_CART_ORDER_COL = 'add_to_cart_order'
REORDERED_COL = 'reordered'
AISLE_ID_COL = 'aisle_id'
DEPARTMENT_ID_COL = 'department_id'


TRANSACTIONS_DTYPES = {
    ORDER_ID_COL: np.int32,
    USER_ID_COL: np.int32,
    ORDER_NUMBER_COL: np.int8,
    ORDER_DOW_COL: np.int8,
    ORDER_HOD_COL: np.int8,
    DAY_SINCE_PRIOR_ORDER_COL: np.float16,
    PRODUCT_ID_COL: np.int32,
    ADD_TO_CART_ORDER_COL: np.int16,
    REORDERED_COL: np.int8,
}

PRODUCTS_DTYPES = {
    PRODUCT_ID_COL: np.int32,
    AISLE_ID_COL: np.int16,
    DEPARTMENT_ID_COL: np.int8,
}




def mean_angle(deg):
    np_rect = np.vectorize(lambda x: rect(1, x))
    return degrees(
        phase(
            np.nanmean(np_rect(np.radians(deg)))
        )
    )

def mean_hod(hours):
    hours2degrees = hours / 24 * 360
    mean_hour = mean_angle(hours2degrees)*24/360
    if mean_hour < 0:
        mean_hour = mean_hour + 24
    if mean_hour == 24:
        return 0.0
    return mean_hour

def mean_dow(dow):
    dow2degrees = dow / 7 * 360
    mean_dow = mean_angle(dow2degrees)*7/360
    if mean_dow < 0:
        mean_dow = mean_dow + 7
    if mean_dow == 7:
        return 0.0
    return mean_dow

def total_buy_n5(t=0):
    return (
        transactions
        [transactions['order_number']>t]
        [transactions['order_number']<=(t+5)]
        .groupby(['user_id', 'product_id'])
        .size()
        .rename(f'total_buy_n5_{t}')
    )

In [3]:
transactions = pd.read_csv(
    'transactions.csv.zip',
    dtype=TRANSACTIONS_DTYPES
)

products = pd.read_csv(
    'products.csv.zip',
    dtype=PRODUCTS_DTYPES
)

In [39]:
from functools import lru_cache

In [None]:
class Features:
    
    def __init__(self, transactions, products):
        self.transactions = transactions
        self.products = products
          
    def __get_average_order_hod(self):
        return self.transactions.groupby(self.user_id)[self.order_hod].mean()
    
    def __get_average_order_dow(self):
        return self.transactions.groupby(self.user_id)[self.order_dow].mean()
    
    def prepare_features(self):
        features = [
            self.__get_average_order_hod(),
            self.__get_average_order_dow()
        ]
        
        return pd.concat(features, axis=1)
    
    def fit(self):
        pass
    
    @property
    @lru_cache
    def user_max_orders(self):
        return self.__get_user_max_orders()
    
    @property
    @lru_cache
    def target(self):
        return self.__get_target()
    
    @property
    @lru_cache
    def user_product_features(self):
        return self.__get_user_product_features()

    @property
    @lru_cache
    def product_orders_number(self):
        return self.__get_product_orders_number()
    
    @property
    @lru_cache  
    def user_features(self):
        return self.__get_user_features()
    
    @property
    @lru_cache  
    def product_features(self):
        return self.__get_product_features()
    
    
    def get_features(self)
    
        return (
            pd.merge(
                pd.merge(
                    self.user_product_features,
                    self.user_features,
                    on=USER_ID_COL
                ),
                self.product_features,
                on=PRODUCT_ID_COL
            )
            .set_index([USER_ID_COL, PRODUCT_ID_COL])
            .reindex(self.target.index)
        )

    def __get_user_max_orders(self):
        return (
            self.transactions
            .groupby(by=USER_ID_COL)
            [ORDER_NUMBER_COL]
            .max()
            .astype(TRANSACTIONS_DTYPES[ORDER_NUMBER_COL])
        )
    
    def __get_product_orders_number(self):
        return (
            self.target
            .reset_index()
            .groupby(PRODUCT_ID_COL)
            ['total_reordered'].sum()
            .sort_values(ascending=False)
            .index
            
        )


    def __get_target(self):
        
        target = (
            self.transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [REORDERED_COL].sum()
            .rename(columns={REORDERED_COL: 'total_reordered'})
            .reset_index()
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                PRODUCT_ID_COL: TRANSACTIONS_DTYPES[PRODUCT_ID_COL]
            })
        )
        
        target['total_reordered'] = (
            target['total_reordered']
            /
            target[USER_ID_COL].map(self.user_max_orders)
        )
        
        target = target.set_index([USER_ID_COL, PRODUCT_ID_COL]).squeeze()
        
        return target
    

    def __get_top_products(self, k):
        return set(
            self.target
            .reset_index()
            .groupby(PRODUCT_ID_COL)
            ['total_reordered']
            .sum()
            .sort_values(ascending=False)[:k].index
        )
    
    def __get_user_product_features(self):
        
        user_product = (
            selftransactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
           [ORDER_NUMBER_COL].max()
            .sort_values([ORDER_NUMBER_COL])
            .rename(columns={ORDER_NUMBER_COL: 'up_order_num'})
            .reset_index()
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                PRODUCT_ID_COL: TRANSACTIONS_DTYPES[PRODUCT_ID_COL],
                'up_order_num': np.int8,

            })
        )
        
        user_product_days_since_prior_order_max = (
            self.transactions[
                self.transactions[ORDER_NUMBER_COL] == self.transactions[USER_ID_COL].map(self.user_max_orders)
            ]
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [DAY_SINCE_PRIOR_ORDER_COL]
            .max()
            .reset_index()
            .rename(columns={DAY_SINCE_PRIOR_ORDER_COL: 'user_product_days_since_prior_order_max'})
        )

        user_product_max_orders = (
            transactions
            .groupby([USER_ID_COL, PRODUCT_ID_COL])
            [ORDER_ID_COL]
            .size()
            .reset_index()
            .rename(columns={ORDER_ID_COL: 'user_item_order_number'})
        )

        user_product = pd.merge(
            user_product,
            user_product_days_since_prior_order_max,
            on=[USER_ID_COL, PRODUCT_ID_COL]
        )
        user_product = pd.merge(
            user_product,
            user_product_max_orders,
            on=[USER_ID_COL, PRODUCT_ID_COL]
        )
        
        return user_product
    
    def __get_product_features(self):
    
        prod_reorder_mean = (
            self.transactions
            .groupby(by=PRODUCT_ID_COL)
            [REORDERED_COL].mean()
            .to_frame('prod_reorder_mean')
            .reset_index()
        )


        products_dt_features = (
            transactions
            .groupby(PRODUCT_ID_COL).agg({
                ORDER_DOW_COL: mean_dow,
                ORDER_HOD_COL: mean_hod,
                DAY_SINCE_PRIOR_ORDER_COL: np.mean
            })
            .rename(
                columns={
                    ORDER_DOW_COL: 'mean_order_dow',
                    ORDER_HOD_COL: 'mean_order_hour_of_day',
                    DAY_SINCE_PRIOR_ORDER_COL: 'mean_days_since_prior_order'
                }
            )
            .reset_index()
            .astype({
                'mean_order_dow': np.float32,
                'mean_order_hour_of_day': np.float32,
                'mean_days_since_prior_order': np.float32
            })
        )

        products_features = pd.merge(
                products_dt_features,
                prod_reorder_mean,
                on=PRODUCT_ID_COL
        )

        products_features = pd.merge(
            products_features,
            products.set_index(PRODUCT_ID_COL)[[AISLE_ID_COL, DEPARTMENT_ID_COL]],
            on=PRODUCT_ID_COL
        )
        
        return products_features

    def __get_user_features(self):
        
        return (
            self.transactions
            .groupby(by=USER_ID_COL)
            [ORDER_NUMBER_COL].max()
            .to_frame('max_orders')
            .reset_index()
            .astype({
                USER_ID_COL: TRANSACTIONS_DTYPES[USER_ID_COL],
                'max_orders': np.int8
            })
        )

In [4]:
max_orders = (
    transactions
    .groupby(by='user_id')
    ['order_number']
    .max()
    .astype(np.int8)
)



target = (
    transactions
    .groupby(['user_id', 'product_id'])
    .agg(
        {
            'reordered': 'sum',

        }
    )
    .rename(
        columns={
            'reordered': 'total_reordered',

        }
    )
    .reset_index()
    .astype({
        'user_id': TRANSACTIONS_DTYPES['user_id'],
        'product_id': TRANSACTIONS_DTYPES['product_id']
    })
)

target['total_reordered'] = (
    target['total_reordered']
    /
    target['user_id'].map(max_orders)
)

n=50
top_n_products = set(
    target.groupby('product_id')['total_reordered'].sum().sort_values(ascending=False)[:n].index
)

target = target.set_index(['user_id', 'product_id']).squeeze()

In [5]:
# total_buy_n5_cols = pd.concat([
#     total_buy_n5(0),
#     total_buy_n5(1),
#     total_buy_n5(2),
#     total_buy_n5(-1)
# ], axis=1).reset_index()


user_product = (
    transactions
    .groupby(['user_id', 'product_id'])
    .agg(
        {
            'order_number': 'max',

        }
    )
    .sort_values(
        ['order_number']
    )
    .rename(
        columns={
            'order_number': 'up_order_num',

        }
    )
    .reset_index()
    .astype({
        'user_id': TRANSACTIONS_DTYPES['user_id'],
        'product_id': TRANSACTIONS_DTYPES['product_id'],
        'up_order_num': np.int8,

    })
)

user_product_days_since_prior_order_max = (
    transactions[
        transactions['order_number'] == transactions['user_id'].map(max_orders)
    ]
    .groupby(['user_id', 'product_id'])
    ['days_since_prior_order']
    .max()
    .reset_index()
    .rename(columns={'days_since_prior_order': 'user_product_days_since_prior_order_max'})
)

user_product_max_orders = (
    transactions
    .groupby(['user_id', 'product_id'])
    ['order_id']
    .size()
    .reset_index()
    .rename(columns={'order_id': 'user_item_order_number'})
)




user_product = pd.merge(
    user_product,
    user_product_days_since_prior_order_max,
    on=['user_id', 'product_id']
)
user_product = pd.merge(
    user_product,
    user_product_max_orders,
    on=['user_id', 'product_id']
)

In [6]:
user_product

Unnamed: 0,user_id,product_id,up_order_num,user_product_days_since_prior_order_max,user_item_order_number
0,137129,39618,9,7.0,2
1,182119,30489,9,30.0,3
2,181986,5322,9,27.0,1
3,182319,28553,9,1.0,1
4,182319,24375,9,1.0,1
...,...,...,...,...,...
1079136,171236,22270,99,2.0,2
1079137,171236,21903,99,2.0,5
1079138,82545,6343,99,1.0,1
1079139,171236,21137,99,2.0,33


In [7]:
user_features = (
    transactions
    .groupby(by='user_id')
    ['order_number']
    .max()
    .to_frame('max_orders')
    .reset_index()
    .astype({
        'user_id': TRANSACTIONS_DTYPES['user_id'],
        'max_orders': np.int8
    })
)

In [8]:
prod_reorder_mean = (
    transactions
    .groupby(by='product_id')
    ['reordered'].mean()
    .to_frame('prod_reorder_mean')
    .reset_index()
)


products_dt_features = (
    transactions
    .groupby('product_id').agg({
        'order_dow': mean_dow,
        'order_hour_of_day': mean_hod,
        'days_since_prior_order': np.mean
    })
    .rename(
        columns={
            'order_dow': 'mean_order_dow',
            'order_hour_of_day': 'mean_order_hour_of_day',
            'days_since_prior_order': 'mean_days_since_prior_order'
        }
    )
    .reset_index()
    .astype({
        'mean_order_dow': np.float32,
        'mean_order_hour_of_day': np.float32,
        'mean_days_since_prior_order': np.float32
    })
)

products_features = pd.merge(
        products_dt_features,
        prod_reorder_mean,
        on='product_id'
)

products_features = pd.merge(
    products_features,
    products.set_index('product_id')[['aisle_id', 'department_id']],
    on='product_id'
)

del products_dt_features
del prod_reorder_mean

In [11]:
model = cb.CatBoostRegressor(iterations=300, max_depth=2)

In [12]:
model.fit(
    (
        pd.merge(
            pd.merge(
                user_product,
                user_features,
                on='user_id'
            ),
            products_features,
            on='product_id'
        )
        .set_index(['user_id', 'product_id'])
        .reindex(target.index)
    ),
    target
)

Learning rate set to 0.462875
0:	learn: 0.1070929	total: 575ms	remaining: 2m 51s
1:	learn: 0.1000856	total: 1.14s	remaining: 2m 49s
2:	learn: 0.0967277	total: 1.61s	remaining: 2m 39s
3:	learn: 0.0944335	total: 2.1s	remaining: 2m 35s
4:	learn: 0.0934036	total: 2.56s	remaining: 2m 30s
5:	learn: 0.0925245	total: 3.04s	remaining: 2m 28s
6:	learn: 0.0921207	total: 3.51s	remaining: 2m 27s
7:	learn: 0.0918833	total: 3.96s	remaining: 2m 24s
8:	learn: 0.0912830	total: 4.44s	remaining: 2m 23s
9:	learn: 0.0910748	total: 4.97s	remaining: 2m 24s
10:	learn: 0.0909921	total: 5.46s	remaining: 2m 23s
11:	learn: 0.0907110	total: 5.95s	remaining: 2m 22s
12:	learn: 0.0906178	total: 6.41s	remaining: 2m 21s
13:	learn: 0.0905855	total: 6.87s	remaining: 2m 20s
14:	learn: 0.0903017	total: 7.35s	remaining: 2m 19s
15:	learn: 0.0902169	total: 7.81s	remaining: 2m 18s
16:	learn: 0.0901884	total: 8.35s	remaining: 2m 19s
17:	learn: 0.0900817	total: 8.84s	remaining: 2m 18s
18:	learn: 0.0899978	total: 9.37s	remaining: 

157:	learn: 0.0885288	total: 1m 22s	remaining: 1m 13s
158:	learn: 0.0885283	total: 1m 22s	remaining: 1m 13s
159:	learn: 0.0885277	total: 1m 23s	remaining: 1m 12s
160:	learn: 0.0885271	total: 1m 23s	remaining: 1m 12s
161:	learn: 0.0885265	total: 1m 23s	remaining: 1m 11s
162:	learn: 0.0885260	total: 1m 24s	remaining: 1m 11s
163:	learn: 0.0885256	total: 1m 25s	remaining: 1m 10s
164:	learn: 0.0885251	total: 1m 26s	remaining: 1m 10s
165:	learn: 0.0885245	total: 1m 26s	remaining: 1m 10s
166:	learn: 0.0885235	total: 1m 27s	remaining: 1m 9s
167:	learn: 0.0885231	total: 1m 28s	remaining: 1m 9s
168:	learn: 0.0885223	total: 1m 28s	remaining: 1m 8s
169:	learn: 0.0885214	total: 1m 29s	remaining: 1m 8s
170:	learn: 0.0885202	total: 1m 30s	remaining: 1m 7s
171:	learn: 0.0885200	total: 1m 30s	remaining: 1m 7s
172:	learn: 0.0885194	total: 1m 31s	remaining: 1m 7s
173:	learn: 0.0885187	total: 1m 31s	remaining: 1m 6s
174:	learn: 0.0885179	total: 1m 32s	remaining: 1m 6s
175:	learn: 0.0885175	total: 1m 33s	r

<catboost.core.CatBoostRegressor at 0x7ff5776c74c0>

In [13]:
top_n_products_features = products_features[
    products_features['product_id'].isin(top_n_products)
]

In [14]:
user_product_top_n = pd.concat(
    (
        pd.concat(
            (
                user_df,
                pd.DataFrame(
                    {
                        'user_id': user_id,
                        'product_id': list(top_n_products.difference(user_df['product_id'])),
                        'up_order_num': user_df['up_order_num'].max() + 1,
                        'user_product_days_since_prior_order_max': user_df['user_product_days_since_prior_order_max'].mean(),
                        'user_item_order_number': user_df['user_item_order_number'].mean()
                    }
                )
            )
        )
        for user_id, user_df
        in user_product.groupby('user_id')
    )
)

In [15]:
pred_index = user_product_top_n.set_index(['user_id', 'product_id']).index

In [16]:
pred = model.predict(
    (
        pd.merge(
            pd.merge(
                user_product_top_n,
                user_features.assign(max_orders = user_features['max_orders']+1),
                on='user_id'
            ),
            top_n_products_features,
            on='product_id'
        )
        .set_index(['user_id', 'product_id'])
        .reindex(pred_index)
    )
)

In [15]:
user_id = 1

In [18]:
USER_ID_COL

'user_id'

In [None]:
features.use

In [21]:
features.user_product_features[
    features.user_product_features[USER_ID_COL].eq(user_id)
]


Unnamed: 0,user_id,product_id,up_order_num,user_product_days_since_prior_order_max,user_item_order_number
0,1,196,10,30.0,10
1,1,10258,10,30.0,9
2,1,10326,5,28.0,1
3,1,12427,10,30.0,10
4,1,13032,10,30.0,3
5,1,13176,5,28.0,2
6,1,14084,1,,1
7,1,17122,5,28.0,1
8,1,25133,10,30.0,8
9,1,26088,2,15.0,2


In [None]:
USER_ID_COL

In [17]:
pred = model.predict(
        pd.merge(
            pd.merge(
                user_product,
                user_features,
                on='user_id'
            ),
            products_features,
            on='product_id'
        )
        .set_index(['user_id', 'product_id'])
        .reindex(target.index)
)

In [18]:
pred = pd.Series(
    pred,
    index=target.index,
    name='prediction'
).reset_index()
pred = pred.sort_values(by=['user_id', 'prediction'], ascending=[True, False])

Фигня

In [19]:
x = pred.groupby('user_id').head(10)

In [25]:
size = x.groupby('user_id').size()

In [29]:
p = pd.concat(
    (
        pd.concat(
            (
                user_df,
                pd.DataFrame(
                    {
                        'user_id': user_id,
                        'product_id': list(top_n_products)
                    }
                )
            )
        )
        for user_id, user_df in x[x['user_id'].isin(size[size != 10].index)].groupby('user_id')
    )
)#.groupby('user_id').head(10)

In [36]:
pred = pd.concat(
    (
        x[~x['user_id'].isin(size[size != 10].index)],
        p.groupby('user_id').head(10)
    )
).sort_values(by=['user_id', 'prediction'], ascending=[True, False])

In [37]:
(
    pred
    .astype({'product_id': str})
    .groupby('user_id')
    ['product_id']
    .unique()
    .apply(' '.join)
    .reset_index()
    .to_csv('submission.csv.zip', index=False)
    #.apply(lambda x: ' '.join(s for s in x))
)

In [38]:
pred

Unnamed: 0,user_id,product_id,prediction
0,1,196,0.888258
3,1,12427,0.888258
1,1,10258,0.774044
8,1,25133,0.708016
4,1,13032,0.185702
...,...,...,...
9458998,206209,890,0.046649
9458999,206209,1979,0.046649
9459000,206209,2280,0.046649
9459001,206209,2295,0.046649


In [None]:
{
    'id1': {
        'first_name': 'Olga',
        'second_name': 'Ivanova',
        'age': 25
        
    },
    'id2': {
        'first_name': 'Igor',
        'second_name': 'Mityaev',
        'age': 27        
    }
}

In [29]:
pred = pd.Series(
    pred,
    index=pred_index,
    name='prediction'
).reset_index()
pred = pred.sort_values(by=['user_id', 'prediction'], ascending=[True, False])

ValueError: Length of values (9459065) does not match length of index (5897867)

In [23]:
result = pred.groupby('user_id').head(10)

In [26]:
(
    result
    .astype({'product_id': str})
    .groupby('user_id')
    ['product_id']
    .unique()
    .apply(' '.join)
    .reset_index()
    .to_csv('submission.csv.zip', index=False)
    #.apply(lambda x: ' '.join(s for s in x))
)

## Упаковка в pickle

In [72]:
with open('order_products.pickle', 'wb') as file:
    pickle.dump(
        transactions[[
            'order_id',
            'product_id',
            'add_to_cart_order',
            'reordered'
        ]].drop_duplicates(),
        file
    )


In [63]:
with open('orders.pickle', 'wb') as file:
    pickle.dump(
        transactions[[
            'order_id',
            'user_id',
            'order_number',
            'order_dow',
            'order_hour_of_day',
            'days_since_prior_order'
        ]].drop_duplicates(),
        file
    )


In [65]:
with open('product_aisle.pickle', 'wb') as file:
    pickle.dump(
        products[['product_name', 'aisle_id']],
        file
    )

In [66]:
with open('aisle_department.pickle', 'wb') as file:
    pickle.dump(
        (
            products[['aisle_id', 'department_id']]
            .drop_duplicates()
            .sort_values(by=['department_id', 'aisle_id'])
        ),
        file
    )

In [67]:
with open('aisle_name.pickle', 'wb') as file:
    pickle.dump(
        (
            products[['aisle_id', 'aisle']]
            .drop_duplicates()
            .sort_values(by='aisle_id')
            .set_index('aisle_id')
        ),
        file
    )

In [68]:
with open('department_name.pickle', 'wb') as file:
    pickle.dump(
        (
            products[['department_id', 'department']]
            .drop_duplicates()
            .sort_values(by='department_id')
            .set_index('department_id')
        ),
        file
    )

## Test Users

In [None]:
test_users = np.random.choice(transactions['user_id'].unique(), replace=False, size=10000)
with open('test_users.txt', 'w') as file:
    file.write("/n".join(test_users.astype(str)))
    
train_users = np.array(list(set(transactions['user_id'].unique()) - set(test_users)))

## Development

In [126]:
class Transactions:
    
    def __init__(
        self,
        data,
        user_id=USER_ID_COL,
        order_hod=ORDER_HOD_COL,
        order_dow=ORDER_DOW_COL
    ):
        self.data = data
        self.user_id = user_id
        self.order_hod = order_hod
        self.order_dow = order_dow
        
    def __get_average_order_hod(self):
        return self.data.groupby(self.user_id)[self.order_hod].mean()
    
    def __get_average_order_dow(self):
        return self.data.groupby(self.user_id)[self.order_dow].mean()
    
    def prepare_features(self):
        features = [
            self.__get_average_order_hod(),
            self.__get_average_order_dow()
        ]
        
        return pd.concat(features, axis=1)

In [127]:
tr_features = Transactions(transactions)

In [128]:
tr_features.prepare_features()

Unnamed: 0_level_0,order_hour_of_day,order_dow
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.542373,2.644068
2,10.441026,2.005128
3,16.451220,1.012195
7,13.631068,1.728155
13,15.135802,3.098765
...,...,...
206202,13.715847,2.366120
206206,16.836879,2.336879
206207,13.509524,2.952381
206208,13.762121,2.806061


In [137]:
prod_reorder_mean = (
    transactions
    .groupby(by='product_id')
    ['reordered']
    .mean()
    #.to_frame('prod_reorder_mean')
    #.reset_index()
)
prod_reorder_mean

product_id
1        0.678502
2        0.137500
3        0.789256
4        0.488372
5        0.727273
           ...   
49684    0.200000
49685    0.111111
49686    0.729730
49687    0.444444
49688    0.236364
Name: reordered, Length: 49465, dtype: float64

In [138]:
up_orders_num = (
    transactions
    .groupby(['user_id', 'product_id'])
    ['order_id']
    .count()
    .to_frame('up_orders_num')
    .reset_index()
)
up_orders_num

Unnamed: 0,user_id,product_id,up_orders_num
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3
...,...,...,...
9459060,206209,43961,3
9459061,206209,44325,1
9459062,206209,48370,1
9459063,206209,48697,1


In [139]:
max_orders = (
    transactions
    .groupby(by='user_id')
    ['order_number']
    .max()
    .to_frame('max_orders')
    .reset_index()
)
max_orders

Unnamed: 0,user_id,max_orders
0,1,10
1,2,14
2,3,11
3,7,20
4,13,12
...,...,...
99995,206202,21
99996,206206,66
99997,206207,15
99998,206208,48


In [None]:
data = pd.merge(user_products , user , on="user_id" , how="left")
data = pd.merge(data , products , on="product_id" , how="left")


In [144]:
data = up_orders_num.merge(max_orders, on='user_id' , how='left')

In [145]:
data = data.merge(prod_reorder_mean, on='product_id' , how='left')


In [146]:
data

Unnamed: 0,user_id,product_id,up_orders_num,max_orders,reordered
0,1,196,10,10,0.829210
1,1,10258,9,10,0.757850
2,1,10326,1,10,0.705134
3,1,12427,10,10,0.781142
4,1,13032,3,10,0.713961
...,...,...,...,...,...
9459060,206209,43961,3,13,0.682603
9459061,206209,44325,1,13,0.467761
9459062,206209,48370,1,13,0.753690
9459063,206209,48697,1,13,0.403807


In [129]:
prod_reorder_mean = 
prior_orders.groupby(by="product_id)["reordered"].mean().to_frame("prod_reorder_mean").reset_index()
prod_reorder_mean

Unnamed: 0,product_id,product_name,aisle_id,department_id,aisle,department
0,1,Chocolate Sandwich Cookies,61,19,cookies cakes,snacks
1,2,All-Seasons Salt,104,13,spices seasonings,pantry
2,3,Robust Golden Unsweetened Oolong Tea,94,7,tea,beverages
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,frozen meals,frozen
4,5,Green Chile Anytime Sauce,5,13,marinades meat preparation,pantry
...,...,...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,spirits,alcohol
49684,49685,En Croute Roast Hazelnut Cranberry,42,1,frozen vegan vegetarian,frozen
49685,49686,Artisan Baguette,112,3,bread,bakery
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,cat food care,pets


In [None]:
'UserProductNumberOfPurchases',
'UserTransactionsCount',
'ProductTransactionsCount',
'ProductOrdersCount',
'UserProductOrderHourOfDayMean',
'ProductOrderHourOfDayMean',
'UserProductOrderDowMean',
'ProductOrderDowMean',
'UserProductDaysSincePriorOrderMean',
'ProductDaysSincePriorOrderMean',
'UserProductReorderedCount',
'ProductReorderedCount',
'UserProductOrderNumberMean',
'ProductOrderNumberMean'

In [18]:
f = Features(transactions)
X = f.prepare_features()

In [118]:
transactions.groupby('user_id')['order_number'].max()

user_id
1         10
2         14
3         11
7         20
13        12
          ..
206202    21
206206    66
206207    15
206208    48
206209    13
Name: order_number, Length: 100000, dtype: int8

In [None]:
        last_order_ids = (
            self.df
                .drop_duplicates('order_id')
                .groupby('user_id')
                .tail(max_orders)
                .order_id
        )

## lightFM

In [66]:
transactions['order'] = 1
transactions['order'] = (
    transactions['order']
    /
    transactions['user_id'].map(
        transactions.groupby('user_id')['order_number'].max()
    )
).astype('float16')
vc = transactions['product_id'].value_counts()

In [73]:
vc[vc > 5000]

24852    391170
13176    321553
21137    226279
21903    199961
47209    183702
          ...  
5031       5051
19003      5019
8239       5016
5491       5014
27307      5014
Name: product_id, Length: 884, dtype: int64

In [74]:
(100000*884*16) / (1024*1024*1024)

1.3172626495361328

In [107]:
user_item = transactions[transactions['product_id'].isin(vc[vc > 4500].index)].pivot_table(
    index='user_id',
    columns='product_id',
    values='order',
    aggfunc='sum'
).astype('float32')

In [108]:
user_item.shape

(99832, 983)

In [20]:
from lightfm import LightFM

model = LightFM(no_components=30)

In [96]:
from scipy.sparse import csr_array

In [133]:
user_item

product_id,34,45,196,248,260,311,329,365,432,581,...,49191,49215,49235,49247,49383,49478,49520,49533,49610,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,0.999756,,,,,,,,...,,,0.199951,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,0.090881,,,,,,,...,,,,,,,,,,0.090881
7,,,,,,,,,,,...,,,0.049988,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206202,,,,,0.047607,,,,0.380859,,...,0.047607,,,,,,,,,0.285645
206206,,,,,,,,,,,...,,,0.030304,,,,,,,
206207,,,,,,,,,,,...,,,,,,,,,,
206208,,,,,,0.041656,,0.145798,,,...,,,,0.020828,,,,,,


In [99]:
model.fit(
    csr_array(user_item.fillna(0)), epochs=20, num_threads=3, verbose=True
)

Epoch: 100%|████████████████████████████████████| 20/20 [00:53<00:00,  2.67s/it]


<lightfm.lightfm.LightFM at 0x7f3da2680fa0>

In [132]:
 model.predict(1, [1, 2])

array([6.477238, 6.694567], dtype=float32)

In [113]:
p = model.predict(user_ids=1, item_ids=list(user_item.columns))

ValueError: The item feature matrix specifies more features than there are estimated feature embeddings: 884 vs 49684.

[34,
 45,
 196,
 248,
 260,
 311,
 329,
 365,
 432,
 581,
 651,
 781,
 890,
 1025,
 1090,
 1158,
 1194,
 1215,
 1244,
 1408,
 1463,
 1468,
 1511,
 1529,
 1559,
 1695,
 1700,
 1940,
 1999,
 2078,
 2086,
 2091,
 2120,
 2180,
 2228,
 2295,
 2314,
 2326,
 2450,
 2452,
 2480,
 2611,
 2716,
 2748,
 2825,
 2846,
 2855,
 2962,
 2966,
 3020,
 3142,
 3298,
 3339,
 3376,
 3464,
 3481,
 3583,
 3599,
 3849,
 3896,
 3952,
 3957,
 3990,
 3999,
 4006,
 4086,
 4137,
 4138,
 4149,
 4210,
 4367,
 4421,
 4461,
 4472,
 4562,
 4605,
 4656,
 4724,
 4793,
 4799,
 4920,
 4942,
 4945,
 4957,
 4962,
 5025,
 5031,
 5068,
 5077,
 5134,
 5161,
 5194,
 5212,
 5240,
 5250,
 5258,
 5322,
 5337,
 5373,
 5449,
 5450,
 5456,
 5460,
 5479,
 5491,
 5550,
 5612,
 5646,
 5652,
 5769,
 5782,
 5785,
 5818,
 5876,
 5959,
 5991,
 6046,
 6104,
 6184,
 6187,
 6287,
 6347,
 6348,
 6489,
 6532,
 6615,
 6631,
 6656,
 6740,
 6873,
 6948,
 7021,
 7054,
 7131,
 7156,
 7175,
 7485,
 7500,
 7503,
 7521,
 7559,
 7628,
 7644,
 7751,
 7781,


In [101]:
transactions['user_id'].unique()

array([     1,      2,      3, ..., 206207, 206208, 206209], dtype=int32)

In [105]:
len(vc[vc > 5000].index)

884

In [103]:
list(vc[vc > 5000].index)

[24852,
 13176,
 21137,
 21903,
 47209,
 47766,
 47626,
 27845,
 27966,
 26209,
 16797,
 22935,
 24964,
 45007,
 39275,
 49683,
 5876,
 28204,
 8277,
 40706,
 4920,
 30391,
 45066,
 49235,
 19057,
 42265,
 37646,
 44632,
 17794,
 21616,
 27104,
 27086,
 30489,
 44359,
 31717,
 4605,
 28985,
 8518,
 46979,
 41950,
 22035,
 26604,
 34126,
 39877,
 5077,
 35951,
 10749,
 19660,
 43961,
 43352,
 21938,
 24184,
 9076,
 48679,
 34969,
 46667,
 39928,
 22825,
 5785,
 24838,
 31506,
 12341,
 25890,
 5450,
 35221,
 33731,
 8174,
 27521,
 28842,
 33198,
 27344,
 11520,
 20114,
 43122,
 44142,
 8424,
 28199,
 46906,
 3957,
 18465,
 9839,
 27156,
 15290,
 29487,
 38689,
 4799,
 41787,
 16759,
 9387,
 23909,
 4210,
 42736,
 7781,
 41220,
 196,
 34358,
 47144,
 19678,
 30233,
 20995,
 21709,
 33000,
 34243,
 42828,
 40604,
 5479,
 24489,
 37687,
 6184,
 33754,
 17948,
 432,
 1463,
 26369,
 42768,
 14947,
 16185,
 19348,
 42585,
 22963,
 8021,
 14992,
 8193,
 36011,
 21405,
 39475,
 25659,
 42701,
 2

In [123]:
user_product = transactions.groupby(['user_id', 'product_id'])['order_id'].count()

In [125]:
user_product.sort_values()

user_id  product_id
166584   36420          1
154723   37065          1
82443    9358           1
154723   37417          1
82443    5068           1
                       ..
103593   28204         98
17997    4210          98
41356    14366         99
         38652         99
         6583          99
Name: order_id, Length: 9459065, dtype: int64