In [1]:
# This notebook loads and manipulates the 2-week E-Corp dataset that was given to us,
# uses collaborative filtering to train an item-based recommendation engine, and then
# generates the recommendations and saves them to Excel.
# (Each order number is treated as the "user": the items that appear together in an
# order are used to score the similarity between items.)

__author__ = 'Josh Bacher'
__email__ = 'bacherjp@gmail.com'
__website__ = 'https://github.com/jpbacher'


import numpy as np
import pandas as pd
import tqdm


class Data:
    '''Load a tab-separated transactions file and reshape it to one row per order.

    After construction, ``self.data`` holds the preprocessed dataframe: orders
    with fewer than ``basket_size`` rows are dropped, the ``l3`` column is
    one-hot encoded, the original descriptive columns are removed and the
    remaining columns are summed per ``order_number``.
    '''

    def __init__(self, file, basket_size):
        '''initiate Data class & retrieve orders that are greater than or equal to basket size

        Args:
            file: path (or file-like object) handed to ``pandas.read_csv``.
            basket_size: minimum number of rows an order must have to be kept.
        '''
        self.basket_size = basket_size
        self.data = self._get_data(file)

    def _get_data(self, file, preprocess=True) -> pd.DataFrame:
        '''read ``file`` and, when ``preprocess`` is true, run every preprocessing step

        Args:
            file: path (or file-like object) handed to ``_read_data``.
            preprocess: when False the raw dataframe is returned untouched.

        Returns:
            The raw or the preprocessed dataframe.
        '''
        # Read from the argument itself; the method must not depend on a
        # module-level ``file`` variable happening to exist.
        df = self._read_data(file)
        if preprocess:
            df = self._discard_basket_size(df, self.basket_size)
            df = self._ohe(df)
            df = self._remove_features(df)
            df = self._combine_orders(df)
        return df

    def _read_data(self, file) -> pd.DataFrame:
        '''read the tab-separated file into a dataframe'''
        return pd.read_csv(file, sep='\t')

    def _discard_basket_size(self, df, basket_size, order_feat='order_number') -> pd.DataFrame:
        '''drop orders less than the basket size

        An order's size is the number of rows that share its ``order_feat`` value.
        '''
        return df.groupby(order_feat).filter(lambda x: len(x) >= basket_size)

    def _ohe(self, df, feat='l3') -> pd.DataFrame:
        '''one-hot encodes the lowest level feature & concatenates back to the original dataframe'''
        dummy = pd.get_dummies(df[feat])
        return pd.concat([df, dummy], axis=1)

    def _remove_features(self, df, features=None) -> pd.DataFrame:
        '''remove all original features except order_number (the user feature)

        Args:
            df: dataframe to drop columns from.
            features: column label(s) to drop; defaults to
                ``['l1', 'l2', 'l3', 'sku', 'brand']`` when omitted.
        '''
        # ``None`` sentinel instead of a list literal in the signature, so no
        # mutable object is shared between calls.
        if features is None:
            features = ['l1', 'l2', 'l3', 'sku', 'brand']
        return df.drop(features, axis=1)

    def _combine_orders(self, df, order_feat='order_number') -> pd.DataFrame:
        '''combine each order into single instance & sum all features except order number'''
        return df.groupby(order_feat).sum().reset_index()

In [2]:
# Path of the transactions file that Data reads.
file = "/mnt/c/projects/ecorp-recommendations/data/alltransactions.txt"
# Orders with fewer rows than this are discarded by Data.
basket_size = 10

# Read the file and build the preprocessed per-order dataframe (data.data).
data = Data(file=file, basket_size=basket_size)