In [1]:
# This notebook loads and manipulates the two-week E-Corp dataset that was given to us,
# uses collaborative filtering to train an item-based recommendation engine, and
# generates and saves the recommendations to Excel.
# (Each order number is treated as the user; the items in each order are used to
# score the similarity between items.)

__author__ = 'Josh Bacher'
__email__ = 'bacherjp@gmail.com'
__website__ = 'https://github.com/jpbacher'


import numpy as np
import pandas as pd
import tqdm


class Data:
    '''Load a tab-separated transactions file and reshape it to one row per order.

    Attributes:
        basket_size: minimum number of rows an order must have to be kept.
        data: the DataFrame produced by ``_get_data`` (preprocessed by default).
    '''

    def __init__(self, file, basket_size):
        '''initiate Data class & retrieve orders that are greater than or equal to basket size

        Args:
            file: path (or file-like object) handed to ``pandas.read_csv``.
            basket_size: minimum number of rows per order; smaller orders are dropped.
        '''
        self.basket_size = basket_size
        self.data = self._get_data(file)

    def _get_data(self, file, preprocess=True):
        '''read the file and, when ``preprocess`` is true, run the full preprocessing chain

        The parameter is the file to read; it is used directly rather than
        relying on a module-level ``file`` variable being defined.

        Returns:
            The raw frame when ``preprocess`` is false, otherwise the frame after
            basket-size filtering, one-hot encoding, feature removal and
            per-order aggregation (in that order).
        '''
        df = self._read_data(file)
        if preprocess:
            df = self._discard_basket_size(df, self.basket_size)
            df = self._ohe(df)
            df = self._remove_features(df)
            df = self._combine_orders(df)
        return df

    def _read_data(self, file):
        '''read the tab-separated file into a DataFrame'''
        return pd.read_csv(file, sep='\t')

    def _discard_basket_size(self, df, basket_size, order_feat='order_number'):
        '''drop orders less than the basket size'''
        return df.groupby(order_feat).filter(lambda x: len(x) >= basket_size)

    def _ohe(self, df, feat='l3'):
        '''one-hot encodes the lowest level feature & concatenates back to the original dataframe'''
        dummy = pd.get_dummies(df[feat])
        return pd.concat([df, dummy], axis=1)

    def _remove_features(self, df, features=('l1', 'l2', 'l3', 'sku', 'brand')):
        '''remove all original features except order_number (the user feature)

        ``features`` may be any iterable of column names; the default is an
        immutable tuple so it can never be modified between calls.
        '''
        return df.drop(list(features), axis=1)

    def _combine_orders(self, df, order_feat='order_number'):
        '''combine each order into single instance & sum all features except order number'''
        return df.groupby(order_feat).sum().reset_index()

In [1]:
class Recommend():
    '''Pairwise similarity model built from a table with one row per user.

    ``data`` must contain the ``user_feat`` column plus every column listed in
    ``item_feat``. With ``cf_method='item'`` the item columns are compared with
    each other; with ``cf_method='user'`` the user rows are compared instead.

    Attributes:
        sim_matrix: square, symmetric float DataFrame of similarities with a
            diagonal of 1.0, labelled by items or by user ids.
        user_scores: initialised to an empty list.
        recs: initialised to an empty list.
    '''

    # Metric names accepted by the ``similarity`` argument.
    _SIMILARITIES = ('pearson_r', 'cosine', 'jaccard')

    def __init__(self, data, user_feat, item_feat, cf_method='item', similarity='jaccard'):
        '''initiate Recommend class

        Args:
            data: DataFrame holding the user column and the item columns.
            user_feat: name of the user column.
            item_feat: list of item column names.
            cf_method: ``'item'`` or ``'user'``.
            similarity: ``'pearson_r'``, ``'cosine'`` or ``'jaccard'``.

        Raises:
            ValueError: if ``cf_method`` or ``similarity`` is not recognised.
        '''
        self.data = data
        self.user_feat = user_feat
        self.item_feat = item_feat
        self.cf_method = cf_method
        self.similarity = similarity
        self.sim_matrix = self._generate_sim_matrix(self.cf_method)
        self.user_scores = []
        self.recs = []

    def _generate_sim_matrix(self, cf_method):
        '''build the labelled frame for ``cf_method`` and fill it with similarities'''
        empty = self._get_empty_df(cf_method)
        return self._load_sim_matrix(empty, self.similarity, cf_method)

    def _get_empty_df(self, cf_method):
        '''return an empty square frame labelled by users or items

        Raises:
            ValueError: if ``cf_method`` is neither ``'user'`` nor ``'item'``.
        '''
        if cf_method == 'user':
            # Read the instance attribute, not a module-level variable.
            labels = self.data[self.user_feat]
        elif cf_method == 'item':
            labels = self.item_feat
        else:
            raise ValueError('Invalid method; pass "user" or "item"')

        return pd.DataFrame(index=labels, columns=labels)

    def _load_sim_matrix(self, sim_matrix, similarity, cf_method=None):
        '''compute every pairwise similarity and return it as a labelled DataFrame

        Args:
            sim_matrix: frame whose index/columns provide the labels of the result.
            similarity: metric name, see ``_SIMILARITIES``.
            cf_method: ``'user'`` compares rows, anything else compares item
                columns; defaults to ``self.cf_method``.

        Returns:
            A new float DataFrame with the labels of ``sim_matrix``.

        Raises:
            ValueError: if ``similarity`` is not a supported metric.
        '''
        if similarity not in self._SIMILARITIES:
            raise ValueError('Invalid similarity; pass "pearson_r", "cosine" or "jaccard"')
        if cf_method is None:
            cf_method = self.cf_method

        values = self.data[self.item_feat].to_numpy(dtype=float)
        # One row per entity being compared: users are rows already, items are columns.
        vectors = values if cf_method == 'user' else values.T
        n_entities = vectors.shape[0]

        # Import the callable lazily: the module-level ``import tqdm`` binds the
        # module (which is not callable), and a missing tqdm should only cost the
        # progress bar, not the computation.
        try:
            from tqdm import tqdm as progress_bar
        except ImportError:
            progress_bar = None
        progress = progress_bar(total=n_entities, mininterval=5) if progress_bar else None

        # Start from the identity so self-similarity is exactly 1.0, then fill
        # only the upper triangle and mirror it.
        scores = np.eye(n_entities, dtype=float)
        for i in range(n_entities):
            for j in range(i + 1, n_entities):
                score = self._retrieve_similarity(vectors[i], vectors[j], similarity)
                scores[i, j] = score
                scores[j, i] = score
            if progress is not None:
                progress.update()
        if progress is not None:
            progress.close()

        return pd.DataFrame(scores, index=sim_matrix.index, columns=sim_matrix.columns)

    def _retrieve_similarity(self, x, y, similarity):
        '''dispatch to the similarity function named by ``similarity``

        Raises:
            ValueError: if ``similarity`` is not a supported metric.
        '''
        if similarity == 'pearson_r':
            return self._pearson_sim(x, y)
        elif similarity == 'cosine':
            return self._cosine_sim(x, y)
        elif similarity == 'jaccard':
            return self._jaccard_sim(x, y)
        raise ValueError('Invalid similarity; pass "pearson_r", "cosine" or "jaccard"')

    def _pearson_sim(self, x, y):
        '''Pearson correlation of two equal-length vectors; 0.0 if either is constant'''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        # Population std (ddof=0) matches the population covariance in the numerator.
        denominator = x.std() * y.std()
        if denominator == 0:
            return 0.0
        return ((x * y).mean() - x.mean() * y.mean()) / denominator

    def _cosine_sim(self, x, y):
        '''cosine of the angle between two vectors; 0.0 if either has zero norm'''
        x = np.asarray(x, dtype=float)
        y = np.asarray(y, dtype=float)
        denominator = np.linalg.norm(x) * np.linalg.norm(y)
        if denominator == 0:
            return 0.0
        return np.dot(x, y) / denominator

    def _jaccard_sim(self, x, y):
        '''Jaccard index of the non-zero positions of two vectors; 0.0 if both are all zero'''
        in_x = np.asarray(x) != 0
        in_y = np.asarray(y) != 0
        union_card = np.count_nonzero(in_x | in_y)
        if union_card == 0:
            return 0.0
        return np.count_nonzero(in_x & in_y) / union_card

In [2]:
# Transactions file handed to Data, which reads it as tab-separated text.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file = "/mnt/c/projects/ecorp-recommendations/data/alltransactions.txt"
# Orders with fewer rows than this are dropped during preprocessing.
basket_size = 10

# Toggle for deriving the user/item column names from the processed frame below.
retrieve_user_item = True

# Reads the file and runs the preprocessing chain; the result is available as data.data.
data = Data(file, basket_size)

if retrieve_user_item:
    user_feat = 'order_number'
    # Every column of the processed frame except the order number is treated as an item.
    item_feat = list(data.data.columns)
    item_feat.remove(user_feat)