In [1]:
# this notebook loads and alters E-Corp's 2-week dataset that was given to us, uses
# collaborative filtering to train an item-based rec engine, generates and saves the
# specified number of recommendations per customer into an Excel file

__author__ = 'Josh Bacher'
__email__ = 'bacherjp@gmail.com'
__website__ = 'https://github.com/jpbacher'


import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
import warnings
warnings.filterwarnings('ignore')


class Data:
    '''Load a transactions file and reshape it into one row per order.

    After preprocessing, `self.data` holds the order identifier column plus one
    column per distinct value of the one-hot encoded feature (default `l3`),
    summed over all rows of each order. Any other columns of the input that are
    not listed in `_remove_features` are summed as well.
    '''

    def __init__(self, file, basket_size):
        '''initiate Data class & retrieve orders that are greater than or equal to basket size

        file: path of the tab-separated transactions file (an already-loaded
            DataFrame is accepted as well)
        basket_size: minimum number of rows an order needs to be kept
        '''
        self.basket_size = basket_size
        self.data = self._get_data(file)

    def _get_data(self, df, preprocess=True):
        '''retrieve the file given & preprocess the dataset

        df: path of the file to read, or a DataFrame that is already loaded.
            The parameter keeps its historical name; it is this argument that
            is read, not a module-level `file` variable.
        preprocess: when False the raw frame is returned untouched
        '''
        # Use the argument itself: relying on a global called `file` only works
        # when the calling cell happens to define one.
        frame = df if isinstance(df, pd.DataFrame) else self._read_data(df)
        if not preprocess:
            return frame
        frame = self._discard_basket_size(frame, self.basket_size)
        frame = self._ohe(frame)
        frame = self._remove_features(frame)
        return self._combine_orders(frame)

    def _read_data(self, file):
        '''read the tab-separated file given'''
        return pd.read_csv(file, sep='\t')

    def _discard_basket_size(self, df,  basket_size, order_feat='order_number'):
        '''drop orders that have fewer rows than the basket size'''
        return df.groupby(order_feat).filter(lambda x: len(x) >= basket_size)

    def _ohe(self, df, feat='l3'):
        '''one-hot encode the lowest level feature & concatenate back to the original dataframe'''
        dummy = pd.get_dummies(df[feat])
        return pd.concat([df, dummy], axis=1)

    def _remove_features(self, df, features=('l1', 'l2', 'l3', 'sku', 'brand')):
        '''remove the listed original features; order_number (the user feature) is kept'''
        # list(): a tuple passed to drop() would be read as one (MultiIndex) label
        return df.drop(list(features), axis=1)

    def _combine_orders(self, df, order_feat='order_number'):
        '''combine each order into single instance & sum all features except order number'''
        return df.groupby(order_feat).sum().reset_index()


class Recommend():
    '''Item-based recommender built from an order x item matrix.

    On construction it computes an item-item similarity matrix, scores every
    item for every user (order) and stores the top `number_of_recs` items and
    their scores per user in `self.recs_df`.
    '''

    def __init__(self, data, user_feat, item_feats, number_of_recs, similarity_metric):
        '''initiate Recommend class

        data: DataFrame with one row per user and one column per item
        user_feat: name of the column identifying the user / order
        item_feats: list of item column names to build recommendations from
        number_of_recs: number of recommendations to produce per user
        similarity_metric: any metric name accepted by scipy's pdist
        '''
        self.data = data
        self.user_feat = user_feat
        self.item_feats = item_feats
        self.number_of_recs = number_of_recs
        self.similarity_metric = similarity_metric
        self.sim_matrix = self._generate_sim_matrix(self.similarity_metric)
        self.user_scores_df = self._retrieve_user_scores(self.sim_matrix)
        self.recs_df = self._fetch_recs(self.number_of_recs)

    def _generate_sim_matrix(self, similarity_metric):
        '''get an item x item similarity matrix for the stated metric'''
        # one row per item, so pdist compares items with each other
        item_vectors = self.data[self.item_feats].to_numpy().T
        # pdist returns condensed pairwise distances; similarity = 1 - distance
        similarities = 1 - pdist(item_vectors, metric=similarity_metric)
        # squareform expands to a square matrix and leaves the diagonal at 0,
        # so an item's similarity with itself is 0 in this matrix
        square = squareform(similarities)
        return pd.DataFrame(square, index=self.item_feats, columns=self.item_feats)

    def _retrieve_user_scores(self, sim_matrix):
        '''get the user/customer score for every item (users x items)'''
        # Restrict to the item columns so the product stays aligned with the
        # similarity matrix even when `data` carries extra columns.
        user_df = self.data.set_index(self.user_feat)[list(self.item_feats)]
        # get the dot product of the user dataframe and similarity matrix
        return user_df.dot(sim_matrix)

    def _fetch_recs(self, rec_number):
        '''retrieve recommendations for each customer

        Returns a DataFrame with the user column followed by 'Rec 1'..'Rec n'
        and 'Score 1'..'Score n'. If fewer than `rec_number` items exist, the
        remaining slots are left as NaN.
        '''
        users = self.data.loc[:, self.user_feat]
        rec_feats = ['Rec ' + str(i) for i in range(1, rec_number + 1)] + \
            ['Score ' + str(i) for i in range(1, rec_number + 1)]
        user_index = pd.Index(users, name=self.user_feat)

        # never ask for more items than there are score columns
        available = max(0, min(rec_number, self.user_scores_df.shape[1]))
        padding = [np.nan] * (max(rec_number, 0) - available)

        rows = []
        for customer in users:
            # sort only this customer's scores instead of re-sorting the
            # whole score frame once per customer
            ranked = self.user_scores_df.loc[customer, :].sort_values(ascending=False)
            best = ranked.iloc[:available]
            rows.append(best.index.tolist() + padding + best.tolist() + padding)

        if rows and rec_feats:
            recs_df = pd.DataFrame(rows, index=user_index, columns=rec_feats, dtype=object)
        else:
            recs_df = pd.DataFrame(index=user_index, columns=rec_feats)

        return recs_df.reset_index(drop=False)

    def output_recs(self, name='recommendations', file_type='excel',
                    output_dir='/mnt/c/projects/ecorp-recommendations/recs/'):
        '''save output to file

        name: file name without extension
        file_type: 'csv' or 'excel' (written as .xlsx)
        output_dir: directory to write into; defaults to the project's recs folder
        Raises ValueError for any other file_type.
        '''
        import os  # local import keeps this block self-contained

        if file_type == 'csv':
            self.recs_df.to_csv(os.path.join(output_dir, name + '.csv'), index=False)
        elif file_type == 'excel':
            self.recs_df.to_excel(os.path.join(output_dir, name + '.xlsx'), index=False)
        else:
            raise ValueError('Invalid file type; either "csv" or "excel"')

    def sample_print(self, number_samples):
        '''print a sample of the recommendations'''
        print(self.recs_df.head(number_samples))

In [2]:
# Run configuration. These are notebook-level globals; other cells may read
# them by name, so rename with care.
# Path of the tab-separated transactions export that Data reads.
file = "/mnt/c/projects/ecorp-recommendations/data/alltransactions.txt"
# Orders with fewer rows than this are discarded during preprocessing.
basket_size = 20
# Number of (recommendation, score) pairs produced per order.
number_of_recs = 3

# Load and preprocess: one row per order, with order_number plus the one-hot
# encoded `l3` categories (and any other input columns that are not dropped).
data = Data(file, basket_size)

# The order identifier plays the role of the user; every remaining column is
# treated as an item feature.
user_feat = 'order_number'
item_feats = list(data.data.columns)
item_feats.remove(user_feat)
    
# Build the recommender with Jaccard similarity, write the recommendations
# using output_recs' defaults (an .xlsx file named "recommendations"), and
# print the first 15 rows as a preview.
rec = Recommend(data.data, user_feat, item_feats, number_of_recs, similarity_metric='jaccard')
rec.output_recs()
rec.sample_print(15)

    order_number                             Rec 1  \
0          66334                         Ice Totes   
1          66361                  Artificial Trees   
2          66619                      Toilet Paper   
3          66768                    Safety Glasses   
4          66849                         Ear Plugs   
5          66883                      Toilet Paper   
6          66916                Locking Plier Sets   
7          67077       Disposable Towels and Wipes   
8          67226                      Toilet Paper   
9          67250            Cable Tray Accessories   
10         67254              Duct and Cloth Tapes   
11         67295                      Lock Washers   
12         67329         Attic Exhaust Ventilators   
13         67360              Staplers and Tackers   
14         67546  Long Nose and Needle Nose Pliers   

                               Rec 2                        Rec 3   Score 1  \
0           Grate Magnets & Housings      Indoor Furnitu