In [1]:
# This notebook loads and preprocesses the 2-week E-Corp dataset that was given to us, uses
# collaborative filtering to train an item-based recommendation engine, and generates and
# saves the recommendations to Excel.

__author__ = 'Josh Bacher'
__email__ = 'bacherjp@gmail.com'
__website__ = 'https://github.com/jpbacher'


import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
import warnings
warnings.filterwarnings('ignore')


class Data:
    '''Load a tab-separated transactions file and reshape it into an order-by-item table.

    After construction ``self.data`` holds one row per order: the order identifier
    column plus one column per distinct value of the one-hot encoded feature, each
    containing the sum of that indicator over the rows of the order.
    '''

    def __init__(self, file, basket_size):
        '''initiate Data class & retrieve orders that are greater than or equal to basket size

        file: path (or buffer) handed to ``pd.read_csv`` with a tab separator
        basket_size: minimum number of rows an order needs in order to be kept
        '''
        self.basket_size = basket_size
        self.data = self._get_data(file)

    def _get_data(self, file, preprocess=True):
        '''read ``file`` and, when ``preprocess`` is True, run the preprocessing steps in order

        returns the raw dataframe when ``preprocess`` is False
        '''
        # Read from the argument itself; relying on a global called ``file`` raises
        # NameError whenever the class is used outside the notebook session.
        df = self._read_data(file)
        if preprocess:
            df = self._discard_basket_size(df, self.basket_size)
            df = self._ohe(df)
            df = self._remove_features(df)
            df = self._combine_orders(df)
        return df

    def _read_data(self, file):
        '''read the tab-separated file into a dataframe'''
        return pd.read_csv(file, sep='\t')

    def _discard_basket_size(self, df, basket_size, order_feat='order_number'):
        '''drop orders less than the basket size'''
        # Number of rows in each row's order, aligned to df's index; a vectorised
        # replacement for calling a Python lambda once per order.
        rows_per_order = df.groupby(order_feat)[order_feat].transform('size')
        return df[rows_per_order >= basket_size]

    def _ohe(self, df, feat='l3'):
        '''one-hot encodes the lowest level feature & concatenates back to the original dataframe'''
        dummy = pd.get_dummies(df[feat])
        return pd.concat([df, dummy], axis=1)

    def _remove_features(self, df, features=('l1', 'l2', 'l3', 'sku', 'brand')):
        '''remove all original features except order_number (the user feature)

        ``features`` may be any iterable of column names; the default is a tuple so
        that no mutable object is shared between calls.
        '''
        return df.drop(list(features), axis=1)

    def _combine_orders(self, df, order_feat='order_number'):
        '''combine each order into single instance & sum all features except order number'''
        return df.groupby(order_feat).sum().reset_index()

In [7]:
class Recommend():
    '''Item-based collaborative filtering on an order-by-item table.

    On construction the class computes an item-to-item similarity matrix
    (``self.sim_matrix``) and a score for every item for every user
    (``self.user_scores``).
    '''

    def __init__(self, data, user_feat, item_feats, similarity_metric='jaccard'):
        '''initiate Recommend class

        data: dataframe holding the user column and one column per item
        user_feat: name of the column identifying the user (order)
        item_feats: list of item column names
        similarity_metric: distance metric name passed to ``pdist``; the similarity
            used is ``1 - distance``
        '''
        self.data = data
        self.user_feat = user_feat
        self.item_feats = item_feats
        self.similarity_metric = similarity_metric
        self.sim_matrix = self._generate_sim_matrix(self.similarity_metric)
        self.user_scores = self._retrieve_user_scores(self.sim_matrix)
        self.recs = []

    def _generate_sim_matrix(self, similarity_metric):
        '''build the item x item similarity dataframe, labelled by item name on both axes'''
        item_names = list(self.item_feats)
        # Transpose so each row is one item's vector across all users.
        item_vectors = self.data.loc[:, item_names].T.values
        # pdist returns a condensed 1-D vector with one distance per pair of items.
        similarities = 1 - pdist(item_vectors, metric=similarity_metric)
        # squareform expands it to a symmetric square matrix and leaves the diagonal
        # at zero, so an item adds nothing to its own score.
        square = squareform(similarities)
        return pd.DataFrame(square, index=item_names, columns=item_names)

    def _retrieve_user_scores(self, sim_matrix):
        '''score every item for every user as (user x item values) dot (item x item similarities)'''
        # Flat list of column names: the user column followed by every item column.
        feats = [self.user_feat] + list(self.item_feats)
        user_df = self.data.loc[:, feats].set_index(self.user_feat)
        return user_df.dot(sim_matrix)

In [2]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
file = "/mnt/c/projects/ecorp-recommendations/data/alltransactions.txt"
# Minimum number of rows an order must have to be kept by Data.
basket_size = 20

data = Data(file, basket_size)

# Every column other than the order identifier is treated as an item feature.
user_feat = 'order_number'
item_feats = list(data.data.columns)
item_feats.remove(user_feat)

# Recommend only accepts `similarity_metric`; the previous `cf_method=` / `similarity=`
# keywords are not part of its signature and raised TypeError.
rec = Recommend(data.data, user_feat, item_feats, similarity_metric='jaccard')

In [4]:
# Preview the first rows of the item columns only (the order identifier is excluded).
data.data[item_feats].head()

Unnamed: 0,12 Volt Accessories,12-Point Flange Head Cap Screws,3-Ring Binder Accessories,3-Ring Binders,3.3 Inch Diameter Motors,4.4 Inch Diameter Motors,5 X 20mm Glass and Ceramic Fuses,5S Red Tag Stations,A/C Mounting Pads,A/C Refrigeration Accessories,...,Worker Emergency Identification,Worm Gear Clamps,Wrap-a-Round Tape Measures,Wrist Rests and Palm Supports,Wrist Supports and Wraps,Y Strainers,Yard Hydrants,Zone Valve Actuators,Zone Valves,pH Meters
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# (rows, columns) of the preprocessed frame; the columns are order_number plus the item features.
data.data.shape

(7453, 3790)

In [11]:
# Flip the item columns so that each row is an item and each column is an order.
dt = data.data.loc[:, item_feats].transpose()
dt.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7443,7444,7445,7446,7447,7448,7449,7450,7451,7452
12 Volt Accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12-Point Flange Head Cap Screws,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3-Ring Binder Accessories,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3-Ring Binders,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3.3 Inch Diameter Motors,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# pdist compares every pair of item rows and returns the distances as a condensed 1-D
# vector; subtracting from 1 turns each Jaccard distance into a similarity.
sim_matrix_t = 1 - pdist(data.data[item_feats].T, metric='jaccard')

In [9]:
# Length of the condensed vector: one entry per unordered pair of items.
sim_matrix_t.shape

(7176366,)

In [12]:
# Expand the condensed vector into a symmetric square matrix (the diagonal is left at zero).
jac_sim_matrix = squareform(sim_matrix_t)

In [13]:
# One row and one column per item feature.
jac_sim_matrix.shape

(3789, 3789)

In [15]:
# Inspect the raw similarity array.
jac_sim_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Label both axes with the item names so similarities can be looked up by name.
jac_sim_df = pd.DataFrame(jac_sim_matrix, index=item_feats, columns=item_feats)

In [18]:
# Preview the first rows of the labelled similarity matrix.
jac_sim_df.head()

Unnamed: 0,12 Volt Accessories,12-Point Flange Head Cap Screws,3-Ring Binder Accessories,3-Ring Binders,3.3 Inch Diameter Motors,4.4 Inch Diameter Motors,5 X 20mm Glass and Ceramic Fuses,5S Red Tag Stations,A/C Mounting Pads,A/C Refrigeration Accessories,...,Worker Emergency Identification,Worm Gear Clamps,Wrap-a-Round Tape Measures,Wrist Rests and Palm Supports,Wrist Supports and Wraps,Y Strainers,Yard Hydrants,Zone Valve Actuators,Zone Valves,pH Meters
12 Volt Accessories,0.0,0.0,0.0,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12-Point Flange Head Cap Screws,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-Ring Binder Accessories,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-Ring Binders,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.3 Inch Diameter Motors,0.066667,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Move the order identifier into the index so that only item columns remain.
user_df = data.data.set_index(user_feat)

In [21]:
# Matrix product of (orders x items) with (items x items): for each order and each item,
# the order's item values weighted by their similarity to that item, summed.
user_scores = user_df.dot(jac_sim_df)

In [23]:
# Preview the per-order item scores.
user_scores.head()

Unnamed: 0_level_0,12 Volt Accessories,12-Point Flange Head Cap Screws,3-Ring Binder Accessories,3-Ring Binders,3.3 Inch Diameter Motors,4.4 Inch Diameter Motors,5 X 20mm Glass and Ceramic Fuses,5S Red Tag Stations,A/C Mounting Pads,A/C Refrigeration Accessories,...,Worker Emergency Identification,Worm Gear Clamps,Wrap-a-Round Tape Measures,Wrist Rests and Palm Supports,Wrist Supports and Wraps,Y Strainers,Yard Hydrants,Zone Valve Actuators,Zone Valves,pH Meters
order_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
66334,0.296831,0.0,0.013941,0.003305,0.025079,0.007692,0.148153,0.0,0.040447,0.089028,...,0.0,0.33433,0.072485,0.012658,0.078101,0.036061,0.201027,0.0,0.0,0.0
66361,1.694701,0.0,0.336942,0.015711,0.689775,0.034719,0.303933,0.0,38.748221,0.434013,...,0.006173,0.84629,0.194914,0.174049,0.104742,0.215847,1.435154,0.0,0.064432,0.096774
66619,0.020864,0.0,0.081766,0.02435,0.027946,0.003704,0.142401,0.0,0.01717,0.177677,...,0.0,0.61326,0.095208,0.022876,0.090109,0.048569,0.02793,0.0,0.002137,0.029314
66768,0.00232,0.0,0.00386,0.012406,0.002268,0.0,0.091533,0.0,0.0,0.09845,...,0.001566,0.33233,0.022059,0.0,0.062211,0.013679,0.0,0.001566,0.0,0.001564
66849,0.009313,0.0,0.0,0.006098,0.002268,0.0,0.083522,0.0,0.01054,0.080748,...,0.009826,0.254528,0.029285,0.0,0.092347,0.019546,0.00431,0.003653,0.0,0.02092


In [24]:
# Order identifiers; they become the row index of the recommendation table.
users = data.data[user_feat]

# Column labels: 'Rec 1'..'Rec 5' for item names, then 'Score 1'..'Score 5' for their scores.
ranks = range(1, 6)
feats = ['Rec {}'.format(k) for k in ranks] + ['Score {}'.format(k) for k in ranks]

# Empty table, one row per order, to be filled with each order's top-scoring items.
recs_df = pd.DataFrame(index=users, columns=feats)

In [28]:
# Preview the first rows of the recommendation table.
recs_df.head()

Unnamed: 0_level_0,Rec 1,Rec 2,Rec 3,Rec 4,Rec 5,Score 1,Score 2,Score 3,Score 4,Score 5
order_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
66334,Ice Totes,Grate Magnets & Housings,Indoor Furniture Covers,Curtain Walls,Solenoid Valve Manifolds,6.41002,6.41002,6.41002,6.41002,6.41002
66361,Artificial Trees,Borescope Accessories,Flag Poles,Side Mount Casters,Drill-Driven Pump Accessories,38.7482,38.7482,38.7482,38.7482,38.7482
66619,Toilet Paper,"Paper Towels, Rolls",Disposable Towels and Wipes,Degreasers,Wet Mops,3.15836,3.0085,2.35317,2.14513,2.07687
66768,Safety Glasses,Disposable Gloves,Leather Work Gloves,Ear Plugs,Disposable Respirators,1.49041,1.38176,1.33632,1.3356,1.04987
66849,Ear Plugs,Disposable Respirators,Safety Utility Knives,Disposable Gloves,Tape Measures,1.29818,1.1469,1.13463,1.13424,1.08323


In [29]:
# Build the table from `users` and `feats` each time this cell runs, so re-running it
# gives the same result instead of writing into an already reset-index frame.
n_recs = len(feats) // 2
rec_cols, score_cols = feats[:n_recs], feats[n_recs:]
recs_by_order = pd.DataFrame(index=users, columns=feats)

for customer in users[:5]:
    # Rank only this order's row of scores; sorting the whole user_scores frame by
    # column for every order repeats a large reshuffle just to read one row.
    top_scores = user_scores.loc[customer].sort_values(ascending=False).iloc[:n_recs]
    for rec_col, score_col, (item, score) in zip(rec_cols, score_cols, top_scores.items()):
        recs_by_order.loc[customer, rec_col] = item
        recs_by_order.loc[customer, score_col] = score

# Expose the order identifier as a regular column without mutating in place.
recs_df = recs_by_order.reset_index(drop=False)

In [30]:
# Display the recommendation table, with order_number as a regular column.
recs_df

Unnamed: 0,order_number,Rec 1,Rec 2,Rec 3,Rec 4,Rec 5,Score 1,Score 2,Score 3,Score 4,Score 5
0,66334,Ice Totes,Grate Magnets & Housings,Indoor Furniture Covers,Curtain Walls,Solenoid Valve Manifolds,6.41002,6.41002,6.41002,6.41002,6.41002
1,66361,Artificial Trees,Borescope Accessories,Flag Poles,Side Mount Casters,Drill-Driven Pump Accessories,38.7482,38.7482,38.7482,38.7482,38.7482
2,66619,Toilet Paper,"Paper Towels, Rolls",Disposable Towels and Wipes,Degreasers,Wet Mops,3.15836,3.0085,2.35317,2.14513,2.07687
3,66768,Safety Glasses,Disposable Gloves,Leather Work Gloves,Ear Plugs,Disposable Respirators,1.49041,1.38176,1.33632,1.3356,1.04987
4,66849,Ear Plugs,Disposable Respirators,Safety Utility Knives,Disposable Gloves,Tape Measures,1.29818,1.1469,1.13463,1.13424,1.08323
5,66883,Toilet Paper,"Paper Towels, Rolls",Trash Bags,Degreasers,Body and Hand Wipes,4.23266,3.86752,3.23542,3.13546,3.06865
6,66916,Locking Plier Sets,Punch and Chisel Sets,Pry Bar Sets,Screwdriver Sets,Plier Sets,1.42377,1.38152,1.33922,1.28472,1.28409
7,67077,Disposable Towels and Wipes,Abrasive Cut-Off and Chop Wheels,Disposable Respirators,Faceshield Replacement Visors,Duct and Cloth Tapes,1.40323,1.38365,1.37503,1.35894,1.34903
8,67226,Toilet Paper,"Paper Towels, Rolls",Tape Measures,Handheld Flashlights,Duct and Cloth Tapes,5.70835,5.44131,5.29775,5.12193,5.09441
9,67250,Cable Tray Accessories,Gas Infrared Tube Heaters,Flat File Cabinet Bases,Bin Shelving,Self-Priming Pumps,3.974,3.974,3.974,2.27875,2.22663
