## Simple Recommendation System
### Import the libraries

In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

### Read the excel file

In [2]:
df = pd.read_excel('Online Retail.xlsx')
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


### Check for Missing values

In [6]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

### Drop the rows with missing values

In [7]:
df.dropna(inplace=True)

In [8]:
df['StockCode']= df['StockCode'].astype(str)

In [9]:
customers = df["CustomerID"].unique().tolist()
len(customers)

4372

In [10]:
# shuffle customer ID's
random.shuffle(customers)

# extract 90% of customer ID's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

# split data into train and validation set
train_df = df[df['CustomerID'].isin(customers_train)]
validation_df = df[~df['CustomerID'].isin(customers_train)]

### Purchase history of train and validation data

In [11]:
# list to capture purchase history of the customers
purchases_train = []

# populate the list with the product codes
for i in tqdm(customers_train):
    temp = train_df[train_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_train.append(temp)

100%|██████████| 3935/3935 [00:05<00:00, 766.94it/s] 


In [12]:
# list to capture purchase history of the customers
purchases_val = []

# populate the list with the product codes
for i in tqdm(validation_df['CustomerID'].unique()):
    temp = validation_df[validation_df["CustomerID"] == i]["StockCode"].tolist()
    purchases_val.append(temp)

100%|██████████| 437/437 [00:00<00:00, 1104.25it/s]


### Train the word2vec model

In [13]:
# train word2vec model
model = Word2Vec(window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)

model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(3620663, 3656120)

In [14]:
model.init_sims(replace=True)

In [15]:
print(model)

Word2Vec(vocab=3171, size=100, alpha=0.03)


In [16]:
# extract all vectors
X = model[model.wv.vocab]

X.shape

(3171, 100)

### Visualizing the word vectors

In [18]:
! pip install umap

Collecting umap
  Downloading https://files.pythonhosted.org/packages/4b/46/08ab68936625400fe690684428d4db4764f49b406782cc133df1d0299d06/umap-0.1.1.tar.gz
Building wheels for collected packages: umap
  Building wheel for umap (setup.py) ... [?25ldone
[?25h  Stored in directory: /Users/subashgandyer/Library/Caches/pip/wheels/7b/29/33/b4d917dc95f69c0a060e2ab012d95e15db9ed4cc0b94ccac26
Successfully built umap
Installing collected packages: umap
Successfully installed umap-0.1.1


In [19]:
import umap

cluster_embedding = umap.UMAP(n_neighbors=30, min_dist=0.0,
                              n_components=2, random_state=42).fit_transform(X)

plt.figure(figsize=(10,9))
plt.scatter(cluster_embedding[:, 0], cluster_embedding[:, 1], s=3, cmap='Spectral')

AttributeError: module 'umap' has no attribute 'UMAP'

In [20]:
products = train_df[["StockCode", "Description"]]

# remove duplicates
products.drop_duplicates(inplace=True, subset='StockCode', keep="last")

# create product-ID and product-description dictionary
products_dict = products.groupby('StockCode')['Description'].apply(list).to_dict()

In [21]:
# test the dictionary
products_dict['84029E']

['RED WOOLLY HOTTIE WHITE HEART.']

In [22]:
def similar_products(v, n = 6):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for j in ms:
        pair = (products_dict[j[0]][0], j[1])
        new_ms.append(pair)
        
    return new_ms

In [24]:
similar_products(model['84029E'])

[('KNITTED UNION FLAG HOT WATER BOTTLE', 0.8029211759567261),
 ('WHITE SKULL HOT WATER BOTTLE ', 0.7461369037628174),
 ('RETROSPOT HEART HOT WATER BOTTLE', 0.7449855804443359),
 ('SCOTTIE DOG HOT WATER BOTTLE', 0.7287868857383728),
 ('CHOCOLATE HOT WATER BOTTLE', 0.7008374929428101),
 ('GREY HEART HOT WATER BOTTLE', 0.6905593276023865)]

In [25]:
def aggregate_vectors(products):
    product_vec = []
    for i in products:
        try:
            product_vec.append(model[i])
        except KeyError:
            continue
        
    return np.mean(product_vec, axis=0)

In [26]:
len(purchases_val[0])

240

In [27]:
aggregate_vectors(purchases_val[0]).shape

(100,)

In [28]:
similar_products(aggregate_vectors(purchases_val[0]))

[('ALARM CLOCK BAKELIKE RED ', 0.6587485074996948),
 ('SET/5 RED RETROSPOT LID GLASS BOWLS', 0.6519782543182373),
 ('LUNCH BAG RED RETROSPOT', 0.6518964171409607),
 ('JUMBO BAG RED RETROSPOT', 0.6513478755950928),
 ('LUNCH BAG CARS BLUE', 0.6384843587875366),
 ('SPACEBOY LUNCH BOX ', 0.6379221677780151)]

In [29]:
similar_products(aggregate_vectors(purchases_val[0][-10:]))

[('ROUND SNACK BOXES SET OF 4 FRUITS ', 0.7679789066314697),
 ('LUNCH BAG WOODLAND', 0.6775963306427002),
 ('SPACEBOY LUNCH BOX ', 0.6712849140167236),
 ('SET OF 3 BUTTERFLY COOKIE CUTTERS', 0.6544245481491089),
 ('PICNIC BOXES SET OF 3 RETROSPOT ', 0.652035653591156),
 ('CIRCUS PARADE LUNCH BOX ', 0.6471738219261169)]