## Connect to google drive

In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


## Set up the working directory 

In [2]:
import os
os.chdir('/content/drive/MyDrive/PROJET_PERSO/Word2Vec_for_RecoSystems')

## Read & Process data

In [3]:
import pandas as pd
import numpy as np
import random

In [4]:
df = pd.read_excel('Online Retail.xlsx')

In [5]:
df.head(5)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [6]:
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [7]:
#CONVERT TYPES
df['InvoiceNo']= df['InvoiceNo'].astype(str)
df['StockCode']= df['StockCode'].astype(str)
df['Description']= df['Description'].astype(str)
df['Country']= df['Country'].astype(str)

In [8]:
# DATASET SHAPE
df.shape

(541909, 8)

In [9]:
#PRINT ALL ROWS WITH MISSING VALUES
df[df.isnull().any(axis=1)]

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,2010-12-01 11:52:00,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,2010-12-01 14:32:00,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,2010-12-01 14:32:00,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,2010-12-01 14:32:00,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,2010-12-01 14:32:00,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,2011-12-09 10:26:00,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,2011-12-09 10:26:00,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,2011-12-09 10:26:00,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,2011-12-09 10:26:00,10.79,,United Kingdom


In [10]:
#DELETE ALL MISSING VALUES' ROWS
df.dropna(inplace=True)

#SORT DATAFRAME BY INVOICE DATE
df.sort_values("InvoiceDate", inplace=True)

## Create sequences

In [11]:
all_customers = df['CustomerID'].unique().tolist() #retrieve unique customerID inside a list
print(f'There is {len(all_customers)} customers, so we can create {len(all_customers)} sequences.')

There is 4372 customers, so we can create 4372 sequences.


In [12]:
#We split our data into a training and a test set respictively 90% and 10% of our dataset.
#I usually use 80/20 split but as our model will only works for know product, we should

train_customers_id = random.sample(all_customers, k=int(len(all_customers)*0.9))
test_customers_id = list(set(all_customers) - set(train_customers_id))
print(f'Training set size : {len(train_customers_id)}')
print(f'Test set size : {len(test_customers_id)}')

Training set size : 3934
Test set size : 438


In [13]:
#Create a train/valid dataframe and train/valid orders sequences
#To do so, we copy rows from original dataframe where the customerID is inside the train/valid id list

train_df = df[df['CustomerID'].isin(train_customers_id)].copy()
train_seq = train_df.groupby("CustomerID")["StockCode"].apply(list).tolist()

valid_df = df[df['CustomerID'].isin(test_customers_id)].copy()
valid_seq = valid_df.groupby("CustomerID")["StockCode"].apply(list).tolist()

In [14]:
#A function to compute the average number of orders
def avg_nb_orders(L):
  sum = 0
  for l in L:
    sum += len(l)
  return sum/len(L)

print(f'The average numbers of orders beyong customers in training dataframe is around {int(avg_nb_orders(train_seq))}')

The average numbers of orders beyong customers in training dataframe is around 93


## Create product dictionnary

In [15]:
#Keys are productID and values are product description
all_products = df.groupby("StockCode")["Description"].unique().apply(lambda x: x[0]).to_dict()
print(f'There is {len(all_products)} products')

There is 3684 products


## Train the Word2Vec Model

In [16]:
from gensim.models import Word2Vec 
from gensim.models.callbacks import CallbackAny2Vec

In [17]:
class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 1
        self.previous_loss = 0.0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        current_loss = loss - self.previous_loss
        self.previous_loss = loss
        print('Loss after epoch {}: {}'.format(self.epoch, current_loss))
        self.epoch += 1

In [18]:
#Model implementation using skip-gram mode
model = Word2Vec(window=8, sg=1, hs=0, alpha=0.01, min_alpha=0.00001,
                 negative=10, seed=11)

model.build_vocab(train_seq, progress_per=500)

model.train(train_seq, total_examples=model.corpus_count, epochs=70,
            report_delay=1, compute_loss=True, callbacks=[callback()])

Loss after epoch 1: 3509557.0
Loss after epoch 2: 2521508.5
Loss after epoch 3: 2384193.5
Loss after epoch 4: 2321571.0
Loss after epoch 5: 2294568.0
Loss after epoch 6: 2160674.0
Loss after epoch 7: 2106950.0
Loss after epoch 8: 2038618.0
Loss after epoch 9: 2136386.0
Loss after epoch 10: 2028966.0
Loss after epoch 11: 1848842.0
Loss after epoch 12: 2040762.0
Loss after epoch 13: 2124436.0
Loss after epoch 14: 2127078.0
Loss after epoch 15: 2072222.0
Loss after epoch 16: 2067580.0
Loss after epoch 17: 2193312.0
Loss after epoch 18: 2041504.0
Loss after epoch 19: 2171128.0
Loss after epoch 20: 2020788.0
Loss after epoch 21: 2233252.0
Loss after epoch 22: 2097748.0
Loss after epoch 23: 2018780.0
Loss after epoch 24: 2188412.0
Loss after epoch 25: 2150832.0
Loss after epoch 26: 2172716.0
Loss after epoch 27: 2075176.0
Loss after epoch 28: 2212656.0
Loss after epoch 29: 2029440.0
Loss after epoch 30: 2259052.0
Loss after epoch 31: 1493844.0
Loss after epoch 32: 108200.0
Loss after epoch 3

(25608123, 25858560)

In [19]:
#Replace vectors by their L2-normalization
model.wv.init_sims(replace=True)

In [20]:
#Get words' vector shape. We have 3174 unique word, and each of them has a length equals to 100
model.wv[model.wv.vocab].shape

(3174, 100)

## Define our similarity measure

In [21]:
def cosine_distance(vec1, vec2):
  return np.dot(vec1, vec2)/(np.linalg.norm(vec1)*np.linalg.norm(vec2))

In [22]:
def compute_similarity(model, topn=5, input_key=None, product_vec=None):
  all_similarities = []
  if not input_key and product_vec is None : return 0

  elif product_vec is None:
    product_vec = model.wv[input_key]

  for key, product_name in all_products.items():
    try:
      dist = cosine_distance(product_vec, model.wv[key])
      if dist != 1: all_similarities.append((product_name, dist))
    except:
      pass
  
  return sorted(all_similarities, key=lambda x: x[1], reverse=True)[:topn]


## Create a sequence vector

In [23]:
def seq_avg_vector(products):
    avg_vec = []
    for product in products:
        try:
          avg_vec.append(model.wv[product])
        except:
          pass
        
    return np.mean(avg_vec, axis=0)

## Print functions

In [24]:
def product_recommender(model, key, nb=3):
  print(f'For the product "{all_products[key]}" here is the selected recommendations :')
  recommendations = compute_similarity(model, input_key=key, topn=nb)
  for item in recommendations:
    print(f'\t - {item[0]}')

In [25]:
def sequence_recommender(model, orders_list, nb=3):
  print("Here is the product ordered in this sequence:")
  for item in orders_list:
    print(all_products[item])
  print(f'\nFor this history of products here is the selected recommendations :')
  recommendations = compute_similarity(model, product_vec=seq_avg_vector(orders_list), topn=nb)
  for item in recommendations:
    print(f'\t - {item[0]}')

## Test our model

In [26]:
product_recommender(model,'23256')

For the product "CHILDRENS CUTLERY SPACEBOY " here is the selected recommendations :
	 - CHILDRENS CUTLERY SPACEBOY 
	 - CHILDRENS CUTLERY DOLLY GIRL 
	 - CHILDRENS CUTLERY CIRCUS PARADE


In [39]:
#RECOMMENDATIONS ACCORDING TO THE 5 LAST ORDERS
sequence_recommender(model, valid_seq[243][-5:])

Here is the product ordered in this sequence:
LARGE WHITE HEART OF WICKER
EASTER DECORATION SITTING BUNNY
WHITE HANGING HEART T-LIGHT HOLDER
SET 6 SCHOOL MILK BOTTLES IN CRATE
LARGE WHITE HEART OF WICKER

For this history of products here is the selected recommendations :
	 - LARGE WHITE HEART OF WICKER
	 - SMALL WHITE HEART OF WICKER
	 - HEART OF WICKER LARGE
