In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from gensim.models import Word2Vec 
import matplotlib.pyplot as plt
%matplotlib inline

import warnings;
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("transaction_data.csv")
df.head() 

Unnamed: 0,UserId,TransactionId,TransactionTime,ItemCode,ItemDescription,NumberOfItemsPurchased,CostPerItem,Country
0,278166,6355745,Sat Feb 02 12:50:00 IST 2019,465549,FAMILY ALBUM WHITE PICTURE FRAME,6,11.73,United Kingdom
1,337701,6283376,Wed Dec 26 09:06:00 IST 2018,482370,LONDON BUS COFFEE MUG,3,3.52,United Kingdom
2,267099,6385599,Fri Feb 15 09:45:00 IST 2019,490728,SET 12 COLOUR PENCILS DOLLY GIRL,72,0.9,France
3,380478,6044973,Fri Jun 22 07:14:00 IST 2018,459186,UNION JACK FLAG LUGGAGE TAG,3,1.73,United Kingdom
4,-1,6143225,Mon Sep 10 11:58:00 IST 2018,1733592,WASHROOM METAL SIGN,3,3.4,United Kingdom


In [3]:
df.shape

(1083818, 8)

In [4]:
df = df[df['NumberOfItemsPurchased'] > 0]

In [5]:
df = df[df['ItemCode'] > 0]

In [6]:
df = df[df['CostPerItem'] > 0]

In [7]:
df = df[df['UserId'] > 0]

In [8]:
df.shape

(792940, 8)

In [39]:
df['ItemCode']= df['ItemCode'].astype(str)

In [9]:
customers = df["UserId"].unique().tolist()
len(customers)

4334

In [10]:
# shuffle UserId's
random.shuffle(customers)

In [11]:
# extract 90% of UserId's
customers_train = [customers[i] for i in range(round(0.9*len(customers)))]

In [12]:
# split data into train and validation set
train_df = df[df['UserId'].isin(customers_train)]
validation_df = df[~df['UserId'].isin(customers_train)]

In [13]:
purchases_train = []

# populate the list with the ItemCodes
for i in customers_train:
    temp = train_df[train_df["UserId"] == i]["ItemDescription"].tolist()
    purchases_train.append(temp)

In [14]:
purchases_val = []

# populate the list with the product codes
for i in validation_df['UserId'].unique():
    temp = validation_df[validation_df["UserId"] == i]["ItemDescription"].tolist()
    purchases_val.append(temp)

In [15]:
# train word2vec model
model = Word2Vec( window = 10, sg = 1, hs = 0,
                 negative = 10, # for negative sampling
                 alpha=0.03, min_alpha=0.0007,
                 seed = 14)
model.build_vocab(purchases_train, progress_per=200)

model.train(purchases_train, total_examples = model.corpus_count, 
            epochs=10, report_delay=1)

(7030095, 7158680)

In [16]:
model.init_sims(replace=True)

In [17]:
print(model)

Word2Vec(vocab=3459, size=100, alpha=0.03)


In [18]:
X = model[model.wv.vocab]

X.shape

(3459, 100)

In [19]:
products = validation_df[["ItemCode", "ItemDescription"]]

# remove duplicates
#products.drop_duplicates(inplace=True, subset='ItemCode', keep="last")


In [20]:
def similar_products(v, n = 2):
    
    # extract most similar products for the input vector
    ms = model.similar_by_vector(v, topn= n+1)[1:]
    
    # extract name and similarity score of the similar products
    new_ms = []
    for ItemCode in ms:
        new_ms.append(ItemCode[0])
        
        
    return new_ms
   

In [23]:
itemdescriptions = train_df["ItemDescription"].unique().tolist()

In [24]:
len(itemdescriptions)

3856

In [25]:
#Trail
similar_products(model['LONDON BUS COFFEE MUG'])

['QUEENS GUARD COFFEE MUG', 'SOLDIERS EGG CUP ']

In [30]:
Item_Bought = input('Item Description')
print('RECOMMENDED PRODUCTS FOR', Item_Bought  , ':', similar_products(Item_Bought))

Item Description LONDON BUS COFFEE MUG


RECOMMENDED PRODUCTS FOR LONDON BUS COFFEE MUG : ['SOLDIERS EGG CUP ', 'PACK OF 12 LONDON TISSUES ']
