<p>Split the data to a train and a test set (take 10% of customers as test) and recommend 5 products to all of them based on their first purchase.

The goal is to build our own Word2Vec model using the vocabulary from column StockCode. We consider different StockCodes which were bought by the </br>
same customer as one "sentence". Example of a sentence: ["85123A","71053","84406B","84029G"].
</p>

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data1 = pd.read_csv('Year 2009-2010-Table 1.csv')
data1.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/09 7:45,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/1/09 7:45,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/1/09 7:45,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/09 7:45,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/09 7:45,1.25,13085.0,United Kingdom


In [4]:
data1.shape

(525461, 8)

In [5]:
data2 = pd.read_csv('Year 2010-2011-Table 1.csv')
data2.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/10 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/10 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/10 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/10 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/10 8:26,3.39,17850.0,United Kingdom


In [6]:
data2.shape

(541910, 8)

In [7]:
df = pd.concat([data1, data2], ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   Invoice      1067371 non-null  object 
 1   StockCode    1067371 non-null  object 
 2   Description  1062989 non-null  object 
 3   Quantity     1067371 non-null  int64  
 4   InvoiceDate  1067371 non-null  object 
 5   Price        1067371 non-null  float64
 6   Customer ID  824364 non-null   float64
 7   Country      1067371 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 65.1+ MB


In [8]:
df.shape

(1067371, 8)

In [9]:
group = df.groupby('Customer ID')['StockCode'].agg(list).reset_index()
group_df = pd.DataFrame(group, columns=['Customer ID', 'StockCode'])
group_df

Unnamed: 0,Customer ID,StockCode
0,12346.0,"[TEST001, TEST001, TEST001, TEST001, TEST002, ..."
1,12347.0,"[22698, 22699, 20985, 22418, 22422, 51014A, 51..."
2,12348.0,"[21213, 84991, 22951, 84992, 21977, 84988, 849..."
3,12349.0,"[22072, 20914, 21231, 21232, 20747, 22554, 225..."
4,12350.0,"[21908, 22412, 79066K, 79191C, 22348, 84086C, ..."
...,...,...
5937,18283.0,"[20971, 84836, 85123A, 22069, 20969, 22274, 21..."
5938,18284.0,"[C2, 22591, 22187, 21807, 21805, 21817, 21819,..."
5939,18285.0,"[21752, 22182, 20802, 21313, 21656, 21666, 221..."
5940,18286.0,"[72801C, 22178, 37503, 21265, 21323, 79323G, 7..."


In [10]:
X = df['Customer ID']
y = df['StockCode']

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [92]:
# Compute the TF-IDF matrix for item descriptions
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(group_df['StockCode'])

# Compute the cosine similarity matrix for items
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [99]:
def recommender(data, customer_id):
    data['StockCode'] = data['StockCode'].astype(str).str.strip().str.lower()

    customer_data = data[data['Customer ID'] == customer_id]
    customer_items = customer_data['StockCode'].explode().unique().tolist()

    stock_to_index = pd.Series(data.index, index=data['StockCode']).drop_duplicates()
    #print(stock_to_index)
    
    sim_scores = []
    #print(customer_items)
    for item in customer_items:
       # print(item)
        if item in stock_to_index:
            #print('Yes')
            idx = stock_to_index[item]
            #print(idx)
            scores = list(enumerate(cosine_sim[idx]))
            sim_scores.extend(scores)
           # print(sim_scores)
    # Remove duplicate items and items the customer has already bought
    sim_scores = list(set(sim_scores))
    sim_scores = [score for score in sim_scores if data.iloc[score[0]]['StockCode'] not in customer_items]

    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the n most similar items
    sim_scores = sim_scores[1:6]

    # Get the item indices
    item_indices = [i[0] for i in sim_scores]

    # Return the top n most similar items

    recommended_items = data.iloc[item_indices][['StockCode']].reset_index(drop=True)
    return recommended_items

In [100]:
recommender(group_df,13085)

Unnamed: 0,StockCode
0,"['22041', '21137', '22168']"
1,"['21137', '22041', '22561', '21137', '22041', ..."
2,"['22109', '22107', '21625', '21622', '21232', ..."
3,"['72741', '72756', '21314', '22109', '21137', ..."
4,"['22468', '21137', '82001s', '22191', '22194',..."


In [37]:
df.columns

Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')