In [2]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import gc
import random
from sklearn.model_selection import train_test_split
import turicreate as tc
from tqdm.notebook import tqdm


In [102]:
article_subset_df=pd.read_csv('articles_subset.csv')
customers_df=pd.read_csv('customers_subset.csv')
transaction_subset_df=pd.read_csv('transaction_subset.csv')


Unnamed: 0.1,Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,114,2018-09-20,0091c0e943c92167ea78f6ac180636be4098b06c1a066c...,677341001,0.080492,2
1,115,2018-09-20,00923f88cda50cbea4c5ceb2bd3467c620a9d74cb7da95...,564786020,0.019814,1
2,116,2018-09-20,00923f88cda50cbea4c5ceb2bd3467c620a9d74cb7da95...,655347002,0.015237,1
3,117,2018-09-20,00923f88cda50cbea4c5ceb2bd3467c620a9d74cb7da95...,655347007,0.015237,1
4,135,2018-09-20,00b5b1446be091d4913765cf2cc01edc7e0d3820e1b8a4...,627300001,0.024136,2


In [106]:
def create_predictions_format(data):
    data["article_id"] = data["article_id"]#.apply(lambda x: adjust_id(x))
    all_preds = data.groupby("customer_id")["article_id"].unique().to_dict()
    data["preds"] = data["customer_id"].map(all_preds)
    data["preds"] = data["preds"].apply(lambda x: " ".join([str(y) for y in x]))
    data = data.groupby("customer_id")["preds"].first().reset_index()
    return data
def get_frequent_purchases(transactions, n=50):
    temp = transactions.groupby(["customer_id", "article_id"])["t_dat"].count().reset_index()
    temp.columns = ["customer_id", "article_id", "count"]
    temp2 = transactions.groupby(["customer_id"])["t_dat"].count().reset_index()
    temp2.columns = ["customer_id", "full_count"]
    temp = temp.merge(temp2, on="customer_id", how="left")
    temp["perc"] = (temp["count"] / temp["full_count"])*100
    temp = temp[temp["perc"] >= n].reset_index(drop=True)
    temp = create_predictions_format(temp)
    return temp

In [107]:
transactions=pd.DataFrame()
transactions['customer_id'] = transaction_subset_df['customer_id'].str[-16:]
transactions['t_dat'] = pd.to_datetime(transaction_subset_df['t_dat'])
transactions = transaction_subset_df[['t_dat','customer_id','article_id']]

In [108]:
TOP_CUSTOMERS = 300000
TOP_N = 200

In [109]:
most_frequent_articles = transactions["article_id"].value_counts().reset_index()
most_frequent_articles.columns = ["article_id", "count"]
print("Unique IDs from subset Transactions:", len(most_frequent_articles))
print("Selected Unique IDs:", TOP_N)
most_frequent_articles = np.asarray(most_frequent_articles.head(TOP_N)["article_id"])
transactions = transactions[transactions["article_id"].isin(most_frequent_articles)].reset_index(drop=True)

Unique IDs from subset Transactions: 86307
Selected Unique IDs: 200


In [110]:
customers_top_trans = list(transactions["customer_id"].value_counts().reset_index().head(TOP_CUSTOMERS)["index"].unique())
transactions = transactions[transactions["customer_id"].isin(customers_top_trans)].reset_index(drop=True)
print("Unique users to recommend:", transactions["customer_id"].nunique())

Unique users to recommend: 63170


In [111]:
temp = get_frequent_purchases(transactions, n=50)
temp.head(12)

Unnamed: 0,customer_id,preds
0,0001076e215991bad544dd3e7312f78d9f576a1cc3ddc4...,715624001
1,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,739144004
2,000226b9ea81019249060b376b516f821a80e9b24f89a7...,590928001 759871002
3,00022754ec18c5e53757eea8b281632a5c4a499368ecc5...,573716012 720125001
4,0005c68366e7955683b7cdc75535f400d76c2e713b6277...,741356002
5,0006ca2c5ecdbdcc1de5e72adffc6c33448fc2aaa62afe...,448509001 572998001
6,0006d8845db202dc43bd6ed3d74702df026b13fc1517c9...,611415001 611415005
7,000c5c714aefd0d5ed1205e2781070167826ffc117ab9e...,399256005 579541001
8,000c6acfc0457b99ae3150e8e57beffb4c8d7c5fc31e40...,579541001 803757001
9,000ce5da167c6c8dfaea6dfc4b59a5ea3217630ec36cfc...,618800001 741356002


In [112]:
del most_frequent_articles, customers_top_trans

In [113]:
gc.collect()

146

In [114]:
train = transactions.groupby(["customer_id","article_id"])["t_dat"].count().reset_index()
train.columns = ["customer_id","article_id", "purchase_count"]
train.head()

Unnamed: 0,customer_id,article_id,purchase_count
0,0001076e215991bad544dd3e7312f78d9f576a1cc3ddc4...,715624001,1
1,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,739144004,1
2,000226b9ea81019249060b376b516f821a80e9b24f89a7...,590928001,1
3,000226b9ea81019249060b376b516f821a80e9b24f89a7...,759871002,1
4,00022754ec18c5e53757eea8b281632a5c4a499368ecc5...,573716012,1


In [115]:
dummy_train = train.copy()
dummy_train['purchase_dummy'] = 1
dummy_train.head()

Unnamed: 0,customer_id,article_id,purchase_count,purchase_dummy
0,0001076e215991bad544dd3e7312f78d9f576a1cc3ddc4...,715624001,1,1
1,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,739144004,1,1
2,000226b9ea81019249060b376b516f821a80e9b24f89a7...,590928001,1,1
3,000226b9ea81019249060b376b516f821a80e9b24f89a7...,759871002,1,1
4,00022754ec18c5e53757eea8b281632a5c4a499368ecc5...,573716012,1,1


In [116]:
def normalize_data(data):
    df_matrix = pd.DataFrame(pd.pivot(data, columns="article_id",index="customer_id", values="purchase_count"))
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    final = pd.melt(d, id_vars=['customer_id'], value_name='scaled_purchase_freq').dropna()
    final.columns = ["customer_id", "article_id", "scaled_purchase_freq"]
    final = final.reset_index(drop=True)
    return final

In [117]:
norm_train = normalize_data(data=train)
norm_train.head()

Unnamed: 0,customer_id,article_id,scaled_purchase_freq
0,00a758653e07ebcdfcb1ae052e5d0ae6a5f7cf40434154...,108775015,0.0
1,00ada42f83a04a8c02035021f31c83989c2d1b65286298...,108775015,0.333333
2,00c3c6a5e9ed6dc59a7e4b61b0d506ae2cdabceea54d39...,108775015,0.0
3,01029cacbcf5a8e6de8d9f523eabd26d0182b92470f911...,108775015,0.0
4,01be7a105a8038955fd7cec2bf80333c32ef031cfa3868...,108775015,0.0


In [118]:
def split_data(data):
    train, test = train_test_split(data, test_size=0.3)
    train_data =train
    test_data = test
    return train_data, test_data

In [119]:
train_data, test_data = split_data(train)
train_data_dummy, test_data_dummy = split_data(dummy_train)

In [120]:
train_data_norm, test_data_norm = split_data(norm_train)

In [121]:
user_id = 'customer_id'
item_id = 'article_id'
users_to_recommend = list(train["customer_id"].unique())

In [122]:
train_data.article_id=train_data.article_id.astype(str)
train.article_id=train.article_id.astype(str)
dummy_train.article_id=dummy_train.article_id.astype(str)

In [123]:
train_data.head()

Unnamed: 0,customer_id,article_id,purchase_count
35079,3258fbad7a66115c3f0bab2019b56a3da2ff52e7a42c95...,554450027,1
15515,164b123cdadd26ba7da4612338ea7d47c92a6a996b5a28...,749699002,1
119776,af1bcf766b6d2e789afe16d855ea828d99ce39c387b6df...,827968001,1
64453,5daac491927b06f166a3dd99b621f2bb95d571e868ee89...,778064003,1
152625,df42dad681c22d1826ab68c6f301e9e4cedbbb4c42f727...,448509014,1


In [124]:
train_data_sort = train_data.sort_values(['purchase_count', 'article_id'], ascending = [0,1]) 
train_data_sort['rank'] = train_data_sort['purchase_count'].rank(ascending=0, method='first') 
popularity_recommendations = train_data_sort.head(12) 
popularity_recommendations 

Unnamed: 0,customer_id,article_id,purchase_count,rank
124928,b63c5a189689b32199e86cc4f56f9605fac682c309105d...,156231001,16,1.0
100414,92b38def6994e83c8f27544a355bb063168a0d6dc53507...,399223001,16,2.0
13748,139e15adb3037711686c19315086da44a9311b4538aa02...,228257001,13,3.0
120185,af98ad7845efdfc5dd9c2af6c0f2e1cc6d982ef8b14d69...,156231001,12,4.0
38632,37ccc4a1cb17734a6d427baed6063d636250ca665cd0f8...,228257001,11,5.0
92662,873fae8e3de7f0346508563e49a5381559cdf11569dac5...,706016001,11,6.0
164300,f1084d39bf4523973f99bcae64c413e7b12eec8005c464...,111593001,10,7.0
154569,e27f5f198445af34c4411f7451479b1f6b90c17edda6d9...,228257001,10,8.0
143211,d190fc2dc41e27e5f79f8b1f58bfcd7a13ab22857f39ca...,706016001,10,9.0
86614,7e515c061d0a3cea87ef6fcb7ad30193b365950f8b9176...,156231001,9,10.0


In [125]:
def recommend(user_id):     
    user_recommendations = popularity_recommendations 
    user_recommendations['userId'] = user_id 
    cols = user_recommendations.columns.tolist() 
    cols = cols[-1:] + cols[:-1] 
    user_recommendations = user_recommendations[cols] 
    return user_recommendations 

In [126]:
random_users = [1,11,111]
for i in random_users:
    print("Recommendations for userId:",(i)," \n ")
    print(recommend(i))    
    print("\n") 

Recommendations for userId: 1  
 
        userId                                        customer_id article_id  \
124928       1  b63c5a189689b32199e86cc4f56f9605fac682c309105d...  156231001   
100414       1  92b38def6994e83c8f27544a355bb063168a0d6dc53507...  399223001   
13748        1  139e15adb3037711686c19315086da44a9311b4538aa02...  228257001   
120185       1  af98ad7845efdfc5dd9c2af6c0f2e1cc6d982ef8b14d69...  156231001   
38632        1  37ccc4a1cb17734a6d427baed6063d636250ca665cd0f8...  228257001   
92662        1  873fae8e3de7f0346508563e49a5381559cdf11569dac5...  706016001   
164300       1  f1084d39bf4523973f99bcae64c413e7b12eec8005c464...  111593001   
154569       1  e27f5f198445af34c4411f7451479b1f6b90c17edda6d9...  228257001   
143211       1  d190fc2dc41e27e5f79f8b1f58bfcd7a13ab22857f39ca...  706016001   
86614        1  7e515c061d0a3cea87ef6fcb7ad30193b365950f8b9176...  156231001   
56037        1  517038efba162fb653467f9f6d0c777079a61380010b27...  156231001   
68253 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userId'] = user_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userId'] = user_id
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_recommendations['userId'] = user_id


In [127]:
user_id = 'customer_id'
item_id = 'article_id'
users_to_recommend = list(train["customer_id"].unique())

In [128]:
def train_model(train_data, name, user_id, item_id, target, users_to_recommend):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                 user_id=user_id, 
                                                 item_id=item_id, 
                                                 target=target, verbose=False)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                      user_id=user_id, 
                                                      item_id=item_id, 
                                                      target=target,
                                                      similarity_type='cosine', verbose=False)
    elif name == 'pearson':
            model = tc.item_similarity_recommender.create(train_data, 
                                                          user_id=user_id, 
                                                          item_id=item_id, 
                                                          target=target, 
                                                          similarity_type='pearson', verbose=False)
    recom = model.recommend(users=users_to_recommend, k=12, verbose=False)
    return model, recom

In [129]:

name = 'popularity'
target = 'purchase_count'
popularity_count, _ = train_model(tc.SFrame(train_data), name, user_id, item_id, target, users_to_recommend)

In [130]:
name = 'popularity'
target = 'purchase_dummy'
popularity_dummy, _ = train_model(tc.SFrame(train_data_dummy), name, user_id, item_id, target, users_to_recommend)

In [131]:
name = 'popularity'
target = 'scaled_purchase_freq'
popularity_normalize, _ = train_model(tc.SFrame(normalize_data(train_data)), name, user_id, item_id, target, users_to_recommend)

In [132]:
name = 'cosine'
target = 'purchase_count'
cosine_count, _ = train_model(tc.SFrame(train_data), name, user_id, item_id, target, users_to_recommend)

In [133]:
name = 'cosine'
target = 'purchase_dummy'
cosine_dummy, _ = train_model(tc.SFrame(train_data_dummy), name, user_id, item_id, target, users_to_recommend)

In [134]:
name = 'cosine'
target = 'scaled_purchase_freq'
cosine_normalize, _ = train_model(tc.SFrame(normalize_data(train_data)), name, user_id, item_id, target, users_to_recommend)

In [135]:
name = 'pearson'
target = 'purchase_count'

pearson_count, _ = train_model(tc.SFrame(train_data), name, user_id, item_id, target, users_to_recommend)

In [136]:
name = 'pearson'
target = 'purchase_dummy'

pearson_dummy, _ = train_model(tc.SFrame(train_data_dummy), name, user_id, item_id, target, users_to_recommend)

In [137]:
name = 'pearson'
target = 'scaled_purchase_freq'
pearson_normalize, _ = train_model(tc.SFrame(normalize_data(train_data)), name, user_id, item_id, target, users_to_recommend)

In [138]:
model_counts = [popularity_count, cosine_count, pearson_count]
models_dummy = [popularity_dummy, cosine_dummy, pearson_dummy]
models_normalize= [popularity_normalize, cosine_normalize, pearson_normalize]
names_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts',
                'Pearson Similarity on Purchase Counts']
names_dummy = ['Popularity Model on Purchase Dummy', 'Cosine Similarity on Purchase Dummy',
               'Pearson Similarity on Purchase Dummy']
names_normalize= ['Popularity Model on Normalized Data', 'Cosine Similarity on Normalized Data',
               'Pearson Similarity on Normalized Data']

In [139]:
# # Evaluate the models
eval_counts = tc.recommender.util.compare_models(tc.SFrame(test_data), model_counts, model_names=names_counts, verbose=False)
eval_dummy = tc.recommender.util.compare_models(tc.SFrame(test_data_dummy), models_dummy, model_names=names_dummy, verbose=False)
eval_normalize=tc.recommender.util.compare_models(tc.SFrame(test_data_norm), models_normalize, model_names=names_dummy, verbose=False)

In [140]:
#eval_counts

In [141]:
#eval_counts

f = open("/content/eval_counts.txt", "a")
print(eval_counts, file=f)
f.close()
f = open("/content/eval_dummy.txt", "a")
print(eval_dummy, file=f)
f.close()
f = open("/content/eval_normalize.txt", "a")
print(eval_dummy, file=f)
f.close()

In [142]:
METHOD = "cosine"
TARGET = "purchase_count"

In [143]:
final_model = tc.item_similarity_recommender.create(tc.SFrame(train_data), 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=TARGET,
                                                    similarity_type=METHOD,
                                                    verbose=False)

recom = final_model.recommend(users=users_to_recommend, k=12, verbose=False).to_dataframe()


In [169]:
recom.customer_id[0]

'0001076e215991bad544dd3e7312f78d9f576a1cc3ddc407b2bc9024794ead5f'

In [269]:
eval_cust_id=str(recom.customer_id[12])

In [270]:
eval_cust_id

'0001f8cef6b9702d54abf66fd89eb21014bf98567065a9b5e42f37bc99528cf5'

In [271]:
eval_art_id=transaction_subset_df[transaction_subset_df.customer_id==eval_cust_id].article_id.value_counts().head(8)

In [272]:
eval_art_id

666006006    3
502224001    1
706268020    1
817472002    1
806529003    1
808841001    1
836130002    1
855834001    1
Name: article_id, dtype: int64

In [273]:
cust_unique=list(customers_df.customer_id.unique())

In [203]:
recom.to_csv('recommendations.csv')

In [274]:
recom['article_id'].head(12)

0     715624010
1     547780001
2     733749001
3     448509014
4     759871002
5     554450004
6     720125001
7     706016015
8     711053003
9     685814003
10    772902001
11    372860001
Name: article_id, dtype: object

In [275]:
art_id_ls=list(eval_art_id.index)
pred_art_ids=list(recom['article_id'].head(12))

In [276]:
art_id_ls

[666006006,
 502224001,
 706268020,
 817472002,
 806529003,
 808841001,
 836130002,
 855834001]

In [277]:
len(art_id_ls)

8

In [278]:
article_subset_df=article_subset_df.reset_index(drop=True)

In [279]:
article_subset_df.shape

(86307, 26)

In [280]:
type(article_subset_df.article_id[0])

numpy.int64

In [302]:
def mapk(eval_ids,ref_ids):
  mapk_ls=[]
  #print(type(eval_ids),type(ref_ids))
  for eval_id in eval_ids:
    eval_id=int(eval_id)
    mapk=0
    for ref_id in ref_ids:
      cnt=0
      #print(article_subset_df.product_type_name[article_subset_df.article_id==eval_id],article_subset_df.product_type_name[article_subset_df.article_id==ref_id])
        
      if(article_subset_df.product_type_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0] == article_subset_df.product_type_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        #print(article_subset_df.product_type_name[article_subset_df.article_id==eval_id],article_subset_df.product_type_name[article_subset_df.article_id==ref_id])
        cnt=cnt+1
        mapk=1*1/cnt+mapk
      if(article_subset_df.garment_group_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0] == article_subset_df.garment_group_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.garment_group_name[article_subset_df.article_id==eval_id],article_subset_df.garment_group_name[article_subset_df.article_id==ref_id])
        mapk=1*1/cnt+mapk
      if(article_subset_df.index_group_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0] ==  article_subset_df.index_group_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.index_group_name[article_subset_df.article_id==eval_id],article_subset_df.index_group_name[article_subset_df.article_id==ref_id])
        
        mapk=1*1/cnt+mapk
      if(article_subset_df.department_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0] == article_subset_df.department_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.department_name[article_subset_df.article_id==eval_id],article_subset_df.department_name[article_subset_df.article_id==ref_id])
        
        mapk=1*1/cnt+mapk
      if(article_subset_df.colour_group_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0] == article_subset_df.colour_group_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.colour_group_name[article_subset_df.article_id==eval_id],article_subset_df.colour_group_name[article_subset_df.article_id==ref_id])
        
        mapk=1*1/cnt+mapk
      if(article_subset_df.graphical_appearance_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0]== article_subset_df.graphical_appearance_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.graphical_appearance_name[article_subset_df.article_id==eval_id],article_subset_df.graphical_appearance_name[article_subset_df.article_id==ref_id])
        
        mapk=1*1/cnt+mapk
      if(article_subset_df.product_group_name[article_subset_df.article_id==eval_id].reset_index(drop=True).iloc[0]== article_subset_df.product_group_name[article_subset_df.article_id==ref_id].reset_index(drop=True).iloc[0]):
        cnt=cnt+1
        #print(article_subset_df.product_group_name[article_subset_df.article_id==eval_id],article_subset_df.product_group_name[article_subset_df.article_id==ref_id])
        
        mapk=1*1/cnt+mapk
    mapk_ls.append(mapk/12)
    #break
  return mapk_ls


In [303]:
#article_subset_df.columns

In [304]:
article_subset_df.section_name[article_subset_df.article_id=='708755001']

Series([], Name: section_name, dtype: object)

In [305]:
np.round(np.mean(mapk(pred_art_ids,art_id_ls)),3)
# Recommendations score of MAP@12 vary between 72 to 93, on an averafe it is on the 0.895 mark.

0.722

In [215]:
type(pred_art_ids[0]),type(art_id_ls[0])

(str, int)

In [306]:
pred_art_ids

['715624010',
 '547780001',
 '733749001',
 '448509014',
 '759871002',
 '554450004',
 '720125001',
 '706016015',
 '711053003',
 '685814003',
 '772902001',
 '372860001']

In [307]:
art_id_ls

[666006006,
 502224001,
 706268020,
 817472002,
 806529003,
 808841001,
 836130002,
 855834001]

In [308]:
#print(article_subset_df.product_type_name[article_subset_df.article_id==708755001],article_subset_df.product_type_name[article_subset_df.article_id==706016001])
      

In [309]:
# article_subset_df.product_type_name[article_subset_df.article_id==708755001]

In [310]:
# print(article_subset_df.product_type_name[article_subset_df.article_id==708755001].reset_index(drop=True).iloc[0])
# print(article_subset_df.product_type_name[article_subset_df.article_id==706016001].reset_index(drop=True).iloc[0])

In [311]:
#val1=article_subset_df.product_type_name[article_subset_df.article_id==708755001]

In [312]:
#val1.iloc[0]