<a href="https://colab.research.google.com/github/hin1799/FashionRS/blob/main/H%26M-Experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# H&M Dataset - Recommendation Systems

**Hinal Desai (202211035)**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
articles_dir = '/content/drive/MyDrive/H&M dataset/articles.csv'
customers_dir = '/content/drive/MyDrive/H&M dataset/customers.csv'
transactions_dir = '/content/drive/MyDrive/H&M dataset/transactions_train.csv'
images_dir = '/content/drive/MyDrive/H&M dataset/images'

#Reading the csv files

In [None]:
articles_df = pd.read_csv(articles_dir)

In [None]:
customers_df = pd.read_csv(customers_dir)

In [None]:
transactions_df = pd.read_csv(transactions_dir)

In [None]:
articles_df.shape

(105542, 25)

In [None]:
customers_df.shape

(1371980, 7)

In [None]:
transactions_df.shape

(31788324, 5)

# Sampling the Data

Due to limited resource availability, using a sample of the data to avoid RAM crashes.

In [None]:
#Setting the seed
random_seed = 42
np.random.seed(random_seed)

In [None]:
#10% of customer data
customers_sample = customers_df.sample(n=137198)

In [None]:
customers_sample

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
1134822,d3b658f59ad9bbf6249a7bf0db722f2f43cc47803d0329...,,,ACTIVE,NONE,72.0,da477e6d8c015e8d89487ae038b1be611c052834008252...
612735,72529945a566eb7445e51470ef00c5992976ddaf7d9f82...,,,ACTIVE,NONE,41.0,c9d1dd747df26b6fd8a8d682f1f0ed3db12bfc7041e49a...
1124456,d1c91a9050fcb6da04f0643c6b6d38b002b6ff12d66e80...,,,ACTIVE,NONE,22.0,760fd89d83df7842c04f3a6d68265270bc2333c43cabf7...
662579,7bad01c1ad7a9bccc9270a29bf24279b4e19b41afc1b4a...,1.0,,ACTIVE,Regularly,24.0,032fdbc74a7144e1bcbc2addc9d5c8a0d8f39223754aac...
404009,4b889f848f59f9bf787b16626f96d0390c3f9ab3ed3edc...,,,ACTIVE,NONE,56.0,a0f3f7d2ae3b89be98d44524b3a6743c1916986e507ef9...
...,...,...,...,...,...,...,...
1267446,ec87ffad52857ca9b93f0c7358bf25fffb4bd61ba08881...,1.0,1.0,ACTIVE,Regularly,25.0,ee06243e58a71529891889203876c19a9c2e5b037d2445...
28249,054cb9b7f48001938d451af0a63cbbc5fa1345310c75b7...,,,ACTIVE,NONE,18.0,514f6322722a07429ba69054201b243f6e82743705db65...
1214264,e292f906885a35ac1272685a650a6beb46353a91e49fe1...,,,ACTIVE,,72.0,2701daeebb6f0f0583c065e12819b5d7586de8181001bd...
477017,5917e32cb0b7fb344d469bd41e1a42c02c760de1578723...,,,ACTIVE,NONE,50.0,1c79c3a5df726e16e665ffc6720863984c260771e15cd0...


Due to resource limitations, we will consider only last 6 weeks of data

In [None]:
transactions_df["timestamp"] = pd.to_datetime(transactions_df["t_dat"])
transactions_df["week_number"] = transactions_df["timestamp"].dt.isocalendar().week
grouped_df = transactions_df.groupby(['customer_id', 'article_id', pd.Grouper(key='week_number')]).size().reset_index(name='purchase_count')
data = grouped_df[grouped_df['week_number']>=47] #last 6 week data

In [None]:
data

Unnamed: 0,customer_id,article_id,week_number,purchase_count
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,176209023,52,1
4,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,625548001,52,1
5,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,627759010,52,1
7,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,694736004,48,1
10,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,785186005,47,1
...,...,...,...,...
28251920,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,638554001,50,1
28251921,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,648173003,49,1
28251927,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,690478001,49,1
28251928,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,701083001,49,1


From the customer's sample, only consider the customers who made a purchase in the last 6 weeks

In [None]:
customer_id_in_data = data['customer_id'].unique()
customer_sample_filtered = customers_sample[customers_sample['customer_id'].isin(customer_id_in_data)]

customers_sample.shape[0], customer_sample_filtered.shape[0]

(137198, 54476)

Only considering the articles that were purchased in the last 6 weeks.

In [None]:
article_id_in_data = data['article_id'].unique()
articles_sample_filtered = articles_df[articles_df['article_id'].isin(article_id_in_data)]

articles_df.shape[0], articles_sample_filtered.shape[0]

(105542, 58981)

In [None]:
#Renaming

customers = customer_sample_filtered
articles = articles_sample_filtered

Creating a merged dataframe with the last 6 weeks of transaction data and the customer and article metadata.

In [None]:
merged_df = pd.merge(data, customers, on='customer_id')
merged_df = pd.merge(merged_df, articles, on='article_id')

In [None]:
#item-feature matrix
df = merged_df[['customer_id', 'article_id', 'week_number', 'purchase_count', 'prod_name', 'product_type_name', 'product_group_name',
                'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name',
                'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name']]

# Train-Val-Test Split

In [None]:
#train data - week 47 to 50
train = df[(df['week_number']>=47) & (df['week_number']<=50)]

#val data - week 51
val = df[df['week_number']==51]

#test data - week 52
test = df[df['week_number']==52]

# Content Based Using Metadata

In [None]:
features = ['product_group_name', 
       'graphical_appearance_name', 'colour_group_name',
       'perceived_colour_value_name',
       'perceived_colour_master_name',
       'department_name', 'index_name',
       'index_group_name', 'section_name',
       'garment_group_name']

In [None]:
#Building feature space

df_temp = train[['customer_id', 'article_id'] + features]
df_temp = pd.get_dummies(df_temp, columns=features)

In [None]:
df_temp

Unnamed: 0,customer_id,article_id,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Furniture,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Garment and Shoe care,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
0,0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3...,573085033,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,02342ca1b89782fce40cdda06d56e17e67d26c6ab31cf3...,573085033,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,02e77834164257b0d8fe9b6312fdb31e13fbbbe1ff2ef3...,573085033,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0541671f9306ec08db46f9dd900d5aab6892ce1e2f7a34...,573085033,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
6,09ebde71d578e0ea0d12319bce0434dbcb0c9ef84c9be4...,573085033,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304610,ffed71450acc902268fc4b99dfdc28f1a22583f573863b...,644763002,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
304611,ffed71450acc902268fc4b99dfdc28f1a22583f573863b...,670645001,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
304612,ffee31d0bdc92684ba203acae0608d253f869b8ee3a086...,623277004,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
304613,ffee31d0bdc92684ba203acae0608d253f869b8ee3a086...,626887003,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
min_items = 2
cust_grouped = df_temp.groupby('customer_id')

l=[]
customer_ids = []
article_ids = []
for key in cust_grouped.groups.keys():
  temp = cust_grouped.get_group(key)
  if temp.article_id.nunique() >= min_items:
    l.append(temp.drop('article_id', axis=1).sum(numeric_only=True).values)
    customer_ids.append(key)
    article_ids.extend(temp.article_id.values.tolist())

**User Features**

In [None]:
user_feature_df = pd.DataFrame(l, columns=df_temp.columns[2:])
normalized_user_feature_df = user_feature_df.div(user_feature_df.sum(axis=1), axis=0)
normalized_user_feature_df.insert(0, 'customer_id', customer_ids)
normalized_user_feature = normalized_user_feature_df.set_index("customer_id")
normalized_user_feature

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Furniture,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Garment and Shoe care,product_group_name_Items,product_group_name_Nightwear,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3fa01ce4272a6326730,0.000000,0.0,0.0,0.0,0.000000,0.100000,0.000000,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.100000,0.000,0.000000,0.00,0.0
0000ffa57a8098a6cd99c0f0782b3e674f47d68db69dc173c7e67e0f68170c4c,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.066667,0.0,0.0,0.0,...,0.0,0.0000,0.033333,0.000000,0.000000,0.000000,0.000,0.000000,0.00,0.0
000538c14c490fee547e2703439f70bdb8748e2a62f875d63b7320c17ae20fc2,0.000000,0.0,0.0,0.0,0.031250,0.012500,0.031250,0.0,0.0,0.0,...,0.0,0.0125,0.000000,0.000000,0.025000,0.000000,0.000,0.000000,0.00,0.0
000a9291e5aa007a10faf2177671a8501dade80e6679218d65f53c1497b17f87,0.020000,0.0,0.0,0.0,0.040000,0.030000,0.000000,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.020000,0.000,0.000000,0.01,0.0
000b2018cebd824b6abdcbe9a9178067124ea4516a2d8cb3b222643397a54b0a,0.000000,0.0,0.0,0.0,0.000000,0.100000,0.000000,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.100000,0.000,0.000000,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
fffac3618e0084b4f22787586be7701efa2bb564c308613f67a8fdbe1dfc38d9,0.000000,0.0,0.0,0.0,0.000000,0.016667,0.050000,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.016667,0.000,0.033333,0.00,0.0
fffae8eb3a282d8c43c77dd2ca0621703b71e90904dfde2189bdd644f59071dd,0.008333,0.0,0.0,0.0,0.000000,0.050000,0.033333,0.0,0.0,0.0,...,0.0,0.0000,0.016667,0.000000,0.000000,0.016667,0.025,0.000000,0.00,0.0
fffb6dda3fec0b5c58aae22a19c039e0087da6625b54aa04681ec5e188d6c364,0.016667,0.0,0.0,0.0,0.016667,0.016667,0.016667,0.0,0.0,0.0,...,0.0,0.0000,0.016667,0.000000,0.000000,0.033333,0.000,0.016667,0.00,0.0
fffebe4b1d5b6c0605668265c22330f468a7257c9e5081c0ce2b393104e11163,0.000000,0.0,0.0,0.0,0.100000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.0000,0.000000,0.000000,0.000000,0.000000,0.000,0.000000,0.00,0.0


**Item Features**

In [None]:
item_feature_df = df_temp.drop_duplicates(subset='article_id')
item_feature_df = item_feature_df[item_feature_df.article_id.isin(article_ids)].drop('customer_id', axis=1)
item_feature = item_feature_df.set_index('article_id')
item_feature

Unnamed: 0_level_0,product_group_name_Accessories,product_group_name_Bags,product_group_name_Cosmetic,product_group_name_Furniture,product_group_name_Garment Full body,product_group_name_Garment Lower body,product_group_name_Garment Upper body,product_group_name_Garment and Shoe care,product_group_name_Items,product_group_name_Nightwear,...,garment_group_name_Shorts,garment_group_name_Skirts,garment_group_name_Socks and Tights,garment_group_name_Special Offers,garment_group_name_Swimwear,garment_group_name_Trousers,garment_group_name_Trousers Denim,"garment_group_name_Under-, Nightwear",garment_group_name_Unknown,garment_group_name_Woven/Jersey/Knitted mix Baby
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
573085033,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
573085038,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
215589001,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
555353017,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
817574001,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644763002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
670645001,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
623277004,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
626887003,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Without PCA

### Training

In [None]:
#choosing less number of samples, due to RAM crash
user_feature_sample = normalized_user_feature.head(10000)
item_feature_sample = item_feature.head(10000)

In [None]:
score = user_feature_sample.dot(item_feature_sample.T)

In [None]:
score

article_id,573085033,573085038,215589001,555353017,817574001,561277001,699493001,244267032,356289021,419634001,...,838178003,799382001,568861008,671599001,843701001,535388001,535388008,541313006,743530004,743530022
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000ae1bbb25e04bdc7e35f718e852adfb3fbb72ef38b3fa01ce4272a6326730,1.000000,1.000000,0.100000,0.300000,0.300000,0.100000,0.300000,0.300000,0.100000,0.000000,...,0.200000,0.200000,0.300000,0.100000,0.400000,0.000000,0.000000,0.300000,0.100000,0.200000
0000ffa57a8098a6cd99c0f0782b3e674f47d68db69dc173c7e67e0f68170c4c,0.233333,0.233333,0.400000,0.600000,0.600000,0.400000,0.300000,0.333333,0.133333,0.133333,...,0.166667,0.433333,0.300000,0.133333,0.300000,0.166667,0.100000,0.000000,0.133333,0.033333
000538c14c490fee547e2703439f70bdb8748e2a62f875d63b7320c17ae20fc2,0.081250,0.081250,0.350000,0.112500,0.118750,0.350000,0.375000,0.318750,0.493750,0.412500,...,0.050000,0.387500,0.118750,0.206250,0.125000,0.362500,0.262500,0.143750,0.312500,0.100000
000a9291e5aa007a10faf2177671a8501dade80e6679218d65f53c1497b17f87,0.230000,0.230000,0.250000,0.300000,0.230000,0.250000,0.340000,0.240000,0.250000,0.270000,...,0.180000,0.400000,0.180000,0.100000,0.170000,0.170000,0.080000,0.080000,0.210000,0.070000
000b2018cebd824b6abdcbe9a9178067124ea4516a2d8cb3b222643397a54b0a,0.200000,0.200000,0.300000,0.000000,0.000000,0.300000,0.300000,0.200000,0.700000,0.500000,...,0.000000,0.300000,0.050000,0.200000,0.000000,0.400000,0.400000,0.300000,0.400000,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4f0b3fad2fee9c7099ccacf1a40f206650fc5ba98677dafef8dc6e6f23c1cc16,0.060000,0.060000,0.240000,0.080000,0.080000,0.240000,0.240000,0.240000,0.460000,0.420000,...,0.040000,0.280000,0.040000,0.200000,0.040000,0.380000,0.260000,0.200000,0.300000,0.180000
4f0dae67c5b797a435c86498f0ec7f403adf6c34bebea15bebef974f9ba50eb8,0.109091,0.109091,0.190909,0.136364,0.127273,0.190909,0.236364,0.263636,0.309091,0.327273,...,0.090909,0.218182,0.163636,0.218182,0.181818,0.290909,0.236364,0.163636,0.172727,0.072727
4f0e8dfcfbb39d401be254fe28eab6bde093fbd1f7bfced177d4c664648c3fab,0.262500,0.262500,0.200000,0.212500,0.200000,0.200000,0.325000,0.262500,0.300000,0.212500,...,0.087500,0.312500,0.162500,0.125000,0.200000,0.175000,0.150000,0.150000,0.212500,0.125000
4f16df5d1cda6d9364f66fe2b0eb4fd96b44aacef26bbfd93cdf548e3b23cf0c,0.300000,0.300000,0.257143,0.328571,0.328571,0.257143,0.342857,0.371429,0.171429,0.185714,...,0.214286,0.342857,0.557143,0.171429,0.385714,0.185714,0.157143,0.014286,0.171429,0.071429


### Evaluation

In [None]:
test = test[['customer_id', 'article_id', 'purchase_count']]

#create a pivot table for test
pivot_test = train.head(20000).pivot_table(index='customer_id', columns='article_id', values='purchase_count', fill_value=0)

In [None]:
#finding the common user ids

pivot_user_ids = set(pivot_test.index)
score_user_ids = set(score.index)
common_user_id = pivot_user_ids.intersection(score_user_ids)

pivot_item_ids = set(pivot_test.columns)
score_item_ids = set(score.columns)
common_item_id = pivot_item_ids.intersection(score_item_ids)

In [None]:
#Keep only common values

pivot = pivot_test.loc[common_user_id, common_item_id]
scores = score.loc[common_user_id, common_item_id]

In [None]:
relevance_mat = pivot > 0

**Mean Average Precision**

In [None]:
from sklearn.metrics import average_precision_score
map_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values

  top_indices = pred.argsort()[::-1][:5] #k=5 -> MAP@5
    
  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  ap = average_precision_score(relevance_top, pred_top)

  map_values.append(ap)

map_mean = np.mean(map_values)

In [None]:
print("MAP without PCA: (metadata only)" , map_mean)

MAP without PCA: (metadata only) 0.17958023069061177


**NDCG**

In [None]:
from sklearn.metrics import ndcg_score

relevance_mat = pivot > 0 
ndgc_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values
  
  top_indices = pred.argsort()[::-1][:5]
    
  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  ndgc = ndcg_score([relevance_top], [pred_top], k=len(relevance_top))

  ndgc_values.append(ndgc)

ndcg = np.mean(ndgc_values)
print("NDCG without PCA: (metadata only)", ndcg)

NDCG without PCA: (metadata only) 0.28261514085339795


In [None]:
from sklearn.metrics import recall_score

mar_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values

  top_indices = pred.argsort()[::-1][:5]
    
  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  pred_binary = (pred_top >= 0.5).astype(int)

  rec = recall_score(relevance_top, pred_binary, average='macro')

  mar_values.append(rec)

mar_mean = np.mean(mar_values)

In [None]:
print("MAR without PCA: (metadata only)", mar_mean)

MAR without PCA: (metadata only) 0.47725361366622865


## With PCA

### Training

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=100)
pca.fit(normalized_user_feature) #using entire user feature matrix instead of sampling
pca.explained_variance_ratio_.sum()

0.9556254406389474

In [None]:
user_features_pca = pd.DataFrame(pca.transform(normalized_user_feature), columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(normalized_user_feature.index)

In [None]:
item_features_pca = pd.DataFrame(pca.transform(item_feature), columns=['component_{}'.format(i) for i in range(1, 101)]).set_index(item_feature.index)

In [None]:
user_sample = user_features_pca.head(10000)
item_sample = item_features_pca.head(10000)

In [None]:
scores_pca = user_sample.dot(item_sample.T)

### Evaluation

In [None]:
pivot_user_ids = set(pivot_test.index)
score_user_ids = set(scores_pca.index)
common_user_id = pivot_user_ids.intersection(score_user_ids)

pivot_item_ids = set(pivot_test.columns)
score_item_ids = set(scores_pca.columns)
common_item_id = pivot_item_ids.intersection(score_item_ids)

In [None]:
#Keep only values common
pivot = pivot_test.loc[common_user_id, common_item_id]
scores = scores_pca.loc[common_user_id, common_item_id]

In [None]:
relevance_mat = pivot > 0

In [None]:
from sklearn.metrics import average_precision_score
map_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values

  top_indices = pred.argsort()[::-1][:5] #top5

  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  ap = average_precision_score(relevance_top, pred_top)

  map_values.append(ap)

map_mean_pca = np.mean(map_values)

In [None]:
print("MAP with PCA: (metadata only)",map_mean_pca)

MAP with PCA: (metadata only) 0.2472094466345452


In [None]:
relevance_mat = pivot > 0 
ndgc_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values

  top_indices = pred.argsort()[::-1][:5] #top5
    
  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  ndgc = ndcg_score([relevance_top], [pred_top], k=len(relevance_top))

  ndgc_values.append(ndgc)

ndcg = np.mean(ndgc_values)
print("NDCG with PCA (metadata only)", ndcg)

NDCG with PCA (metadata only) 0.3260303924912133


In [None]:
from sklearn.metrics import recall_score

mar_values = []

for c_id in relevance_mat.index:
  relevance = relevance_mat.loc[c_id].values
  pred = scores.loc[c_id].values

  top_indices = pred.argsort()[::-1][:5] #top5
    
  relevance_top = relevance[top_indices]
  pred_top = pred[top_indices]

  pred_binary = (pred_top >= 0.5).astype(int)

  rec = recall_score(relevance_top, pred_binary, average='macro')

  mar_values.append(rec)

mar_mean = np.mean(mar_values)

In [None]:
print("MAR with PCA: (metadata only)", mar_mean)

MAR with PCA: (metadata only) 0.7634296977660974


# Content Based Using Description+Metadata

In [None]:
#train data - week 47 to 50
train = merged_df[(merged_df['week_number']>=47) & (merged_df['week_number']<=50)]

#val data - week 51
val = merged_df[merged_df['week_number']==51]

#test data - week 52
test = merged_df[merged_df['week_number']==52]

### Training

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Function to generate a corpus by combining the metadata and description of the articles.

In [None]:
def description_corpus(article_id):
  article = train[train['article_id'] == article_id]
  prod_type = ' '.join(article['product_type_name'])
  prod_group = ' '.join(article['product_group_name'])
  graph_appearance = ' '.join(article['graphical_appearance_name'])
  colour_group = ' '.join(article['colour_group_name'])
  perceived_color = ' '.join(article['perceived_colour_master_name'])
  dept_name = ' '.join(article['department_name'])
  index_name = ' '.join(article['index_name'])
  garment_group = ' '.join(article['garment_group_name'])

  lst = [prod_type, prod_group, graph_appearance, colour_group, perceived_color, dept_name, index_name, garment_group]

  detail_description = article['detail_desc']
  detail_desc = ''.join(detail_description)
  d = re.sub("[^a-zA-Z]"," ", detail_desc)
  # print(d)
  words = d.lower().split()
  #print(words)
  #removing stop words
  stops = set(stopwords.words('english'))
  list_without_stopwords = [w for w in words if not w in stops]
  # print(list_without_stopwords)
  wnl = WordNetLemmatizer()
  corpus = list(set([wnl.lemmatize(word) for word in list_without_stopwords]))
  corpus = corpus + lst
  #print(corpus)
  
  return corpus

In [None]:
new_df = pd.DataFrame(columns = ['article_id', 'desc_corpus'])

**Helper functions**

In [None]:
def find_article_id(idx):
    return new_df.iloc[idx]['article_id']

def find_article_name(article_id):
  return (article_id, articles_df[articles_df['article_id'] == article_id]['prod_name'].values[0])

In [None]:
#Dropping empty descriptions
train = train.dropna(subset=['detail_desc'])
test = test.dropna(subset=['detail_desc'])

In [None]:
train_df_filtered = train[train['customer_id'].isin(test['customer_id'])]
test_df_filtered = test[test['customer_id'].isin(train['customer_id'])]

In [None]:
train_df_filtered.shape

(47734, 34)

In [None]:
train_df_filtered.article_id.nunique()

16558

In [None]:
for article_id in train_df_filtered['article_id'].unique():
  v = description_corpus(article_id)
  new_df = new_df.append({'article_id' : article_id, 'desc_corpus': v}, ignore_index = True)

In [None]:
train_purchases_df = pd.merge(new_df, train_df_filtered, on="article_id", how="inner")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

def vectorize_articles_desc():
  articles_corpus_lst = []
  for i in new_df['desc_corpus']:
    line = " ".join(i)
    articles_corpus_lst.append(line)
  
  X = tfidf.fit_transform(articles_corpus_lst)
  return X

In [None]:
X = vectorize_articles_desc()

### Evaluation

In [None]:
def recommend_top_5(cust_id):
    article_corpus = X
    # Get the item description for the customer
    item_desc = train_purchases_df.loc[train_purchases_df['customer_id'] == cust_id].values[0][1]

    # Convert the item description to a string and calculate the TF-IDF vector
    item_desc_str = ' '.join(item_desc)
    vect = tfidf.transform([item_desc_str])

    # Calculate the cosine similarity between the item description and all articles in the corpus
    similarity_scores = cosine_similarity(vect, article_corpus)[0]

    # Sort the articles in descending order of similarity score and exclude items with a similarity score of 1
    similar_articles_idx = np.argsort(-similarity_scores)
    similar_articles_idx = [i for i in similar_articles_idx if similarity_scores[i] != 1]
    #print(similar_articles_idx)

    # Find the top 5 recommended articles
    recommended_articles = []
    for i in similar_articles_idx[:5]:
        article_id = find_article_id(i)
        recommended_articles.append(article_id)
    
    return recommended_articles

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import average_precision_score

def evaluate_map(recommended_articles):
    # Calculate the Mean Average Precision (MAP)
    purchased_items = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
    precision = []
    for i, article in enumerate(recommended_articles):
        if article in purchased_items:
            precision.append(sum([1 for j in recommended_articles[:i+1] if j in purchased_items])/(i+1))
    if len(precision) == 0:
        map_score = 0
    else:
        map_score = np.mean(precision)
    #print("MAP:", map_score)

    return map_score

In [None]:
def evaluate_ndcg(recommended_articles):
    # Calculate the NDCG
    purchased_items = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
    relevance = []
    for i, article in enumerate(recommended_articles):
      if article in purchased_items:
        relevance.append(1)
      else:
        relevance.append(0)
    
    if sum(relevance) == 0:
      ndcg_score = 0
    else:
      dcg = relevance[0]
      for i in range(1, len(relevance)):
        dcg+=relevance[i] / np.log2(i+1)
      idcg = sum(sorted(relevance, reverse=True)[:sum(relevance)])
      ndcg_score = dcg/idcg
    return ndcg_score

In [None]:
def evaluate_recall(recommended_articles):
    # Calculate the recall score
    purchased_items = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
    true_positives = 0
    false_negatives = 0
    for i, article in enumerate(recommended_articles):
        if article in purchased_items:
            true_positives += 1
        else:
            false_negatives += 1
    if true_positives == 0:
        recall_score = 0
    else:
        recall_score = true_positives / (true_positives + false_negatives)
    #print("Recall:", recall_score)

    return recall_score

In [None]:
all_map = []
all_ndcg = []
all_recall = []

for cust_id in train_purchases_df['customer_id'].unique():
  recommendations = recommend_top_5(cust_id)
  map = evaluate_map(recommendations) #Calculate MAP
  ndcg = evaluate_ndcg(recommendations) #Calculate NDCG
  recall = evaluate_recall(recommendations) #Calculate Recall
  all_map.append(map)
  all_ndcg.append(ndcg)
  all_recall.append(recall)

map_final = np.mean(all_map)
ndcg_final = np.mean(all_ndcg)
recall_final = np.mean(all_recall)

print("MAP using description+metadata:", map_final)
print("NDCG using description+metadata:", ndcg_final)
print("Recall using description+metadata:", recall_final)

MAP using description+metadata: 0.011086290528465662
NDCG using description+metadata: 0.013928597235114765
Recall using description+metadata: 0.0035486341587949964


# Content Based Using Description 

### Training

In [None]:
def description_corpus(article_id):
  article = train[train['article_id'] == article_id]

  detail_description = article['detail_desc']
  detail_desc = ''.join(detail_description)
  d = re.sub("[^a-zA-Z]"," ", detail_desc)

  words = d.lower().split()

  #removing stop words
  stops = set(stopwords.words('english'))
  list_without_stopwords = [w for w in words if not w in stops]
  # print(list_without_stopwords)
  wnl = WordNetLemmatizer()
  corpus = list(set([wnl.lemmatize(word) for word in list_without_stopwords]))
  
  #print(corpus)
  
  return corpus

In [None]:
new_df = pd.DataFrame(columns = ['article_id', 'desc_corpus'])

In [None]:
for article_id in train_df_filtered['article_id'].unique():
  v = description_corpus(article_id)
  new_df = new_df.append({'article_id' : article_id, 'desc_corpus': v}, ignore_index = True)

In [None]:
new_df

Unnamed: 0,article_id,desc_corpus
0,573085033,"[pocket, stretch, washed, zip, waist, leg, jea..."
1,573085038,"[pocket, stretch, washed, zip, waist, leg, jea..."
2,215589001,"[space, tummy, denier, tights, growing, extra]"
3,561277001,"[shape, opaque, encouraging, tummy, denier, bl..."
4,699493001,"[seam, long, lace, edge, scalloped, dress, par..."
...,...,...
16553,661644005,"[scarf, detail, elasticated, satin, scrunchie]"
16554,665507001,"[airy, lace, trim, neckline, sleeve, butterfly..."
16555,820165002,"[ribbed, fitted, top, jersey, short]"
16556,644763002,"[lace, pair, fastening, shape, bust, narrow, d..."


In [None]:
train_purchases_df = pd.merge(new_df, train_df_filtered, on="article_id", how="inner")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

def vectorize_articles_desc():
  articles_corpus_lst = []
  for i in new_df['desc_corpus']:
    line = " ".join(i)
    articles_corpus_lst.append(line)
  
  X = tfidf.fit_transform(articles_corpus_lst)
  return X

In [None]:
X = vectorize_articles_desc()

In [None]:
X

<16558x1401 sparse matrix of type '<class 'numpy.float64'>'
	with 243901 stored elements in Compressed Sparse Row format>

### Evaluation

In [None]:
all_map = []
all_ndcg = []
all_recall = []

for cust_id in train_purchases_df['customer_id'].unique():
  recommendations = recommend_top_5(cust_id)
  map = evaluate_map(recommendations) #Calculate MAP
  ndcg = evaluate_ndcg(recommendations) #Calculate NDCG
  recall = evaluate_recall(recommendations) #Calculate Recall
  all_map.append(map)
  all_ndcg.append(ndcg)
  all_recall.append(recall)

map_final = np.mean(all_map)
ndcg_final = np.mean(all_ndcg)
recall_final = np.mean(all_recall)

print("MAP using description+metadata:", map_final)
print("NDCG using description+metadata:", ndcg_final)
print("Recall using description+metadata:", recall_final)

MAP using description+metadata: 0.007964215811420304
NDCG using description+metadata: 0.010807598509243795
Recall using description+metadata: 0.00273168241000766


# Content Based Using Image Embeddings

The image embeddings were trained on local machine using ResNet50 pretrained network.

In [None]:
#Loading the embeddings pickle files

pickle1 = '/content/drive/MyDrive/H&M dataset/hm_embeddings_100523_01.pkl'
pickle2 = '/content/drive/MyDrive/H&M dataset/hm_embeddings_100523_02.pkl'
pickle3 = '/content/drive/MyDrive/H&M dataset/hm_embeddings_100523_03.pkl'
pickle4 = '/content/drive/MyDrive/H&M dataset/hm_embeddings_100523_04.pkl'

In [None]:
df1 = pd.read_pickle(pickle1)
df2 = pd.read_pickle(pickle2)
df3 = pd.read_pickle(pickle3)
df4 = pd.read_pickle(pickle4)

In [None]:
df = pd.concat([df1, df2, df3, df4])

In [None]:
df.shape

(24296, 3)

In [None]:
df.head()

Unnamed: 0,Image Path,Embedding,Image id
0,E:\H&M dataset\filtered_imgs\all\010\010877501...,"[[0.30005416, 1.8002645, 0.0, 0.16126332, 0.18...",108775015
1,E:\H&M dataset\filtered_imgs\all\010\010877504...,"[[0.0, 2.1636605, 0.031609662, 1.3541696, 0.04...",108775044
2,E:\H&M dataset\filtered_imgs\all\010\010877505...,"[[0.6450878, 0.6914531, 0.050121546, 0.3615835...",108775051
3,E:\H&M dataset\filtered_imgs\all\011\011006500...,"[[0.02863922, 2.0282106, 0.08630299, 0.2328322...",110065001
4,E:\H&M dataset\filtered_imgs\all\011\011006500...,"[[0.03895648, 1.2950653, 0.0, 0.018433135, 0.0...",110065002


In [None]:
#Finding the similarity matrix
from sklearn.metrics.pairwise import cosine_similarity

embeddings_lst = df['Embedding'].to_list()
embeddings = np.concatenate(embeddings_lst, axis=0)
sim_matrix = cosine_similarity(embeddings)

In [None]:
df['Image id'] = [int(x) for x in df['Image id']]

In [None]:
merged_df_filtered = merged_df[merged_df['article_id'].isin(df['Image id'])]

In [None]:
merged_df_filtered.shape

(108351, 34)

In [None]:
#Train
train_df = merged_df_filtered[(merged_df_filtered['week_number']>=47) & (merged_df_filtered['week_number']<=50)]

#Val
val_df = merged_df_filtered[merged_df_filtered['week_number'] == 51]

#Test
test_df = merged_df_filtered[merged_df_filtered['week_number'] == 52]

In [None]:
train_df_filtered = train_df[train_df['customer_id'].isin(test_df['customer_id'])]
test_df_filtered = test_df[test_df['customer_id'].isin(train_df['customer_id'])]

In [None]:
train_df_filtered.shape, test_df_filtered.shape

((10646, 34), (7152, 34))

In [None]:
def find_similar_images(img_index, k=5):
    sim_scores = sim_matrix[img_index]
    top_k = sim_scores.argsort()[::-1][1:k+1]
    top_k_paths = df.iloc[top_k]['Image'].values
    top_k_scores = sim_scores[top_k]
    return top_k_paths, top_k_scores     

**Mean Average Precision**

In [None]:
def content_based_using_image(cust_id):
    prev_purchased = train_df_filtered[train_df_filtered['customer_id'] == cust_id]['article_id'].values
    img_location = '0' + str(prev_purchased[0])
    img_subfolder = str(img_location)[0:3]
    
    path = images_dir+'\\'+img_subfolder+'\\'+img_location+'.jpg'
    
    index = df.loc[df['Image Path'] == path].index
    if len(index)!=0:
        idx = index.values[0]
    
        paths, scores = find_similar_images(idx, 5)
    
        recommended_articles = []
        for path in paths:
            i = path.split('\\')[4].split('.')[0]
            recommended_articles.append(i)
    
        purchased_articles = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
        precision = []
        for i, article in enumerate(recommended_articles):
            if article in purchased_articles:
                precision.append(sum([1 for j in recommended_articles[:i+1] if j in purchased_articles])/(i+1))
    
        if len(precision) == 0:
            map_score = 0
        else:
            map_score = np.mean(precision)
    else:
        map_score = 0
    return map_score

In [None]:
all_map = []
for cust_id in train_df_filtered['customer_id'].unique():
    mean_avg_precision = content_based_using_image(cust_id)
    all_map.append(mean_avg_precision)

In [None]:
np.mean(all_map)

0.0

**Recall**

In [None]:
def content_based_using_image_recall(cust_id):
    prev_purchased = train_df_filtered[train_df_filtered['customer_id'] == cust_id]['article_id'].values
    img_location = '0' + str(prev_purchased[0])
    img_subfolder = str(img_location)[0:3]
    
    path = images_dir+'\\'+img_subfolder+'\\'+img_location+'.jpg'
    
    index = df.loc[df['Image Path'] == path].index
    if len(index)!=0:
        idx = index.values[0]
    
        paths, scores = find_similar_images(idx, 5)
    
        recommended_articles = []
        for path in paths:
            i = path.split('\\')[4].split('.')[0]
            recommended_articles.append(i)
    
        purchased_articles = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
        tp = 0
        fn = 0
        for i, article in enumerate(recommended_articles):
            if article in purchased_articles:
                tp+=1
            else:
                fn+=1
        if tp == 0:
            recall_score=0
        else:
            recall_score = tp/(tp+fn)
    else:
        recall_score = 0
    return recall_score

In [None]:
all_recall = []
for cust_id in train_df_filtered['customer_id'].unique():
    recall = content_based_using_image_recall(cust_id)
    all_recall.append(recall)

In [None]:
np.mean(all_recall)

0.0

**NDCG**

In [None]:
def content_based_using_image_ndcg(cust_id):
    prev_purchased = train_df_filtered[train_df_filtered['customer_id'] == cust_id]['article_id'].values[0]
    img_location = '0' + str(prev_purchased)
    img_subfolder = str(img_location)[0:3]
    
    path = images_dir+'\\'+img_subfolder+'\\'+img_location+'.jpg'
    
    index = df.loc[df['Image Path'] == path].index
    if(len(index)!=0):
        idx = index.values[0]
    
        paths, scores = find_similar_images(idx, 5)
    
        recommended_articles = []
        for path in paths:
            i = path.split('\\')[4].split('.')[0]
            recommended_articles.append(i)
    
        purchased_articles = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
        relevance = []
        for i, article in enumerate(recommended_articles):
            if article in purchased_articles:
                relevance.append(1)
            else:
                relevance.append(0)
    
        if sum(relevance) == 0:
            ndcg_score = 0
        else:
            dcg = relevance[0]
            for i in range(1, len(relevance)):
                dcg += relevance[i] / np.log2(i+1)
            idcg = sum(sorted(relevance, reverse=True)[:sum(relevance)])
            ndcg_score = dcg / idcg
    else:
        ndcg_score=0
    
    return ndcg_score

In [None]:
ndcg_scores = []
for cust_id in train_df_filtered['customer_id'].unique():
    ndcg_score = content_based_using_image_ndcg(cust_id)
    ndcg_scores.append(ndcg_score)
    
mean_ndcg = np.mean(ndcg_scores)

In [None]:
mean_ndcg

0.0

**Using euclidean distance to produce more diverse recommendations**

In [None]:
df['Embedding'].shape

(24296,)

In [None]:
from sklearn.metrics.pairwise import euclidean_distances

embeddings_lst = df['Embedding'].to_list()
embeddings = np.concatenate(embeddings_lst, axis=0)
sim_matrix = euclidean_distances(embeddings)

def find_similar_images(img_index, k=5):
    sim_scores = sim_matrix[img_index]
    top_k = sim_scores.argsort()[1:k+1]
    top_k_paths = df.iloc[top_k]['Image Path'].values
    top_k_scores = sim_scores[top_k]
    return top_k_paths, top_k_scores

In [None]:
def content_based_using_image_recall():
    recall_scores = []
    for cust_id in test_df_filtered['customer_id'].unique():
        prev_purchased = train_df_filtered[train_df_filtered['customer_id'] == cust_id]['article_id'].values
        img_location = '0' + str(prev_purchased[0])
        img_subfolder = str(img_location)[0:3]

        path = images_dir+'\\'+img_subfolder+'\\'+img_location+'.jpg'

        index = df.loc[df['Image Path'] == path].index
        if len(index)!=0:
            idx = index.values[0]

            paths, scores = find_similar_images(idx, len(test_df_filtered[test_df_filtered['customer_id'] == cust_id]))

            recommended_articles = []
            for path in paths:
                i = path.split('\\')[4].split('.')[0]
                recommended_articles.append(i)

            purchased_articles = test_df_filtered[test_df_filtered['customer_id'] == cust_id]['article_id'].values
            tp = 0
            fn = 0
            for i, article in enumerate(recommended_articles):
                if article in purchased_articles:
                    tp+=1
                else:
                    fn+=1
            if tp == 0:
                recall_score=0
            else:
                recall_score = tp/(tp+fn)
        else:
            recall_score = 0
        recall_scores.append(recall_score)
    
    return sum(recall_scores)/len(recall_scores)


In [None]:
content_based_using_image_recall()

0.0