In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from datetime import datetime
from PIL import Image



In [2]:
# Show the top-level entries of the competition dataset directory.
dataset_entries = os.listdir('/kaggle/input/h-and-m-personalized-fashion-recommendations/')
print(f"files and folders: {dataset_entries}")

# Count the entries directly under the images directory.
image_subfolders = os.listdir("/kaggle/input/h-and-m-personalized-fashion-recommendations/images")
print("Subfolders in images folder: ", len(image_subfolders))

files and folders: ['sample_submission.csv', 'articles.csv', 'transactions_train.csv', 'images', 'customers.csv']
Subfolders in images folder:  86


In [3]:
# Walk the whole dataset tree once and collect:
#   total_folders - number of directories encountered
#   total_files   - number of files encountered
#   folder_info   - (directory name, number of entries directly inside it)
#   images_names  - names of the .jpg files with the extension removed
total_folders = total_files = 0
folder_info = []
images_names = []
for base, dirs, files in tqdm(os.walk('/kaggle/input/h-and-m-personalized-fashion-recommendations/')):
    for dir_name in dirs:
        entry_count = len(os.listdir(os.path.join(base, dir_name)))
        folder_info.append((dir_name, entry_count))
    total_folders += len(dirs)
    total_files += len(files)
    for file_name in files:
        # Compare the real extension rather than splitting on ".jpg": splitting also
        # accepts names such as "x.jpg.tmp" and a bare ".jpg" (which yields an empty name).
        stem, ext = os.path.splitext(file_name)
        if ext == ".jpg":
            images_names.append(stem)

88it [01:56,  1.33s/it]


In [4]:
# Report the totals gathered by the directory walk.
print(f"Total number of folders: {total_folders}\nTotal number of files: {total_files}")

# One row per directory; display the five with the most entries.
folder_info_df = pd.DataFrame(data=folder_info, columns=["folder", "files count"])
folder_info_df.sort_values(by="files count", ascending=False).head(n=5)

Total number of folders: 87
Total number of files: 105104


Unnamed: 0,folder,files count
71,69,4187
45,73,3438
31,62,3361
36,68,3313
26,70,3309


In [5]:
# Distinct folder names, in order of first appearance.
print("folder names: ", folder_info_df["folder"].unique().tolist())

folder names:  ['images', '057', '086', '061', '048', '053', '051', '095', '018', '044', '016', '012', '029', '025', '078', '056', '042', '082', '055', '076', '091', '094', '027', '080', '041', '036', '070', '071', '035', '026', '065', '062', '084', '034', '058', '060', '068', '075', '033', '049', '023', '020', '013', '050', '052', '073', '066', '067', '022', '043', '054', '081', '047', '011', '087', '072', '021', '015', '059', '014', '039', '040', '090', '064', '063', '031', '092', '017', '083', '019', '024', '069', '093', '089', '037', '046', '045', '010', '088', '077', '028', '038', '074', '079', '032', '030', '085']


In [13]:
# Load the four competition tables.
# articles.csv is read entirely as text so identifiers keep their exact spelling.
articles_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv", dtype=str)
customers_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv")
sample_submission_df = pd.read_csv("/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv")
# Keep article_id as text here too: parsed as an integer it would lose any leading zeros
# and would no longer match the string article_id in articles_df. The other reads of this
# file in the notebook already request string ids.
transactions_train_df = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": str},
)

In [14]:
# Keep the article key plus the descriptive "*_name" columns only.
article_columns = [
    'article_id',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]
articles_df = articles_df[article_columns]

In [15]:
# Categorical article columns that get one-hot encoded further down.
feature_subset = [
    'product_group_name',
    'product_type_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

In [16]:
# One-hot encode the columns listed in feature_subset; the remaining columns pass through as-is.
dum = pd.get_dummies(data=articles_df, columns=feature_subset)

In [17]:
import pickle

# Save the one-hot encoded article frame; the context manager closes the file afterwards.
with open('article_embeddings_from_features.pickle', mode='wb') as out_file:
    pickle.dump(dum, out_file)

In [18]:
# Read only the two id columns of the transaction log, both as text.
df = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype=str,
    usecols=['customer_id', 'article_id'],
)

In [19]:
# Attach the article attributes to every transaction row, then drop the join key.
# Note: this rebinds `df`, so the cell cannot be re-run without reloading `df` first
# (article_id is gone after the first run).
df = (
    df.merge(articles_df, how='inner', on='article_id')
      .drop(columns=['article_id'])
)


In [20]:
# Sum every remaining column of `df` within each customer.
# NOTE(review): as built in the preceding cell, `df` carries the string-valued article
# attributes rather than the one-hot frame `dum` — confirm this is the intended input.
customers = df.groupby(by='customer_id').sum()


In [22]:
# Write articles_df to CSV without the row index.
# NOTE(review): the file name says "embeddings" but articles_df holds the categorical
# columns, while the one-hot frame lives in `dum` — confirm which one should be exported.
articles_df.to_csv(path_or_buf='articles_embeddings_from_features.csv', index=False)


In [23]:
# Columns needed to count purchases per (customer, article) pair.
col_list = [
    'customer_id',
    'article_id',
    'sales_channel_id',
]
# Everything is read as text.
df1 = pd.read_csv(
    '../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv',
    dtype=str,
    usecols=col_list,
)

In [24]:
# For each (customer, article) pair, count the non-null values in the remaining column(s).
ratings = df1.groupby(by=['customer_id', 'article_id']).count()


In [25]:
# Flatten the (customer_id, article_id) index of `ratings` into ordinary columns and
# expose the per-pair count as `rating`.
# reset_index does this directly, without first copying every index entry into a
# Python tuple, and yields the same three columns in the same order.
df2 = (
    ratings['sales_channel_id']
    .rename('rating')
    .reset_index()
)

In [26]:
# Export the ratings table; the row index is written as well because `index` is left at its default.
df2.to_csv(path_or_buf='ratings.csv')


In [27]:
# Drop the name now that the ratings table has been written to ratings.csv.
del df2


In [29]:
# Group the (customer, article) pairs by customer; the groups are consumed in the next cell.
seq = df1.loc[:, ['customer_id', 'article_id']].groupby(by='customer_id')


In [30]:
# Build two parallel lists: each customer id and the list of article ids in that
# customer's group, in the group's row order.
customers, articles = [], []
for customer_id, purchases in seq:
    customers.append(customer_id)
    articles.append(purchases.article_id.values.tolist())

In [31]:
# One row per customer: the customer id and that customer's list of article ids.
seq_df = pd.DataFrame(
    data={
        'customer': customers,
        'sequence': articles,
    }
)


In [32]:
# Export the per-customer sequences; the row index is written too (`index` left at its default).
seq_df.to_csv(path_or_buf='customer_sequence.csv')

In [35]:
import warnings
import matplotlib.pyplot as plt  # repeated here; already imported in the first cell
# Suppress all warnings from this point on. The filter is installed before the
# tensorflow_hub import below, so it also covers anything that import would warn about.
warnings.filterwarnings('ignore')
import tensorflow_hub as hub

In [36]:
#@title Load the Universal Sentence Encoder's TF Hub module

# Handle of the module to load; the trailing #@param annotation lists the two selectable handles.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" #@param ["https://tfhub.dev/google/universal-sentence-encoder/4", "https://tfhub.dev/google/universal-sentence-encoder-large/5"]
# Load the module; `model` is later called directly on the array of description strings.
model = hub.load(module_url)

In [41]:
path = '/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv'

# Read article_id as text, then cast every column to str.
# NOTE(review): the cast turns any missing value into the literal string 'nan' — confirm
# that is acceptable for detail_desc, which is embedded as text below.
df3 = pd.read_csv(path, dtype={'article_id': str})
df3 = df3.astype(str)

In [42]:
# Distinct description strings, so each one is encoded only once.
desc = pd.unique(df3['detail_desc'])


In [43]:
%%time
# Encode all unique descriptions in a single call to the loaded TF Hub module.
embeds = model(desc)

CPU times: user 12.7 s, sys: 9.28 s, total: 21.9 s
Wall time: 12.1 s


In [44]:
# Lookup table: description text -> its embedding, each converted with .numpy().
desc_embedding_pairs = dict(zip(desc, (vector.numpy() for vector in embeds)))

In [45]:
# Keep only the key and the description text.
# Take an explicit copy: a new column is assigned onto this frame in the next cell, and
# assigning onto a column-subset slice is what triggers pandas' SettingWithCopy warning.
df3 = df3[['article_id', 'detail_desc']].copy()

In [46]:
# Look up each row's description in the text -> embedding table.
df3['embeddings'] = df3['detail_desc'].map(desc_embedding_pairs)


In [47]:
# Keep just the article key and its embedding.
df3 = df3.loc[:, ['article_id', 'embeddings']]


In [48]:
# Persist the article text embeddings built in the cells above (df3: article_id + embedding).
# `df` is the transactions frame, not the embeddings, so df3 is what belongs in this file.
# The context manager makes sure the file is flushed and closed once the dump finishes.
with open('article_embeddings_from_text.pickle', 'wb') as f:
    pickle.dump(df3.values, f)

In [49]:
%%time
# Per-customer mean of every column of `df`.
# NOTE(review): in the visible cells `df` was last assigned as the transactions merged with
# the categorical article attributes, not the text embeddings held in `df3` — confirm which
# frame is meant, given the result is saved as customer_embeddings_from_text.pickle.
customer_embeddings = df.groupby('customer_id').mean()

CPU times: user 3min 17s, sys: 10.5 s, total: 3min 27s
Wall time: 3min 27s


In [50]:
# Open the output file in binary write mode; the handle is used by the next cell.
f = open('customer_embeddings_from_text.pickle', mode='wb')


In [51]:
# Write the customer embeddings to the handle opened in the previous cell.
# Always close the handle afterwards so buffered data reaches the file and the
# descriptor is released, whether or not the dump succeeds.
try:
    pickle.dump(customer_embeddings, f)
finally:
    f.close()