# **Load Data**

In [1]:
import pandas as pd
import os

# Relative paths to the competition CSV files.
# fname_cus and fname_article are defined for completeness; no cell shown here reads them.
fname_tran ='../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
fname_cus ='../input/h-and-m-personalized-fashion-recommendations/customers.csv'
fname_article ='../input/h-and-m-personalized-fashion-recommendations/articles.csv'

In [2]:
# Read the full transaction log (a comma is already read_csv's default separator)
# and display it.
data = pd.read_csv(fname_tran)
data

# **Data Preprocessing**

In [5]:
data.duplicated().sum()  # count rows that exactly duplicate an earlier row (nothing is removed here)

In [6]:
# Drop rows that exactly duplicate an earlier row, then display the result.
# NOTE(review): 'count' is later derived as one per remaining row, so identical rows
# removed here no longer contribute to it — confirm they are not genuine repeat purchases.
data = data.drop_duplicates()
data

In [4]:
# Keep only transactions dated after 2020-06-22 (the latest 3 months) for model building.
# t_dat is compared as text; this assumes ISO 'YYYY-MM-DD' strings — TODO confirm.
recent_mask = data['t_dat'] > "2020-06-22"
data = data[recent_mask]

In [7]:
# Keep only the columns used downstream and preview the first 10 rows.
using_cols = ['customer_id', 'article_id', 'price']
data = data[using_cols]
data.head(10)

In [8]:
# Give every transaction row a purchase count of 1; summing these after the
# groupby yields how many times a customer bought the same article.
# assign() builds a new frame instead of writing into a column subset of the
# earlier frame, which avoids pandas' SettingWithCopyWarning and keeps the cell
# safe to re-run.
data = data.assign(count=1)

In [9]:
# Collapse to one row per (customer_id, article_id) pair. sum() is applied to every
# remaining column, so 'count' becomes the number of rows for the pair and 'price'
# the total of their prices.
data = data.groupby(['customer_id','article_id'], as_index=False).sum()

In [10]:
# Display the aggregated (customer, article) table.
data

In [13]:
# Distinct customer and article ids; their positions in these arrays become the
# dense numerical indices assigned to each user and article below.
user_unique = data['customer_id'].unique()
article_unique = data['article_id'].unique()

In [14]:
# Display the array of distinct customer ids.
user_unique

In [15]:
# Forward lookups: original id -> dense 0-based index (its position in the unique array).
user_to_idx = {v:k for k,v in enumerate(user_unique)}
article_to_idx = {v:k for k,v in enumerate(article_unique)}

# Reverse lookup: dense index -> original article id. Later cells use it to turn
# recommended indices back into article ids; without this definition they fail
# with NameError on a fresh kernel.
idx_to_article = dict(enumerate(article_unique))

In [17]:
# Swap the original ids in `data` for their dense integer indices.
# A column is only overwritten when every one of its values was found in the lookup.
temp_user_data = data['customer_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(data):
    print('detect null')
else:
    print('no-null')
    data['customer_id'] = temp_user_data

temp_artist_data = data['article_id'].map(article_to_idx.get).dropna()
if len(temp_artist_data) != len(data):
    print('detect null')
else:
    print('no-null')
    data['article_id'] = temp_artist_data

data

In [18]:
# Frequency of each per-pair purchase count.
data['count'].value_counts()

# Model Building

In [19]:
from scipy.sparse import csr_matrix

# Matrix dimensions: number of distinct customers (rows) and distinct articles (columns).
num_user = data['customer_id'].nunique()
num_artist = data['article_id'].nunique()

# Sparse customer x article matrix whose entries are the purchase counts.
csr_data = csr_matrix(
    (data['count'], (data['customer_id'], data['article_id'])),
    shape=(num_user, num_artist),
)
csr_data

In [20]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

In [23]:
# Optional environment settings (BLAS/MKL thread limits, duplicate-lib override), currently disabled:
# os.environ['OPENBLAS_NUM_THREADS']='1'
# os.environ['KMP_DUPLICATE_LIB_OK']='True'
# os.environ['MKL_NUM_THREADS']='1'
# Author's note: articles have 12 features, customers an estimated 6, and transaction data 5.
# ALS model configured with 360 factors, regularization 0.01, 5 iterations, float32, GPU enabled.
als_model = AlternatingLeastSquares(factors=360, regularization=0.01, use_gpu=True, iterations=5, dtype=np.float32, calculate_training_loss=True)

In [24]:
# Transpose to an article x customer matrix (csr_data is customer x article); this is what fit() is given.
csr_data_transpose = csr_data.T
csr_data_transpose

In [25]:
# Train the ALS model on the article x customer matrix.
# NOTE(review): `history` is not used by any cell shown here — confirm fit() returns something worth keeping.
history = als_model.fit(csr_data_transpose)

In [31]:
# Sanity-check the trained model on one hard-coded customer id.
user = user_to_idx['00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657']
# Request 12 articles; filter_already_liked_items=True presumably excludes articles
# this customer already has in csr_data — verify against the implicit version in use.
article_recommended = als_model.recommend(user, csr_data, N=12, filter_already_liked_items=True)
article_recommended

In [32]:
# Translate each recommendation's first element (the article index) back to an
# article id through idx_to_article, then print the ids space-separated.
k = [str(idx_to_article[i[0]]) for i in article_recommended]
print(' '.join(k))

In [33]:
from IPython.display import Image
# Show the product image for the recommendation at position `num` in k.
# The path is built as images/0<first two characters of the id>/0<id>.jpg.
num = 1
Image(f'../input/h-and-m-personalized-fashion-recommendations/images/0{str(k[num])[:2]}/0{int(k[num])}.jpg' , width = 200)

# **Output result to csv**

In [34]:
# Reload the raw transaction file. In the cells shown here it is only used to
# display its customer_id column.
data_R = pd.read_csv(fname_tran, sep=',')

In [36]:
# Load the sample submission; its customer_id column supplies the customers to predict for.
submission=pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [37]:
# Display the customer_id column of the reloaded transactions.
data_R['customer_id']

In [38]:
# One row per customer_id (first occurrence kept), re-indexed from 0.
users_sub = submission.drop_duplicates(['customer_id'], keep = 'first', ignore_index = True)

In [40]:
# Display the de-duplicated submission customers.
users_sub

In [41]:
# Spot check: is the first submission customer one the model was trained on?
first_customer = users_sub['customer_id'].iloc[0]
if first_customer in user_to_idx:
    print("contain")

In [43]:
# Customer ids that run() iterates over when generating predictions.
users_a = users_sub['customer_id']

In [44]:
import time  # Generating the full set of predictions takes around 2 hours

def run(users=None, model=None, user_items=None, user_index=None, article_index=None, n_items=12):
    """Build one space-separated prediction string per customer.

    Every argument is optional; when omitted, the notebook-level objects are used,
    so a plain ``run()`` call keeps working.

    Parameters
    ----------
    users : iterable of customer ids with a length (default: ``users_a``).
    model : object exposing ``recommend(user_id, user_items, N=..., filter_already_liked_items=...)``
        that yields items whose first element is an article index (default: ``als_model``).
    user_items : customer x article matrix handed to ``recommend`` (default: ``csr_data``).
    user_index : dict mapping customer id -> row index (default: ``user_to_idx``).
    article_index : dict mapping article id -> column index (default: ``article_to_idx``).
    n_items : number of articles requested per customer (default: 12).

    Returns
    -------
    list of str
        One entry per customer, in input order; article ids are left-padded with
        zeros to 10 characters and joined by single spaces.
    """
    users = users_a if users is None else users
    model = als_model if model is None else model
    user_items = csr_data if user_items is None else user_items
    user_index = user_to_idx if user_index is None else user_index
    article_index = article_to_idx if article_index is None else article_index

    # Derive the index -> article id lookup here rather than relying on a global
    # that may not exist in a fresh kernel.
    index_to_article = {idx: article for article, idx in article_index.items()}

    result_list = []
    time_count = 0
    total = len(users)

    for num, user in enumerate(users):
        start = time.time()

        # Customers without training data have no row of their own; they are
        # served the recommendations of row 0 (existing behaviour, kept as-is).
        user_id = user_index.get(user, 0)

        article_recommended = model.recommend(user_id, user_items, N=n_items, filter_already_liked_items=False)

        # zfill restores the leading zero(s) of the 10-character article id format.
        lists = [str(index_to_article[i[0]]).zfill(10) for i in article_recommended]
        result_list.append(' '.join(lists))

        time_count += time.time() - start

        # Progress: a dot every 1,000 customers, a timing line every 10,000.
        if num % 1000 == 0:
            print('.', end=' ')
        if num % 10000 == 0:
            # time_count holds the seconds accumulated since the last timing line.
            time_left = ((total - num) / 10000) * time_count / 60
            print(f"{time_count:.5f} sec / TIME_LEFT(min): ", time_left)
            time_count = 0
    return result_list

In [45]:
# Generate the prediction string for every customer in users_a.
result_lists = run()

In [46]:
# One prediction string per submission row; any row index beyond the end of
# result_lists receives the fixed 12-article default string instead.
final_list = [result_lists[i] if i<len(result_lists) else '0110065001 0110065001 0699080001 0838055001 0351484027 0351484033 0458543009 0715303001 0759871001 0566140001 0351484026 0399223029' for i in range(len(submission))]

In [47]:
# Overwrite the sample submission's columns with the de-duplicated customer ids
# and the generated predictions, then display the result.
submission['customer_id'] = users_sub['customer_id']
submission['prediction'] = final_list
submission

In [49]:
# Write the submission to CSV without the DataFrame index column.
submission.to_csv('prediction_3month.csv', index=False)

<a href="./prediction_3month.csv"> Download File </a>