# 1. Dependency

In [None]:
!pip install git+https://github.com/mayukh18/reco
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from reco.recommender import FunkSVD
from reco.metrics import rmse
import datetime
from collections import Counter
from datetime import timedelta

# 2. Data preparation

In [None]:
# Load the purchase log; article_id is read as str so the ids stay text.
transaction = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={'article_id': str},
    header=0,
)

# Parse the purchase date column into datetimes so it can be compared with
# datetime cut-offs in the cells below.
transaction["t_dat"] = transaction["t_dat"].pipe(pd.to_datetime)

# 3. Rank articles by popularity

In [None]:
# in recent 8 weeks
# .copy() gives train_pop its own data, so the column assignments below never
# write into a slice of `transaction` (no SettingWithCopyWarning / lost writes).
train_pop = transaction.loc[transaction["t_dat"] >= datetime.datetime(2020, 7, 28)].copy()

# Age of every purchase relative to the newest purchase in the window.
train_pop['num'] = train_pop["t_dat"].max() - train_pop["t_dat"]
# Recency weight: 1 for the newest day, 1/2 for the day before, and so on.
train_pop['pop_factor'] = 1 / (train_pop['num'].dt.days + 1)
# Popularity of an article = sum of the recency weights of its purchases.
popular_items_group = train_pop.groupby(["article_id"])['pop_factor'].sum()

# Article ids ordered from most to least popular (ties broken by article id,
# descending), kept as a tuple so later cells can slice it.
popular_items = tuple(
    article_id
    for _score, article_id in sorted(
        zip(popular_items_group, popular_items_group.keys()), reverse=True
    )
)

# 4. item2item

In [None]:
def get_most_freq_next_item(user_group, min_count=5, min_ratio=0.1, show_progress=True):
    """Map each item to the item that most often directly follows it.

    Parameters
    ----------
    user_group : mapping (dict or pandas Series)
        Maps a user key to that user's sequence of items. Must support
        ``.keys()`` and ``user_group[user]``.
    min_count : int, default 5
        An item is only considered when it has at least this many recorded
        successors across all users.
    min_ratio : float, default 0.1
        The most frequent successor is only kept when it accounts for at
        least this share of the item's successors.
    show_progress : bool, default True
        Wrap the iteration over users in ``tqdm``.

    Returns
    -------
    dict
        ``{item: most_frequent_next_item}`` for every item passing both
        thresholds.

    Notes
    -----
    Self-transitions (an item immediately followed by the same item) are
    counted like any other transition.
    """
    users = user_group.keys()
    if show_progress:
        users = tqdm(users)

    # item -> list of every item seen directly after it, over all users.
    next_items = {}
    for user in users:
        items = user_group[user]
        for item, following in zip(items[:-1], items[1:]):
            next_items.setdefault(item, []).append(following)

    pred_next = {}
    for item, followers in next_items.items():
        if len(followers) < min_count:
            continue
        # Counter orders equal counts by first occurrence, so ties resolve
        # to the successor that was seen first.
        top_item, top_count = Counter(followers).most_common(1)[0]
        if top_count / len(followers) >= min_ratio:
            pred_next[item] = top_item

    return pred_next

In [None]:
# recent 4 weeks;
one_day = timedelta(days=1)

# Midnight of the day four weeks before the newest transaction. normalize()
# drops any time-of-day part directly, instead of slicing the timestamp's
# string form and re-parsing year/month/day.
recent_cutoff = (transaction["t_dat"].max() - one_day * 7 * 4).normalize()

# Per customer: list of purchased article ids inside the window, in the row
# order of `transaction`.
user_group = (
    transaction.loc[transaction["t_dat"] >= recent_cutoff]
    .groupby(["customer_id"])["article_id"]
    .apply(list)
)
# article -> the article most often bought directly after it.
pred_next = get_most_freq_next_item(user_group)
# Plain dict copy of the per-customer histories.
user_group_dict = user_group.to_dict()

## 4.1 FunkSVD

Singular Value Decomposition (SVD) is a method from linear algebra that is widely used as a dimensionality reduction technique in machine learning. SVD is a matrix factorisation technique that reduces the number of features of a dataset by reducing the space dimension from N to K (where K < N). In the context of recommender systems, SVD is used as a collaborative filtering technique. It operates on a matrix in which each row represents a user and each column represents an item; the elements of this matrix are the ratings that users give to items.

In [None]:
# recent 16 weeks;
df = transaction.loc[transaction["t_dat"] >= datetime.datetime(2020, 6, 2), ["customer_id", "article_id", "t_dat"]].copy()

#define pop_factor for each article;
# Purchase age relative to the newest purchase, turned into a recency weight
# (1 for the newest day, 1/2 for the day before, ...), summed per article.
df['num'] = df["t_dat"].max() - df["t_dat"]
df['pop_factor'] = 1 / (df['num'].dt.days + 1)

popular_items_group = df.groupby(["article_id"])['pop_factor'].sum()

# Number of purchases per (customer, article). Only pop_score is aggregated:
# summing every column would also try to sum the datetime column t_dat,
# which pandas does not support.
df['pop_score'] = 1
df = df.groupby(["customer_id", "article_id"])['pop_score'].sum().reset_index()

# Purchase count divided by the article's popularity, looked up per article
# with a vectorised map instead of a row-wise apply.
df['pop_score'] = df['pop_score'] / df['article_id'].map(popular_items_group)
#set max score of pop_score to be 5.0
df['pop_score'] = df['pop_score'].clip(upper=5.0)

In [None]:
df = df[["customer_id", "article_id", 'pop_score']]

# Fixed seed so that a Restart & Run All reproduces the same row order.
SEED = 42
# NOTE(review): FunkSVD's own initialisation is not visible in this notebook;
# seeding NumPy's global RNG only makes the fit repeatable if reco draws its
# initial factors from it - confirm against the reco source.
np.random.seed(SEED)

# shuffling
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Factorise the (customer, article, pop_score) triples.
svd = FunkSVD(k=8, learning_rate=0.008, regularizer = .01, iterations = 80, method = 'stochastic', bias=True)
svd.fit(X=df, formatizer={'user':"customer_id", 'item':"article_id", 'value':'pop_score'},verbose=True)

## 4.2 Most Recent Purchases

Build four train sets, one for each of the four most recent weeks.

In [None]:
def _weekly_purchases(transactions, start, end=None):
    """Slice one time window out of ``transactions`` and list each customer's articles.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must contain the columns ``customer_id``, ``article_id`` and ``t_dat``.
    start : datetime.datetime
        Inclusive lower bound on ``t_dat``.
    end : datetime.datetime, optional
        Exclusive upper bound on ``t_dat``; no upper bound when omitted.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The window's rows with an extra ``cnt`` column (how many rows the
        window holds for that customer/article pair), sorted by ``t_dat`` and
        then ``cnt``, both descending, on a fresh 0..n-1 index; and a Series
        indexed by customer_id holding the list of that customer's article ids
        in that sorted order.
    """
    in_window = transactions["t_dat"] >= start
    if end is not None:
        in_window = in_window & (transactions["t_dat"] < end)
    week = transactions.loc[in_window]

    pair_counts = week.groupby(["customer_id", "article_id"])["t_dat"].agg('count').reset_index()
    pair_counts.columns = ["customer_id", "article_id", 'cnt']

    week = week.merge(pair_counts, on=["customer_id", "article_id"], how='left')
    week = week.sort_values(["t_dat", 'cnt'], ascending=False).reset_index(drop=True)
    return week, week.groupby(["customer_id"])["article_id"].apply(list)


# train1 is the newest window (2020-09-15 onward, no upper bound); train2..train4
# are the three 7-day windows before it, going back to 2020-08-25.
train1, positive_items_per_user1 = _weekly_purchases(
    transaction, datetime.datetime(2020, 9, 15))
train2, positive_items_per_user2 = _weekly_purchases(
    transaction, datetime.datetime(2020, 9, 8), datetime.datetime(2020, 9, 15))
train3, positive_items_per_user3 = _weekly_purchases(
    transaction, datetime.datetime(2020, 9, 1), datetime.datetime(2020, 9, 8))
train4, positive_items_per_user4 = _weekly_purchases(
    transaction, datetime.datetime(2020, 8, 25), datetime.datetime(2020, 9, 1))

In [None]:
# Sample submission file; the prediction loop below reads its customer_id column.
sub = pd.read_csv(
    filepath_or_buffer='../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
)

# 5. Get Result

Iterate over all customers, check whether each one appears in any of these four train sets, and build the corresponding prediction from the result; if not, give the 12 most popular articles as their prediction.

In [None]:
def _rank_recent_items(purchases, user_index, item_indexes):
    """Order one customer's purchases from a single window by model score.

    Parameters
    ----------
    purchases : list
        Article ids the customer bought in the window (repeats allowed).
    user_index : int or None
        Row of the customer in ``svd.userfeatures``; ``None`` when the
        customer is unknown to the model.
    item_indexes : dict
        Article id -> row of that article in ``svd.itemfeatures``.

    Returns
    -------
    list
        At most 12 article ids, taken from the customer's 20 most frequently
        bought articles and sorted by score in ascending order.
    """
    purchase_counts = dict(Counter(purchases).most_common())
    scores = {}
    for article in list(purchase_counts)[:20]:
        item_index = item_indexes.get(article)
        if user_index is None or item_index is None:
            # Customer or article unknown to the model: fall back to the
            # customer's purchase count for this article.
            scores[article] = purchase_counts[article]
        else:
            scores[article] = (
                np.dot(svd.userfeatures[user_index], svd.itemfeatures[item_index].T)
                + svd.item_bias[0, item_index]
            )
    # NOTE(review): the sort is ascending, so the lowest scores come first.
    # Kept as in the original; confirm this is intended and not a missing
    # reverse=True.
    ranked = sorted(scores.items(), key=lambda pair: pair[1])
    return [article for article, _score in ranked][:12]


result = []

userindexes = {user: i for i, user in enumerate(svd.users)}
# Dict lookup instead of svd.items.index(...) for every article of every
# customer; setdefault keeps the first position, exactly like list.index.
itemindexes = {}
for i, item in enumerate(svd.items):
    itemindexes.setdefault(item, i)

# Newest window first: only the first window containing the customer is used.
weekly_purchases = (
    positive_items_per_user1,
    positive_items_per_user2,
    positive_items_per_user3,
    positive_items_per_user4,
)
# The top 12 always suffice for the fill: a customer needing k fillers has at
# most 12 - k articles that could be excluded as already present.
top_popular = list(popular_items[:12])

for user in tqdm(sub["customer_id"].unique()):
    user_output = []
    for purchases_by_user in weekly_purchases:
        if user in purchases_by_user.keys():
            user_output += _rank_recent_items(
                purchases_by_user[user], userindexes.get(user), itemindexes)
            break

    # Append the usual "next item" of each article the customer bought,
    # walking the stored history in reverse order.
    if user in user_group_dict:
        for item in user_group_dict[user][::-1]:
            if item in pred_next and pred_next[item] not in user_output:
                user_output += [pred_next[item]]
    user_output = user_output[:12]

    # Top up with the most popular articles, skipping any already predicted
    # so the 12 slots never contain duplicates.
    if len(user_output) < 12:
        fillers = [article for article in top_popular if article not in user_output]
        user_output += fillers[:12 - len(user_output)]

    assert len(user_output) == 12
    result.append([user, ' '.join(user_output)])

## 5.1 Result to Dataframe

In [None]:
# Turn the [customer_id, prediction] pairs into a frame and name its two columns.
result = pd.DataFrame(result).set_axis(["customer_id", 'prediction'], axis=1)
result

## 5.2 Result Submission

In [None]:
# Write the predictions without the row index.
result.to_csv(path_or_buf="submission.csv", index=False)