In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k

In [None]:

# Locations of the competition CSV files.
base_path = '../input/h-and-m-personalized-fashion-recommendations/'
csv_train = base_path + 'transactions_train.csv'
csv_sub = base_path + 'sample_submission.csv'
csv_users = base_path + 'customers.csv'
csv_items = base_path + 'articles.csv'

# article_id is loaded as a string in both tables so the two columns compare equal
# (presumably also to keep leading zeros -- verify against the raw files).
df = pd.read_csv(csv_train, parse_dates=['t_dat'], dtype={'article_id': str})
dfu = pd.read_csv(csv_users)
dfi = pd.read_csv(csv_items, dtype={'article_id': str})
df_sub = pd.read_csv(csv_sub)

In [None]:
df

In [None]:
dfu

In [None]:
df['t_dat'] > '2020-08-21'

In [None]:
# Keep only the transactions made after 2020-08-21.
# .copy() turns the filtered result into an independent DataFrame, so columns
# added to `df` later are not written into a slice of the original frame
# (which would raise pandas' SettingWithCopyWarning).
df = df[df['t_dat'] > '2020-08-21'].copy()
df.shape

In [None]:
df

In [None]:
df['t_dat']

In [None]:
df['t_dat'].max()

In [None]:
# Collect the distinct users and items; each one later receives an
# auto-incrementing index that starts at 0.

# Customer ids, without duplicates
ALL_USERS = pd.unique(dfu['customer_id']).tolist()

# Article ids, without duplicates
ALL_ITEMS = pd.unique(dfi['article_id']).tolist()


In [None]:
# Assign indices: position in the list -> original id
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

In [None]:
# Reverse lookup: original id -> index
user_map = dict(zip(user_ids.values(), user_ids.keys()))
item_map = dict(zip(item_ids.values(), item_ids.keys()))

In [None]:
# Preview a handful of entries; the full dict holds one entry per customer and
# would flood the cell output.
dict(list(user_map.items())[:5])

In [None]:
df.head()

In [None]:
# Translate the raw customer / article ids into their integer indices.
# assign() returns a new DataFrame, so nothing is written into a frame that came
# out of an earlier boolean filter (no SettingWithCopyWarning).
df = df.assign(
    user_id=df['customer_id'].map(user_map),
    item_id=df['article_id'].map(item_map),
)

# An id that is absent from the lookup dicts maps to NaN, which is not a valid
# row / column index for the sparse matrices built from these columns.
assert df['user_id'].notna().all(), "some customer_id values have no entry in user_map"
assert df['item_id'].notna().all(), "some article_id values have no entry in item_map"

del dfu, dfi

In [None]:
df.shape

#### 추천 시스템 문제는 일반적으로 사용자 × 항목 행렬로 모델링되며, 각 값은 사용자가 해당 항목을 구매했는지(또는 좋아하는지)를 나타냅니다. 이 행렬은 대부분의 값이 0이므로, 추천 시스템에서는 scipy 희소 행렬을 사용하는 것이 일반적입니다.

In [None]:
## Sparse user x item interaction matrix (COO format)
# One entry of 1.0 per transaction row, placed at (user index, item index).
row = df['user_id'].to_numpy()
col = df['item_id'].to_numpy()
data = np.ones(len(df))
n_users, n_items = len(ALL_USERS), len(ALL_ITEMS)
coo_train = coo_matrix((data, (row, col)), shape=(n_users, n_items))
coo_train

# 데이터 작동 확인

In [None]:
# Quick smoke test: a tiny model (10 factors, 2 iterations) just to confirm that
# the matrix can be fitted. random_state is fixed, like in the other models of
# this notebook, so that re-running the cell gives the same result.
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2, random_state=42)
model.fit(coo_train)

# 검증

In [None]:
df.shape[0]

In [None]:
len(np.ones(df.shape[0]))

In [None]:
def to_user_item_coo(df, shape=None):
    """Turn a dataframe of transactions into a sparse COO users x items matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the integer index columns ``user_id`` (row) and
        ``item_id`` (column); one dataframe row is one transaction.
    shape : tuple of (int, int), optional
        ``(n_users, n_items)`` of the resulting matrix. When omitted, the
        notebook-level ``ALL_USERS`` / ``ALL_ITEMS`` lists define the shape.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix with rows = users and columns = items, holding a 1.0 for every
        transaction. Repeated (user, item) pairs are stored as separate
        entries; scipy sums them when the matrix is converted (e.g. ``tocsr``).
    """
    if shape is None:
        shape = (len(ALL_USERS), len(ALL_ITEMS))
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    return coo_matrix((data, (row, col)), shape=shape)

In [None]:
df['t_dat'].max()

In [None]:
pd.Timedelta?

In [None]:
pd.Timedelta?

In [None]:
# Work out the timestamp that lies `validation_days` days before the most recent purchase.
validation_days = 7
latest_purchase = df['t_dat'].max()
validation_cut = latest_purchase - pd.Timedelta(validation_days, unit='D')
print(validation_cut)

In [None]:
df[df['t_dat'] < validation_cut]

In [None]:
df[df['t_dat'] >= validation_cut]

In [None]:
# Split the dataset into a training set and a validation set
def split_data(df, validation_days=7):
    """Split transactions by date into (train, validation) dataframes.

    The validation set is the last ``validation_days`` days of data: every row
    whose ``t_dat`` is strictly later than ``max(t_dat) - validation_days``.
    The training set is everything up to and including that cut.

    The comparison is strict on purpose: with one timestamp per calendar day,
    ``t_dat >= cut`` would select ``validation_days + 1`` distinct days.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with a datetime column ``t_dat``.
    validation_days : int, default 7
        Length of the validation window in days.

    Returns
    -------
    (df_train, df_val) : tuple of pandas.DataFrame
    """
    validation_cut = df['t_dat'].max() - pd.Timedelta(days=validation_days)

    df_train = df[df['t_dat'] <= validation_cut]
    df_val = df[df['t_dat'] > validation_cut]
    return df_train, df_val


In [None]:
# Build the training / validation matrices with the helper functions
def get_val_matrices(df, validation_days=7):
    """Split ``df`` by date and return the sparse matrices used for validation.

    Returns a dict with three entries:
      - 'coo_train': training interactions as a COO matrix
      - 'csr_train': the same training interactions converted to CSR
      - 'csr_val':   validation interactions converted to CSR
    """
    train_part, val_part = split_data(df, validation_days=validation_days)

    coo_train = to_user_item_coo(train_part)
    return {
        'coo_train': coo_train,
        'csr_train': coo_train.tocsr(),
        'csr_val': to_user_item_coo(val_part).tocsr(),
    }

In [None]:
def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """Train an ALS model and score it with MAP@12 on the validation matrix.

    Parameters
    ----------
    matrices : dict
        Must provide 'coo_train', 'csr_train' and 'csr_val'.
    factors : int
        Embedding dimension of the ALS model.
    iterations : int
        Number of ALS iterations.
    regularization : float
        Regularization strength passed to the ALS model.
    show_progress : bool
        Forwarded to ``fit`` and to the MAP computation; also controls whether
        the matrix summaries are printed.

    Returns
    -------
    The MAP@12 value returned by ``mean_average_precision_at_k``.
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 random_state=42)
    model.fit(coo_train, show_progress=show_progress)

    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    if show_progress:
        # Print the compact repr rather than str(): str() of a sparse matrix lists
        # its stored entries, which buries the MAP line during a parameter sweep.
        print(f"csr_train= {csr_train!r}")
        print(f"csr_val= {csr_val!r}")
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12

In [None]:
matrices = get_val_matrices(df)

In [None]:
df

In [None]:
factors=200
iterations=20
regularization=0.01
show_progress=True

In [None]:
matrices

In [None]:
# Exhaustive search over the ALS hyper-parameters, keeping the best MAP@12.
# Starting from -inf guarantees that the first evaluated combination is recorded,
# so `best_params` is always bound after the loop (even if every score is 0).
# Resetting it to None first avoids silently reusing a value from an earlier run.
best_map12 = float('-inf')
best_params = None
for factors in [40, 50, 60, 100, 200, 500, 1000]:
    for iterations in [3, 12, 14, 15, 20]:
        for regularization in [0.01]:
            map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
            if map12 > best_map12:
                best_map12 = map12
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best MAP@12 found. Updating: {best_params}")

In [None]:
del matrices

In [None]:
# Rebuild the interaction matrix from the whole of `df` (no train / validation split).
coo_train = to_user_item_coo(df)
# CSR version of the same matrix.
csr_train = coo_train.tocsr()

In [None]:
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """Fit an ALS model on ``coo_train`` and return the fitted model.

    ``factors``, ``iterations`` and ``regularization`` are passed straight to
    ``AlternatingLeastSquares``; the random state is fixed at 42.
    """
    als = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    als.fit(coo_train, show_progress=show_progress)
    return als

In [None]:
model = train(coo_train, **best_params)