In [6]:
!pip install --upgrade implicit

[0m

In [7]:
from scipy.sparse import csr_matrix
import implicit
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# from utils import _ndcg_calculator, ndcg_calculator

In [8]:
# utils.py

def _ndcg_calculator(gt, rec, idcg):
    """Return the nDCG of a single ranked recommendation list.

    Args:
        gt: Container of relevant item ids; only membership (``in``) is used.
        rec: Recommended item ids, best first.
        idcg: Ideal DCG used as the normaliser.

    Returns:
        ``DCG(rec) / idcg`` with binary relevance.
    """
    # A hit at 0-based rank i is discounted by log(i + 2).  The natural log is
    # used both here and for the IDCG in `ndcg_calculator`, so the log base
    # cancels out in the ratio.
    dcg = sum(
        (1.0 / np.log(rank + 2) for rank, item in enumerate(rec) if item in gt),
        0.0,
    )
    return dcg / idcg


def ndcg_calculator(answer, submission, n):
    """Return the mean nDCG@n of `submission` measured against `answer`.

    Rows are paired by position, so both frames must list the same
    ``profile_id`` values in the same order.  The comparison is done on the
    underlying values, which means differing index labels (for example after
    a merge or a filter) no longer make the check itself blow up.

    Args:
        answer: DataFrame with ``profile_id`` and ``album_id`` (relevant items).
        submission: DataFrame with ``profile_id`` and ``album_id``
            (ranked recommendations).
        n: Cut-off used to build the ideal DCG (a full list of n hits).

    Returns:
        The average per-row nDCG, or ``0.0`` when there are no rows.

    Raises:
        AssertionError: if the frames differ in length or their
            ``profile_id`` columns differ at any position.
    """
    idcg = sum(1.0 / np.log(rank + 1) for rank in range(1, n + 1))

    answer_ids = answer["profile_id"].to_numpy()
    submitted_ids = submission["profile_id"].to_numpy()
    assert len(answer_ids) == len(submitted_ids), (
        "answer and submission must have the same number of rows"
    )
    assert (answer_ids == submitted_ids).all(), (
        "answer and submission must list the same profile_id at every position"
    )

    # Nothing to score: avoid dividing by zero below.
    if len(answer_ids) == 0:
        return 0.0

    ndcg_list = [
        _ndcg_calculator(relevant, recommended, idcg)
        for relevant, recommended in zip(answer["album_id"], submission["album_id"])
    ]
    return sum(ndcg_list) / len(ndcg_list)

# Cut-off rank handed to `ndcg_calculator` (nDCG@n) by `validate`.
n = 25

In [9]:
# Ground-truth answers that predictions are scored against (week / month files).
test_answer_week = pd.read_parquet("../input/lg-train-test/test_answer_week.parquet")
test_answer_month = pd.read_parquet("../input/lg-train-test/test_answer_month.parquet")

# Training interactions for the week and month files.
train_week = pd.read_parquet("../input/lg-train-test/train_week.parquet")
df_train_month = pd.read_parquet("../input/lg-train-test/train_month.parquet")

# Submission templates; `submit` merges predictions onto these by profile_id.
sample_sumbission_week = pd.read_parquet("../input/lg-train-test/sample_sumbission_week.parquet")
sample_sumbission_month = pd.read_parquet("../input/lg-train-test/sample_sumbission_month.parquet")

# Working copies, so the frames loaded above stay untouched by later cells.
train_df = train_week.copy()
train_month = df_train_month.copy()
mf_sumbission_week = sample_sumbission_week.copy()
mf_sumbission_month = sample_sumbission_month.copy()

In [10]:
# Build the id <-> matrix-index lookups from the untouched `train_week` frame
# rather than from `train_df`, which this cell overwrites.  That keeps the cell
# idempotent: re-running it can no longer map already-mapped ids and lose the
# original profile/album ids.
ALL_USERS = train_week['profile_id'].unique().tolist()
ALL_ITEMS = train_week['album_id'].unique().tolist()

# Matrix index -> original id (used to decode model output).
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# Original id -> matrix index (used to encode the training frame).
user_map = {u: uidx for uidx, u in user_ids.items()}
item_map = {i: iidx for iidx, i in item_ids.items()}

# Encoded copy of the training interactions; `train_week` itself is not modified.
train_df = train_week.assign(
    profile_id=train_week['profile_id'].map(user_map),
    album_id=train_week['album_id'].map(item_map),
)

In [11]:
# Assemble the user x item interaction matrix from the encoded training frame.
# Every interaction contributes 1.0; scipy sums entries that share the same
# (row, col) pair when it builds the CSR matrix.
row = train_df['profile_id'].to_numpy()
col = train_df['album_id'].to_numpy()
data = np.ones(len(train_df))
matrix_shape = (len(ALL_USERS), len(ALL_ITEMS))
csr_train = csr_matrix((data, (row, col)), shape=matrix_shape)
csr_train

<8057x20383 sparse matrix of type '<class 'numpy.float64'>'
	with 360348 stored elements in Compressed Sparse Row format>

In [12]:
def train(csr_train, factors=200, iterations=5, regularization=0.01, show_progress=True):
    """Fit a BPR model on `csr_train` and return it.

    The model is an `implicit.bpr.BayesianPersonalizedRanking` built with the
    given hyper-parameters and a fixed ``random_state`` of 42.

    Note: a second `train` with different defaults is defined further down in
    this notebook; whichever definition was executed last is the one in effect.

    Args:
        csr_train: Interaction matrix handed to ``model.fit``.
        factors: Forwarded to the model constructor.
        iterations: Forwarded to the model constructor.
        regularization: Forwarded to the model constructor.
        show_progress: Forwarded to ``model.fit``.

    Returns:
        The fitted model.
    """
    hyper_params = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(**hyper_params)
    bpr_model.fit(csr_train, show_progress=show_progress)
    return bpr_model

In [20]:
def submit(model, csr_train, sample_sumbission_week, top_k=25, batch_size=2000):
    """Recommend `top_k` albums for every training user and merge onto a template.

    Relies on the module-level `ALL_USERS`, `user_ids` and `item_ids` lookups,
    which must match the matrix the model was trained on.  When running this
    for the month data, take care to pass the matching month arguments.

    Args:
        model: Fitted model exposing
            ``recommend(userids, user_items, N=..., filter_already_liked_items=...)``
            and returning ``(ids, scores)``.
        csr_train: User x item matrix whose rows follow `ALL_USERS`.
        sample_sumbission_week: Template frame with a ``profile_id`` column.
            It is not modified.  An existing ``album_id`` column is dropped
            before merging so the predictions cannot end up as
            ``album_id_x`` / ``album_id_y``.
        top_k: Number of recommendations per user.
        batch_size: Number of users scored per ``recommend`` call.

    Returns:
        The template merged with the predictions on ``profile_id`` (inner join),
        with ``album_id`` holding each user's list of recommended album ids.
    """
    user_rows = np.arange(len(ALL_USERS))
    records = []
    for start in range(0, len(user_rows), batch_size):
        batch = user_rows[start:start + batch_size]
        # Already-seen items are deliberately kept in the candidate set.
        ids, _scores = model.recommend(
            batch, csr_train[batch], N=top_k, filter_already_liked_items=False
        )
        for pos, user_row in enumerate(batch):
            records.append({
                # Decode matrix indices back to the original ids.
                'profile_id': user_ids[user_row],
                'album_id': [item_ids[item_row] for item_row in ids[pos]],
            })

    pred_dfs = pd.DataFrame(records)
    # Non-mutating drop: safe to re-run and a no-op when the column is absent.
    template = sample_sumbission_week.drop(columns='album_id', errors='ignore')
    return template.merge(pred_dfs, on='profile_id')

In [21]:
def validate(csr_train, factors=200, iterations=20, regularization=0.01, show_progress=True,
             answer=None, sample_submission=None):
    """Train one configuration, score it, print and return the nDCG.

    The model is built through `train` so model construction (including the
    fixed random seed) lives in a single place.

    Args:
        csr_train: User x item training matrix.
        factors: Passed through to `train`.
        iterations: Passed through to `train`.
        regularization: Passed through to `train`.
        show_progress: Passed through to `train`.
        answer: Ground-truth frame to score against; defaults to the
            module-level `test_answer_week`.
        sample_submission: Template frame passed to `submit`; defaults to the
            module-level `sample_sumbission_week`.

    Returns:
        The score returned by `ndcg_calculator` for the module-level cut-off `n`.
    """
    if answer is None:
        answer = test_answer_week
    if sample_submission is None:
        sample_submission = sample_sumbission_week

    model = train(
        csr_train,
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        show_progress=show_progress,
    )
    df_preds = submit(model, csr_train, sample_submission)
    ndcg = ndcg_calculator(answer, df_preds, n)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> ndcg@25: {ndcg:6.5f}")
    return ndcg

# df_preds = submit(model, csr_train, sample_sumbission_week)
# mf_week_ndcg = ndcg_calculator(test_answer_week, df_preds, n)

In [22]:
%%time
best_ndcg25 = 0
for factors in [40, 50, 60, 100, 150, 200, 300, 400, 500, 1000]:
    for iterations in [3, 5, 7, 10, 12, 14, 15, 20]:
        for regularization in [0.01, 0.02, 0.05, 0.1, 0.2]:
            ndcg25 = validate(csr_train, factors, iterations, regularization, show_progress=False)
            if ndcg25 > best_ndcg25:
                best_ndcg25 = ndcg25
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best ndcg@25 found. Updating: {best_params}")

Factors:  40 - Iterations:  3 - Regularization: 0.010 ==> ndcg@25: 0.03518
Best ndcg@25 found. Updating: {'factors': 40, 'iterations': 3, 'regularization': 0.01}
Factors:  40 - Iterations:  3 - Regularization: 0.020 ==> ndcg@25: 0.03493
Factors:  40 - Iterations:  3 - Regularization: 0.050 ==> ndcg@25: 0.03397
Factors:  40 - Iterations:  3 - Regularization: 0.100 ==> ndcg@25: 0.02894
Factors:  40 - Iterations:  3 - Regularization: 0.200 ==> ndcg@25: 0.02728
Factors:  40 - Iterations:  5 - Regularization: 0.010 ==> ndcg@25: 0.03106
Factors:  40 - Iterations:  5 - Regularization: 0.020 ==> ndcg@25: 0.03067
Factors:  40 - Iterations:  5 - Regularization: 0.050 ==> ndcg@25: 0.02974
Factors:  40 - Iterations:  5 - Regularization: 0.100 ==> ndcg@25: 0.02836
Factors:  40 - Iterations:  5 - Regularization: 0.200 ==> ndcg@25: 0.02577
Factors:  40 - Iterations:  7 - Regularization: 0.010 ==> ndcg@25: 0.03198
Factors:  40 - Iterations:  7 - Regularization: 0.020 ==> ndcg@25: 0.03027
Factors:  40 

In [23]:
# Show the best hyper-parameter combination recorded by the grid search.
best_params

{'factors': 50, 'iterations': 20, 'regularization': 0.01}

In [None]:
# NOTE(review): this call uses `train`'s default hyper-parameters, not the
# `best_params` recorded by the grid search — confirm that is intended.
model = train(csr_train)

In [None]:
# Build week predictions from `model` and merge them onto the week template.
df_preds = submit(model, csr_train, sample_sumbission_week)

# experiment with full dataset

In [None]:
# week best Factors: 1000 - Iterations:  3 - Regularization: 0.100 ==> LB 0.2266 / ndcg 0.11888 
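''' Current best performance (based on the week validation split)
### factors=200, iterations=3, regularization=0.05  ==> LB 0.2275 / ndcg 0.11312
    Exactly a 2x result (LB vs. local nDCG). It actually scored better than the best params from our own validation.
    Having more factors does not necessarily give better results.
    Does reflecting the most recent logs as much as possible perform better? (Do we need an experiment with 200 factors on the month split?)
Open questions and experiments to try:
What if we train with the same params on only the last 7 weeks of data, rather than the full dataset, and submit that?
'''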

In [29]:
## MF whole
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import logging

# History frame for the final model; presumably the full interaction log
# (profile_id / album_id are read from it later) — confirm against the dataset.
history = pd.read_parquet("/kaggle/input/history-for-lgbm/df_train.parquet")
# Sample submission file; passed to `real_submit` later in the notebook.
sub=pd.read_csv('../input/lgground/sample_submission.csv')
import gc
gc.collect()

# Configure the root logger to emit INFO-level messages.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
# NOTE(review): `formatter` is created but never attached to a handler in the
# visible code, so this format string currently has no effect.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Work on a copy: `user_item_maps` overwrites the id columns of its argument in place.
history_mf = history.copy() 

def extract_data(base_path):
    """Read the history CSV, log a message and return the DataFrame.

    NOTE(review): `base_path` is not used and `pd.read_csv` is called with an
    empty path, so this looks like an unfinished placeholder — TODO supply the
    real file location before relying on this function.
    """
    df_history = pd.read_csv("")
    logger.info("History are extracted")
    return df_history


def preprocess_date(df_history):
    """Replace the raw ``log_time`` column with parsed datetime columns.

    ``log_time`` is integer-divided by 100 and parsed with the format
    ``%Y%m%d%H%M`` (presumably dropping a trailing seconds field — confirm).
    The input frame is not modified.

    Args:
        df_history: Frame with a ``log_time`` column.

    Returns:
        A new frame without ``log_time`` and with ``log_dt`` (parsed timestamp)
        and ``log_date`` (``log_dt`` floored to the day) appended.
    """
    # Date preprocessing
    parsed = pd.to_datetime(df_history["log_time"] // 100, format="%Y%m%d%H%M")
    processed = df_history.drop(columns="log_time").assign(
        log_dt=parsed,
        log_date=parsed.dt.floor("D"),
    )
    logger.info("Datetime preprocess completed")
    return processed

def user_item_maps(df):
    """Build id <-> index lookups from `df` and encode its id columns in place.

    Side effects:
        * Rebinds the module-level `ALL_USERS`, `ALL_ITEMS`, `user_ids`,
          `item_ids`, `user_map` and `item_map`.
        * Overwrites ``df['profile_id']`` and ``df['album_id']`` with their
          matrix indices.  Calling this twice on the same frame therefore
          builds the lookups from already-encoded ids.

    Args:
        df: Frame with ``profile_id`` and ``album_id`` columns.

    Returns:
        ``(ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map)``
        where ``*_ids`` map index -> original id and ``*_map`` map
        original id -> index.
    """
    global ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map

    # Unique ids in order of first appearance define the matrix indices.
    ALL_USERS, ALL_ITEMS = (
        df[column].unique().tolist() for column in ('profile_id', 'album_id')
    )

    user_ids = dict(enumerate(ALL_USERS))
    item_ids = dict(enumerate(ALL_ITEMS))

    user_map = {raw_id: idx for idx, raw_id in enumerate(ALL_USERS)}
    item_map = {raw_id: idx for idx, raw_id in enumerate(ALL_ITEMS)}

    for column, lookup in (('profile_id', user_map), ('album_id', item_map)):
        df[column] = df[column].map(lookup)

    return ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map

def make_csr_matrix(df):
    """Build the user x item CSR interaction matrix from an encoded frame.

    Args:
        df: Frame whose ``profile_id`` / ``album_id`` columns already hold
            matrix indices (as produced by `user_item_maps`).

    Returns:
        A ``csr_matrix`` of shape ``(len(ALL_USERS), len(ALL_ITEMS))`` in which
        every row of `df` contributes 1.0 to its (user, item) cell.
    """
    user_idx = df['profile_id'].to_numpy()
    item_idx = df['album_id'].to_numpy()
    interactions = np.ones(len(df))
    # The shape comes from the module-level id lists, not from `df` itself.
    dims = (len(ALL_USERS), len(ALL_ITEMS))
    return csr_matrix((interactions, (user_idx, item_idx)), shape=dims)


def train(csr_train, factors=50, iterations=20, regularization=0.01, show_progress=True):
    """Fit a BPR model on `csr_train` and return it.

    This re-defines the `train` helper from earlier in the notebook with
    different defaults (factors=50, iterations=20); once this cell has run,
    this definition replaces the earlier one.

    Args:
        csr_train: Interaction matrix handed to ``model.fit``.
        factors: Forwarded to the model constructor.
        iterations: Forwarded to the model constructor.
        regularization: Forwarded to the model constructor.
        show_progress: Forwarded to ``model.fit``.

    Returns:
        The fitted `implicit.bpr.BayesianPersonalizedRanking` model
        (``random_state`` is fixed to 42).
    """
    hyper_params = dict(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    bpr_model = implicit.bpr.BayesianPersonalizedRanking(**hyper_params)
    bpr_model.fit(csr_train, show_progress=show_progress)
    return bpr_model

# week best :: Factors: 1000 - Iterations:  3 - Regularization: 0.100 ==> ndcg@25: 0.11888

def real_submit(model, csr_train, sub):
    """Recommend 25 albums for every user in `ALL_USERS`.

    Relies on the module-level `ALL_USERS`, `user_ids` and `item_ids` lookups.

    Args:
        model: Fitted model exposing ``recommend`` and returning ``(ids, scores)``.
        csr_train: User x item matrix whose rows follow `ALL_USERS`.
        sub: Sample submission frame.  Currently unused: the predictions are
            returned as-is rather than merged onto it.

    Returns:
        DataFrame with one row per user: ``profile_id`` (original id) and
        ``predicted_list`` (list of original album ids, best first).
    """
    chunk_size = 2000
    all_rows = np.arange(len(ALL_USERS))
    records = []
    for offset in range(0, len(all_rows), chunk_size):
        rows = all_rows[offset:offset + chunk_size]
        # Already-seen items are deliberately kept in the candidate set.
        rec_ids, _ = model.recommend(rows, csr_train[rows], N=25, filter_already_liked_items=False)
        records.extend(
            {
                'profile_id': user_ids[user_row],
                'predicted_list': [item_ids[item_row] for item_row in rec_ids[pos]],
            }
            for pos, user_row in enumerate(rows)
        )
    return pd.DataFrame(records)

def week_day_feature(df_train: pd.DataFrame) -> pd.DataFrame:
    """Add ISO-calendar ``week`` and ``day`` columns derived from ``log_date``.

    ``week`` is the ISO week number minus the smallest ISO week number in the
    frame; ``day`` is the ISO weekday (1 = Monday ... 7 = Sunday).  The frame
    is modified in place and also returned.

    NOTE(review): ISO week numbers restart every year, so the ``week`` offset
    is only monotonic while all rows fall within one ISO year — confirm for
    the data in use.

    Args:
        df_train: Frame with a ``log_date`` column whose values provide
            ``isocalendar()``.

    Returns:
        The same `df_train` object with ``week`` and ``day`` set.
    """
    # One isocalendar() call per row, reused for both features.
    iso_parts = [moment.isocalendar() for moment in df_train['log_date']]
    iso_week = pd.Series([part[1] for part in iso_parts], index=df_train.index, dtype='int64')
    iso_day = pd.Series([part[2] for part in iso_parts], index=df_train.index, dtype='int64')

    # Vectorised offset instead of a per-row lambda.
    df_train['week'] = iso_week - iso_week.min()
    df_train['day'] = iso_day

    return df_train

# album_cnt & album_rank feature engineering
def album_cnt_rank_feature(df_train: pd.DataFrame) -> pd.DataFrame:
    """Attach each album's occurrence count and popularity rank to every row.

    The count table is built explicitly from the ``value_counts`` result
    instead of ``reset_index().rename(...)``, because the column names that
    ``reset_index`` produces for a ``value_counts`` result differ between
    pandas versions.

    Args:
        df_train: Frame with an ``album_id`` column.

    Returns:
        A new frame: `df_train` inner-merged on ``album_id`` with
        ``album_cnt`` (number of rows for that album) and ``rank``
        (1.0 = most frequent; ties broken by order in the count table).
    """
    counts = df_train['album_id'].value_counts()
    album_cnt = pd.DataFrame({
        'album_id': counts.index,
        'album_cnt': counts.to_numpy(),
    })
    album_cnt['rank'] = album_cnt['album_cnt'].rank(method='first', ascending=False)

    return df_train.merge(album_cnt, on='album_id')

def feature_engineering(df_train: pd.DataFrame) -> pd.DataFrame:
    """Run the feature steps in order: week/day first, then album count/rank.

    Args:
        df_train: Input frame handed to `week_day_feature`.

    Returns:
        The frame returned by `album_cnt_rank_feature`.
    """
    with_calendar = week_day_feature(df_train)
    return album_cnt_rank_feature(with_calendar)

In [30]:
# Start from a fresh copy of `history`: `user_item_maps` encodes the id columns
# of its argument in place, so re-running this cell on an already-encoded
# `history_mf` would build the lookups from matrix indices instead of real ids.
history_mf = history.copy()
user_item_maps(history_mf)
mf_csr = make_csr_matrix(history_mf)
mf_model = train(mf_csr)
mf_preds = real_submit(mf_model, mf_csr, sub)

  0%|          | 0/20 [00:00<?, ?it/s]

In [31]:
# Inspect the predictions: one row per user with `profile_id` and `predicted_list`.
mf_preds

Unnamed: 0,profile_id,predicted_list
0,3,"[16, 15, 38, 416, 19, 18, 17, 981, 175, 186, 2..."
1,5,"[16, 15, 38, 17, 224, 18, 19, 225, 416, 178, 2..."
2,7,"[124, 241, 347, 1880, 2054, 16, 0, 125, 65, 17..."
3,12,"[16, 241, 347, 124, 264, 2054, 175, 0, 191, 41..."
4,16,"[124, 241, 125, 347, 65, 1880, 329, 339, 127, ..."
...,...,...
8306,33022,"[16, 241, 347, 124, 2054, 0, 264, 175, 1880, 1..."
8307,33023,"[124, 125, 65, 190, 241, 339, 1880, 127, 347, ..."
8308,33026,"[16, 241, 124, 347, 2054, 264, 0, 175, 1880, 4..."
8309,33027,"[124, 241, 125, 65, 347, 1880, 329, 339, 127, ..."


In [32]:
# Store each prediction list as its string representation before writing the CSV.
mf_preds['predicted_list'] = mf_preds['predicted_list'].map(str)

In [None]:
# Write the submission file without the index column.
# NOTE(review): the file name says "als" although `train` in this notebook fits
# a BPR model — confirm the name is intentional.
mf_preds.to_csv('als-7.csv', index=False)