In [1]:
!pip install --upgrade implicit

Collecting implicit
  Downloading implicit-0.6.1-cp37-cp37m-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: implicit
  Attempting uninstall: implicit
    Found existing installation: implicit 0.4.4
    Uninstalling implicit-0.4.4:
      Successfully uninstalled implicit-0.4.4
Successfully installed implicit-0.6.1
[0m

In [20]:
import ast
import pickle
import random

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix
from tqdm import tqdm

# from utils import _ndcg_calculator, ndcg_calculator

In [3]:
# utils.py

def _ndcg_calculator(gt, rec, idcg):
    """Return the DCG of one recommendation list divided by ``idcg``.

    Args:
        gt: Container of relevant items; only ``item in gt`` is evaluated.
        rec: Recommended items, best first.
        idcg: Normalising constant supplied by the caller.

    Returns:
        Sum of ``1 / ln(rank + 1)`` over the 1-based ranks of ``rec`` whose item
        is in ``gt``, divided by ``idcg``.
    """
    gain = 0.0
    # Start counting at 2 so the first slot is discounted by ln(2).
    for log_arg, item in enumerate(rec, start=2):
        if item not in gt:
            continue
        gain += 1.0 / np.log(log_arg)
    return gain / idcg

def ndcg_calculator(answer, submission, n):
    """Average the per-row NDCG of ``submission`` against ``answer``.

    Args:
        answer: Frame with ``profile_id`` and ``album_id`` (relevant items per row).
        submission: Frame with ``profile_id`` and ``album_id`` (recommended items
            per row), in the same row order as ``answer``.
        n: Number of slots used to build the normalising ideal DCG.

    Returns:
        Sum of the per-row scores divided by ``len(answer)``.

    Raises:
        AssertionError: If any ``profile_id`` differs between the two frames.
    """
    ideal = sum((1.0 / np.log(rank + 1) for rank in range(1, n + 1)))

    # Rows are paired by position, so the profile ids must line up one-to-one.
    assert not (answer.profile_id != submission.profile_id).any()

    per_row_scores = [
        _ndcg_calculator(relevant, recommended, ideal)
        for relevant, recommended in zip(answer.album_id, submission.album_id)
    ]
    return sum(per_row_scores) / len(answer)

# Cutoff for NDCG@n; validate() passes this value to ndcg_calculator().
n = 25

In [39]:
# Ground-truth frames; validate() scores predictions against the weekly one.
test_answer_week = pd.read_parquet("../input/lg-train-test/test_answer_week.parquet")
test_answer_month = pd.read_parquet("../input/lg-train-test/test_answer_month.parquet")

# Training interactions (weekly / monthly split, going by the file names).
train_week = pd.read_parquet("../input/lg-train-test/train_week.parquet")
df_train_month = pd.read_parquet("../input/lg-train-test/train_month.parquet")

# Submission templates; submit() merges predictions onto the weekly one by profile_id.
sample_sumbission_week = pd.read_parquet("../input/lg-train-test/sample_sumbission_week.parquet")
sample_sumbission_month = pd.read_parquet("../input/lg-train-test/sample_sumbission_month.parquet")

# Work on copies: train_df has its id columns rewritten in place in a later cell.
train_df = train_week.copy()
train_month = df_train_month.copy()
mf_sumbission_week = sample_sumbission_week.copy()
mf_sumbission_month = sample_sumbission_month.copy()

In [40]:
# Distinct raw ids in order of first appearance; list position is the matrix index.
ALL_USERS = train_df['profile_id'].unique().tolist()
ALL_ITEMS = train_df['album_id'].unique().tolist()

# matrix index -> raw id
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# raw id -> matrix index
user_map = {raw_id: position for position, raw_id in enumerate(ALL_USERS)}
item_map = {raw_id: position for position, raw_id in enumerate(ALL_ITEMS)}

# Rewrites the id columns in place, so this cell is not safe to run twice on the same train_df.
train_df['profile_id'] = train_df['profile_id'].map(user_map)
train_df['album_id'] = train_df['album_id'].map(item_map)

In [41]:
# One (user index, item index) coordinate with value 1.0 per interaction row.
row = train_df['profile_id'].values
col = train_df['album_id'].values
data = np.ones(train_df.shape[0])
# Users x items matrix that the ALS model is fitted on.
# NOTE(review): repeated (user, item) pairs are presumably summed by csr_matrix — confirm against the scipy docs.
csr_train = csr_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
csr_train

<8057x20383 sparse matrix of type '<class 'numpy.float64'>'
	with 360348 stored elements in Compressed Sparse Row format>

In [42]:
def train(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True):
    """Fit an implicit ALS model on ``csr_train`` and return it.

    Args:
        csr_train: Interaction matrix handed to ``model.fit``.
        factors: Forwarded to ``AlternatingLeastSquares(factors=...)``.
        iterations: Forwarded to ``AlternatingLeastSquares(iterations=...)``.
        regularization: Forwarded to ``AlternatingLeastSquares(regularization=...)``.
        show_progress: Forwarded to ``fit`` as ``show_progress``.

    Returns:
        The fitted ``AlternatingLeastSquares`` instance.
    """
    hyper_params = {
        "factors": factors,
        "iterations": iterations,
        "regularization": regularization,
        "random_state": 42,  # fixed seed so repeated fits are comparable
    }
    als = implicit.als.AlternatingLeastSquares(**hyper_params)
    als.fit(csr_train, show_progress=show_progress)
    return als

In [50]:
def submit(model, csr_train, sample_sumbission_week, top_n=25, batch_size=2000):
    """Recommend ``top_n`` albums for every known user and attach them to a template.

    Reads the notebook globals ``ALL_USERS``, ``user_ids`` and ``item_ids`` to walk
    over all user indices and to translate matrix indices back to raw ids.

    Args:
        model: Fitted model exposing ``recommend(userids, user_items, N=...,
            filter_already_liked_items=...)``.
        csr_train: User x item matrix; its rows are sliced per batch.
        sample_sumbission_week: Template frame with a ``profile_id`` column
            (the weekly template by default). It is not modified.
        top_n: Number of recommendations requested per user.
        batch_size: Number of users passed to ``recommend`` per call.

    Returns:
        The template inner-merged with the predictions on ``profile_id``; the
        ``album_id`` column holds the list of recommended raw album ids.
    """
    records = []
    all_user_idx = np.arange(len(ALL_USERS))
    for start in range(0, len(all_user_idx), batch_size):
        batch = all_user_idx[start : start + batch_size]
        # Already-seen items are kept on purpose (filter_already_liked_items=False).
        ids, _scores = model.recommend(batch, csr_train[batch], N=top_n, filter_already_liked_items=False)
        for user_idx, item_idxs in zip(batch, ids):
            records.append({
                'profile_id': user_ids[user_idx],
                'album_id': [item_ids[item_idx] for item_idx in item_idxs],
            })

    # Explicit columns keep the merge key available even when there are no users.
    pred_dfs = pd.DataFrame(records, columns=['profile_id', 'album_id'])
    # Drop a placeholder `album_id` from the template (without mutating the caller's frame)
    # so the merge does not yield `album_id_x` / `album_id_y`.
    template = sample_sumbission_week.drop(columns='album_id', errors='ignore')
    return template.merge(pred_dfs, on='profile_id')

In [51]:
def validate(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True):
    """Fit ALS with the given hyper-parameters and report NDCG on the weekly split.

    Reads the notebook globals ``sample_sumbission_week``, ``test_answer_week`` and ``n``.

    Args:
        csr_train: User x item matrix used both for fitting and for recommending.
        factors: Forwarded to ``train``.
        iterations: Forwarded to ``train``.
        regularization: Forwarded to ``train``.
        show_progress: Forwarded to ``train``.

    Returns:
        The score returned by ``ndcg_calculator`` for this configuration.
    """
    # Reuse train() instead of repeating the model construction here.
    model = train(csr_train, factors=factors, iterations=iterations,
                  regularization=regularization, show_progress=show_progress)
    # Score the frame returned by submit(); sample_sumbission_week itself is not
    # updated by submit(), so it cannot be scored directly.
    df_preds = submit(model, csr_train, sample_sumbission_week)
    ndcg = ndcg_calculator(test_answer_week, df_preds, n)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> ndcg@25: {ndcg:6.5f}")
    return ndcg

# df_preds = submit(model, csr_train, sample_sumbission_week)
# mf_week_ndcg = ndcg_calculator(test_answer_week, df_preds, n)

In [None]:
%%time
# Exhaustive grid search over the ALS hyper-parameters, scored by validate() on the weekly split.
best_ndcg25 = 0
# Reset on every run so a stale value from an earlier execution cannot survive;
# stays None if no configuration scores above 0.
best_params = None
for factors in [30, 50, 100, 200, 500, 1000]:
    for iterations in [3, 5, 10, 15, 20]:
        for regularization in [0.01, 0.02, 0.05, 0.1]:
            ndcg25 = validate(csr_train, factors, iterations, regularization, show_progress=False)
            # Strict comparison: on ties the first configuration found is kept.
            if ndcg25 > best_ndcg25:
                best_ndcg25 = ndcg25
                best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
                print(f"Best ndcg@25 found. Updating: {best_params}")

Factors:  30 - Iterations:  3 - Regularization: 0.010 ==> ndcg@25: 0.08986
Best ndcg@25 found. Updating: {'factors': 30, 'iterations': 3, 'regularization': 0.01}
Factors:  30 - Iterations:  3 - Regularization: 0.020 ==> ndcg@25: 0.08987
Best ndcg@25 found. Updating: {'factors': 30, 'iterations': 3, 'regularization': 0.02}
Factors:  30 - Iterations:  3 - Regularization: 0.050 ==> ndcg@25: 0.08998
Best ndcg@25 found. Updating: {'factors': 30, 'iterations': 3, 'regularization': 0.05}
Factors:  30 - Iterations:  3 - Regularization: 0.100 ==> ndcg@25: 0.08974
Factors:  30 - Iterations:  5 - Regularization: 0.010 ==> ndcg@25: 0.08904
Factors:  30 - Iterations:  5 - Regularization: 0.020 ==> ndcg@25: 0.08898
Factors:  30 - Iterations:  5 - Regularization: 0.050 ==> ndcg@25: 0.08904
Factors:  30 - Iterations:  5 - Regularization: 0.100 ==> ndcg@25: 0.08890
Factors:  30 - Iterations: 10 - Regularization: 0.010 ==> ndcg@25: 0.08916
Factors:  30 - Iterations: 10 - Regularization: 0.020 ==> ndcg@2

In [None]:
best_params
###############################
# Current best performance (on the weekly validation split)
### factors=200, iterations=3, regularization=0.05  ==> LB 0.2275 / ndcg 0.11312

In [56]:
# Fit on the weekly matrix with train()'s defaults (factors=200, iterations=3, regularization=0.05).
model = train(csr_train)

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Top-25 recommendations per user, merged onto the weekly submission template.
df_preds = submit(model, csr_train, sample_sumbission_week)

# experiment with full dataset 

In [11]:
## MF whole: load the full history file and set up logging for the helper functions below
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import logging

# Raw interaction log and the submission template (note: one absolute and one relative path).
history = pd.read_csv("/kaggle/input/lgground/history_data.csv")
sub=pd.read_csv('../input/lgground/sample_submission.csv')
import gc
gc.collect()

# Root logger at INFO level; extract_data() and preprocess_date() log through it.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
# NOTE(review): this formatter is not attached to any handler in the visible code.
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Work on a copy: user_item_maps() rewrites the id columns in place, while `history`
# is read again later with its original ids.
history_mf = history.copy() 

def extract_data(base_path):
    """Load the interaction history CSV.

    Args:
        base_path: Either a directory containing ``history_data.csv`` or the
            path of the CSV file itself.

    Returns:
        The history as a DataFrame.
    """
    import os  # local import: only this helper needs it

    # The original read an empty path and ignored `base_path` entirely.
    if os.path.isdir(base_path):
        csv_path = os.path.join(base_path, "history_data.csv")
    else:
        csv_path = base_path
    df_history = pd.read_csv(csv_path)
    logger.info("History are extracted")
    return df_history

def preprocess_date(df_history):
    """Replace the numeric ``log_time`` column with parsed datetime columns.

    Args:
        df_history: Frame with a ``log_time`` column whose values, after integer
            division by 100, parse with the format ``%Y%m%d%H%M``.

    Returns:
        A new frame without ``log_time`` and with ``log_dt`` (minute resolution)
        and ``log_date`` (``log_dt`` floored to the day) appended.
    """
    # Date preprocessing: dropping the last two digits leaves YYYYMMDDHHMM.
    minute_stamp = df_history.log_time // 100
    with_dt = df_history.assign(log_dt=pd.to_datetime(minute_stamp, format="%Y%m%d%H%M"))
    with_date = with_dt.assign(log_date=with_dt.log_dt.dt.floor("D"))
    result = with_date.drop(columns="log_time")
    logger.info("Datetime preprocess completed")
    return result

def user_item_maps(df):
    """Build index <-> raw-id lookups from ``df`` and re-index its id columns in place.

    Rebinds the notebook globals ``ALL_USERS``, ``ALL_ITEMS``, ``user_ids``,
    ``item_ids``, ``user_map`` and ``item_map``, and overwrites ``df['profile_id']``
    and ``df['album_id']`` with their integer positions.

    Args:
        df: Frame with raw ``profile_id`` and ``album_id`` columns; mutated.

    Returns:
        Tuple ``(ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map)``.
    """
    global ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map

    # Distinct raw ids in order of first appearance; list position is the index.
    ALL_USERS = df['profile_id'].unique().tolist()
    ALL_ITEMS = df['album_id'].unique().tolist()

    # index -> raw id
    user_ids = dict(enumerate(ALL_USERS))
    item_ids = dict(enumerate(ALL_ITEMS))

    # raw id -> index
    user_map = {raw_id: position for position, raw_id in enumerate(ALL_USERS)}
    item_map = {raw_id: position for position, raw_id in enumerate(ALL_ITEMS)}

    for column, lookup in (('profile_id', user_map), ('album_id', item_map)):
        df[column] = df[column].map(lookup)

    return ALL_USERS, ALL_ITEMS, user_ids, item_ids, user_map, item_map

def make_csr_matrix(df):
    """Build the user x item interaction matrix from an index-encoded frame.

    Reads the notebook globals ``ALL_USERS`` and ``ALL_ITEMS`` for the shape.

    Args:
        df: Frame whose ``profile_id`` / ``album_id`` columns hold row / column
            indices.

    Returns:
        A ``csr_matrix`` of shape ``(len(ALL_USERS), len(ALL_ITEMS))`` built from
        a value of 1.0 per row of ``df``.
    """
    coordinates = (df['profile_id'].values, df['album_id'].values)
    weights = np.ones(df.shape[0])
    matrix_shape = (len(ALL_USERS), len(ALL_ITEMS))
    return csr_matrix((weights, coordinates), shape=matrix_shape)


def train(csr_train, factors=200, iterations=3, regularization=0.05, show_progress=True):
    """Fit and return an implicit ALS model (re-declares the earlier ``train`` cell).

    Args:
        csr_train: Interaction matrix passed to ``fit``.
        factors: ``factors`` argument of ``AlternatingLeastSquares``.
        iterations: ``iterations`` argument of ``AlternatingLeastSquares``.
        regularization: ``regularization`` argument of ``AlternatingLeastSquares``.
        show_progress: ``show_progress`` argument of ``fit``.

    Returns:
        The fitted model.
    """
    als_cls = implicit.als.AlternatingLeastSquares
    fitted = als_cls(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,  # fixed seed for repeatable fits
    )
    fitted.fit(csr_train, show_progress=show_progress)
    return fitted


def real_submit(model, csr_train, sub, top_n=25, batch_size=2000):
    """Recommend ``top_n`` albums for every known user.

    Reads the notebook globals ``ALL_USERS``, ``user_ids`` and ``item_ids``.

    Args:
        model: Fitted model exposing ``recommend(userids, user_items, N=...,
            filter_already_liked_items=...)``.
        csr_train: User x item matrix; its rows are sliced per batch.
        sub: Submission template. Accepted for interface compatibility but not
            used; the predictions are returned without merging.
        top_n: Number of recommendations requested per user.
        batch_size: Number of users passed to ``recommend`` per call.

    Returns:
        Frame with one row per user index, in index order, and the columns
        ``profile_id`` (raw id) and ``predicted_list`` (list of raw album ids).
    """
    records = []
    all_user_idx = np.arange(len(ALL_USERS))
    for start in range(0, len(all_user_idx), batch_size):
        batch = all_user_idx[start : start + batch_size]
        # Already-seen items are kept on purpose (filter_already_liked_items=False).
        ids, _scores = model.recommend(batch, csr_train[batch], N=top_n, filter_already_liked_items=False)
        for user_idx, item_idxs in zip(batch, ids):
            records.append({
                'profile_id': user_ids[user_idx],
                'predicted_list': [item_ids[item_idx] for item_idx in item_idxs],
            })
    # Explicit columns so an empty user list still yields a well-formed frame.
    return pd.DataFrame(records, columns=['profile_id', 'predicted_list'])

def week_day_feature(df_train: pd.DataFrame) -> pd.DataFrame:
    """Add ``week`` and ``day`` columns derived from ``log_date``.

    ``day`` is the ISO weekday (1 = Monday). ``week`` is the ISO week number
    shifted so that the smallest week in the frame becomes 0.

    Args:
        df_train: Frame with a ``log_date`` column whose values expose
            ``isocalendar()``. The frame is modified in place.

    Returns:
        The same frame object, with the two columns added.
    """
    # NOTE(review): ISO week numbers restart each year, so the offset is only
    # monotonic when the data stays within one ISO year — confirm the date range.
    df_train['week'] = df_train['log_date'].apply(lambda x: x.isocalendar()[1])
    df_train['day'] = df_train['log_date'].apply(lambda x: x.isocalendar()[2])
    df_train['week'] = df_train['week'] - df_train['week'].min()

    return df_train

# album_cnt & album_rank feature engineering
def album_cnt_rank_feature(df_train: pd.DataFrame) -> pd.DataFrame:
    """Attach each album's interaction count and popularity rank.

    Args:
        df_train: Frame with an ``album_id`` column.

    Returns:
        ``df_train`` inner-merged on ``album_id`` with two new columns:
        ``album_cnt`` (number of rows for that album) and ``rank`` (1 = most
        frequent; ties broken by order of appearance in the count table).
    """
    # Name the index and the value column explicitly: the default column names
    # produced by value_counts().reset_index() differ between pandas 1.x and 2.x.
    album_cnt = (
        df_train['album_id']
        .value_counts()
        .rename_axis('album_id')
        .reset_index(name='album_cnt')
    )
    album_cnt['rank'] = album_cnt['album_cnt'].rank(method='first', ascending=False)

    return df_train.merge(album_cnt, on='album_id')

def feature_engineering(df_train: pd.DataFrame) -> pd.DataFrame:
    """Run the feature steps in order: week/day columns, then album count and rank.

    Args:
        df_train: Frame accepted by ``week_day_feature`` and
            ``album_cnt_rank_feature``.

    Returns:
        The frame returned by ``album_cnt_rank_feature``.
    """
    df_train = week_day_feature(df_train)
    df_train = album_cnt_rank_feature(df_train)

    return df_train

In [12]:
# Re-index the ids of the full history (this rebinds the global id maps), then fit ALS
# with train()'s defaults and collect the top-25 list per user.
user_item_maps(history_mf)
mf_csr = make_csr_matrix(history_mf)
mf_model = train(mf_csr)
mf_preds = real_submit(mf_model, mf_csr, sub)

  "OpenBLAS detected. Its highly recommend to set the environment variable "


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Overwrites each list with its str() form, in place.
# NOTE(review): the negative-sampling cells further down iterate `predicted_list`; if this
# cell has run first they receive a string instead of a list.
mf_preds['predicted_list'] = mf_preds['predicted_list'].apply(lambda x: str(x))

In [None]:
# Write the ALS predictions (profile_id, predicted_list) without the row index.
mf_preds.to_csv('als.csv', index=False)

## Experiment: buy + history + search
Would adding the buy and search data on top of history actually improve performance? Open question.

In [None]:
# To be done after the LGBM Ranker work is finished

# Neg Sampling - full data

In [21]:
# Extract user factors
# k has to equal the number of factors mf_model was trained with (train() defaults to 200),
# otherwise assigning the column names below fails.
k=200
user_factors = pd.DataFrame(mf_model.user_factors)
user_factors.columns = [f'profile_id_{i}' for i in range(k)]
# Swap the positional row index for the raw profile ids before writing.
user_factors.index = user_factors.index.map(user_ids)
user_factors = user_factors.reset_index().rename(columns={'index':'profile_id'})
user_factors.to_parquet("profile_factor_als.parquet")


# Extract item factors
item_factors = pd.DataFrame(mf_model.item_factors)
item_factors.columns = [f'album_id_{i}' for i in range(k)]
# Swap the positional row index for the raw album ids before writing.
item_factors.index = item_factors.index.map(item_ids)
item_factors = item_factors.reset_index().rename(columns={'index':'album_id'})

item_factors.to_parquet("album_factor_als.parquet")

In [22]:
# Persist the full-data interaction matrix and the fitted ALS model.
sparse.save_npz("train_matrix.npz", mf_csr)
# Context manager so the file handle is flushed and closed deterministically.
with open("als.pkl", 'wb') as model_file:
    pickle.dump(mf_model, model_file)

In [23]:
# Per user: take the tail of the ALS recommendation list (at most 9 x number of
# interactions) and keep only the albums the user has not interacted with.
negative_sample_df_list = []
for target_user_uid in tqdm(history.profile_id.unique()):
    target_user_idx = user_map.get(target_user_uid)

    clicked_item_iid = history.loc[history.profile_id == target_user_uid, 'album_id']
    clicked_items = set(clicked_item_iid)  # built once per user, not once per candidate

    # Address the column by label; positional `[1]` on a labelled row is deprecated in pandas.
    target_reco = mf_preds.loc[target_user_idx, 'predicted_list']
    if isinstance(target_reco, str):
        # The CSV-export cell may already have replaced the lists with their str() form.
        target_reco = ast.literal_eval(target_reco)
    target_reco = list(target_reco)

    target_negative_samples = target_reco[-(len(clicked_item_iid)*9):]
    target_negative_samples = [i for i in target_negative_samples if i not in clicked_items]
    target_negative_sample_df = pd.DataFrame({"profile_id":target_user_uid, "album_id":target_negative_samples})

    negative_sample_df_list.append(target_negative_sample_df)

100%|██████████| 8311/8311 [00:24<00:00, 338.21it/s]


In [25]:
# Stack the per-user frames into one table with a fresh 0..n-1 index, then cast album ids to int.
negative_sample_df = pd.concat(negative_sample_df_list, ignore_index=True)
negative_sample_df['album_id'] = negative_sample_df['album_id'].apply(int)

In [35]:
# Persist the full-data negative samples.
negative_sample_df.to_pickle("negative_sample_als.pkl")

In [33]:
# Display the (profile_id, album_id) negative-sample table.
negative_sample_df

Unnamed: 0,profile_id,album_id
0,3,38
1,3,224
2,3,39
3,3,225
4,3,229
...,...,...
90810,33032,124
90811,33032,1402
90812,33032,1438
90813,33032,1920


# Neg Sampling - week data

In [None]:
# Extract user factors
# NOTE(review): this cell is identical to the full-data one — it reads the same `mf_model`
# and writes the same file names, so it overwrites those outputs rather than producing
# week-specific factors.
k=200
user_factors = pd.DataFrame(mf_model.user_factors)
user_factors.columns = [f'profile_id_{i}' for i in range(k)]
# Swap the positional row index for the raw profile ids before writing.
user_factors.index = user_factors.index.map(user_ids)
user_factors = user_factors.reset_index().rename(columns={'index':'profile_id'})
user_factors.to_parquet("profile_factor_als.parquet")


# Extract item factors
item_factors = pd.DataFrame(mf_model.item_factors)
item_factors.columns = [f'album_id_{i}' for i in range(k)]
# Swap the positional row index for the raw album ids before writing.
item_factors.index = item_factors.index.map(item_ids)
item_factors = item_factors.reset_index().rename(columns={'index':'album_id'})

item_factors.to_parquet("album_factor_als.parquet")

In [None]:
# Persist the interaction matrix and the fitted ALS model.
# NOTE(review): same objects and file names as the full-data save cell, so this overwrites it.
sparse.save_npz("train_matrix.npz", mf_csr)
# Context manager so the file handle is flushed and closed deterministically.
with open("als.pkl", 'wb') as model_file:
    pickle.dump(mf_model, model_file)

In [None]:
# Per user: take the tail of the ALS recommendation list (at most 9 x number of
# interactions) and keep only the albums the user has not interacted with.
# NOTE(review): this reads the same `history`, `user_map` and `mf_preds` as the full-data
# cell, so the result is not week-specific — confirm which inputs were intended.
week_negative_sample_df_list = []
for target_user_uid in tqdm(history.profile_id.unique()):
    target_user_idx = user_map.get(target_user_uid)

    clicked_item_iid = history.loc[history.profile_id == target_user_uid, 'album_id']
    clicked_items = set(clicked_item_iid)  # built once per user, not once per candidate

    # Address the column by label; positional `[1]` on a labelled row is deprecated in pandas.
    target_reco = mf_preds.loc[target_user_idx, 'predicted_list']
    if isinstance(target_reco, str):
        # The CSV-export cell may already have replaced the lists with their str() form.
        target_reco = ast.literal_eval(target_reco)
    target_reco = list(target_reco)

    target_negative_samples = target_reco[-(len(clicked_item_iid)*9):]
    target_negative_samples = [i for i in target_negative_samples if i not in clicked_items]
    target_negative_sample_df = pd.DataFrame({"profile_id":target_user_uid, "album_id":target_negative_samples})

    week_negative_sample_df_list.append(target_negative_sample_df)

In [None]:
# Stack the per-user frames into one table with a fresh 0..n-1 index, then cast album ids to int.
week_negative_sample_df = pd.concat(week_negative_sample_df_list, ignore_index=True)
week_negative_sample_df['album_id'] = week_negative_sample_df['album_id'].apply(int)

In [None]:
# Persist the week negative samples.
week_negative_sample_df.to_pickle("week_negative_sample_als.pkl")