In [28]:
import os
import ast
import pickle
import time
from tqdm import tqdm
import json
from glob import glob
from collections import Counter, defaultdict
import sys
sys.path.insert(0, '../')

import pandas as pd
import numpy as np
from scipy import sparse

from load_data import load_user_time_read, load_raw
from tfidf import TFIDFGenerator, get_df
from config import Config
from utils import iterate_data_files, squeeze, save_as_pickle
from preprocessing import PostIdEncoder
from utils import squeeze

In [3]:
dev = list(map(lambda x: x[:-1], open('../raw/predict/dev.users', 'r').readlines()))

In [4]:
start = time.time()
user_time_read = load_user_time_read(root_dir='../preprocessed/user_time_read.json')
post_encoder = PostIdEncoder(root_dir='../encodings')
tfidf_generator = TFIDFGenerator('../tfidf')
print(f'Loading time: {time.time() - start}')

Loading time: 21.501490354537964


In [147]:
def filter_read_by_time(history, start, end):
    history_filtered = list(filter(lambda x: (int(start) <= int(x[0].split('_')[0])) and (int(end) >= int(x[0].split('_')[-1])), history))
    return history_filtered

start = Config.train_start
end = Config.train_end
n_splits = 5
batch_size = len(dev) // n_splits
post_meta_id = []

for i in range(n_splits):
    dev_user_batch = dev[i*batch_size:] if i == n_splits-1 else dev[i*batch_size:(i+1)*batch_size]
    posts_raw = [] # 유저들의 로그에서 등장한 모든 글을 담을 리스트
    user_preferences_raw = [] # 유저들의 feature 벡터를 담을 리스트

    for user_id in tqdm(dev_user_batch, desc=f'Getting user prefereces ({start}-{end})'):
        # 설정한 구간에 대한 해당 유저의 로그
        history = filter_read_by_time(user_time_read[user_id], start, end) 
        history = squeeze(list(map(lambda x: x[-1], history)))

        # 유저 로그로부터 TF-IDF 행렬 생성
        user_tfidf = tfidf_generator.generate(post_encoder.transform(history), drop_id=False) # 

        # TF-IDF 행렬로부터 유저 feature 벡터를 생성
        preference = sparse.csr_matrix(user_tfidf.iloc[:, 1:].values.sum(axis=0)[:, np.newaxis]) # post_meta_id 컬럼을 제외한 뒤 summation
        user_tfidf = user_tfidf.groupby('post_meta_id').first().reset_index() # faster than drop_duplicates()
        user_tfidf = user_tfidf.loc[~user_tfidf['post_meta_id'].isin(post_meta_id), :]
        if len(user_tfidf) > 0:
            post_meta_id.extend(user_tfidf['post_meta_id'].tolist())
        posts_raw.append(sparse.csr_matrix(user_tfidf.iloc[:, 1:])) # post_meta_id 컬럼을 제외하고 append -> post_meta_id 리스트를 개별적으로 생성하므로 불필요
        user_preferences_raw.append(preference)
    
    print('Postprocessing...')
    posts = sparse.vstack(posts_raw)
    user_preferences = sparse.hstack(user_preferences_raw)
    idf = np.log(tfidf_generator.DF.values.squeeze()) - np.log((posts != 0).sum(axis=0) + 1e-4)
# recommend_output = (posts.multiply(idf)).dot(user_preferences)


Getting user prefereces (2018100100-2019022200):   0%|          | 0/200 [00:00<?, ?it/s][A
Getting user prefereces (2018100100-2019022200):   1%|          | 2/200 [00:00<00:12, 15.67it/s][A
Getting user prefereces (2018100100-2019022200):   2%|▎         | 5/200 [00:00<00:12, 15.67it/s][A
Getting user prefereces (2018100100-2019022200):   4%|▍         | 8/200 [00:00<00:11, 17.02it/s][A
Getting user prefereces (2018100100-2019022200):   5%|▌         | 10/200 [00:00<00:12, 14.81it/s][A
Getting user prefereces (2018100100-2019022200):   6%|▌         | 12/200 [00:01<00:19,  9.77it/s][A
Getting user prefereces (2018100100-2019022200):   7%|▋         | 14/200 [00:02<00:42,  4.38it/s][A
Getting user prefereces (2018100100-2019022200):   8%|▊         | 16/200 [00:02<00:38,  4.76it/s][A
Getting user prefereces (2018100100-2019022200):  10%|▉         | 19/200 [00:02<00:34,  5.22it/s][A
Getting user prefereces (2018100100-2019022200):  11%|█         | 22/200 [00:02<00:25,  6.89it/s][A
G

In [None]:
# Without Split

def filter_read_by_time(history, start, end):
    history_filtered = list(filter(lambda x: (int(start) <= int(x[0].split('_')[0])) and (int(end) >= int(x[0].split('_')[-1])), history))
    return history_filtered

start = Config.train_start
end = Config.train_end
post_meta_id = []

posts_raw = [] # 유저들의 로그에서 등장한 모든 글을 담을 리스트
user_preferences_raw = [] # 유저들의 feature 벡터를 담을 리스트

for user_id in tqdm(dev, desc=f'Getting user prefereces ({start}-{end})'):
    # 설정한 구간에 대한 해당 유저의 로그
    history = filter_read_by_time(user_time_read[user_id], start, end) 
    history = squeeze(list(map(lambda x: x[-1], history)))

    # 유저 로그로부터 TF-IDF 행렬 생성
    user_tfidf = tfidf_generator.generate(post_encoder.transform(history), drop_id=False) # 

    # TF-IDF 행렬로부터 유저 feature 벡터를 생성
    preference = sparse.csr_matrix(user_tfidf.iloc[:, 1:].values.sum(axis=0)[:, np.newaxis]) # post_meta_id 컬럼을 제외한 뒤 summation
    user_tfidf = user_tfidf.groupby('post_meta_id').first().reset_index() # faster than drop_duplicates()
    user_tfidf = user_tfidf.loc[~user_tfidf['post_meta_id'].isin(post_meta_id), :]
    if len(user_tfidf) > 0:
        post_meta_id.extend(user_tfidf['post_meta_id'].tolist())
    posts_raw.append(sparse.csr_matrix(user_tfidf.iloc[:, 1:])) # post_meta_id 컬럼을 제외하고 append -> post_meta_id 리스트를 개별적으로 생성하므로 불필요
    user_preferences_raw.append(preference)

print('Postprocessing...')
posts = sparse.vstack(posts_raw)
user_preferences = sparse.hstack(user_preferences_raw)
idf = np.log(tfidf_generator.DF.values.squeeze()) - np.log((posts != 0).sum(axis=0) + 1e-4)
# recommend_output = (posts.multiply(idf)).dot(user_preferences)

In [10]:
def filter_read_by_time(history, start, end):
    history_filtered = list(filter(lambda x: (int(start) <= int(x[0].split('_')[0])) and (int(end) >= int(x[0].split('_')[-1])), history))
    return history_filtered

start = Config.train_start
end = Config.train_end
n_splits = 25
batch_size = len(dev) // n_splits
seen_post_id = []

for i in range(n_splits):
    dev_user_batch = dev[i*batch_size:] if i == n_splits-1 else dev[i*batch_size:(i+1)*batch_size]
    posts_raw = [] # 유저들의 로그에서 등장한 모든 글을 담을 리스트
    user_preferences_raw = [] # 유저들의 feature 벡터를 담을 리스트

    for user_id in tqdm(dev_user_batch, desc=f'Getting user prefereces ({start}-{end})'):
        # 설정한 구간에 대한 해당 유저의 로그
        history = filter_read_by_time(user_time_read[user_id], start, end) 
        history = squeeze(list(map(lambda x: x[-1], history)))

        # 유저 로그로부터 TF-IDF 행렬 생성
        user_tfidf = tfidf_generator.generate(post_encoder.transform(history), drop_id=False) # 

        # TF-IDF 행렬로부터 유저 feature 벡터를 생성
        preference = user_tfidf.drop('post_meta_id', axis=1).values.sum(axis=0)
        user_tfidf = user_tfidf.loc[~user_tfidf['post_meta_id'].isin(seen_post_id), :]
        seen_post_id.extend(user_tfidf['post_meta_id'].tolist())

        posts_raw.append(sparse.csr_matrix(user_tfidf))
        user_preferences_raw.append(preference[:, np.newaxis])
    
    print('Postprocessing...')
    # posts = pd.concat(posts_raw, axis=0, ignore_index=True).drop_duplicates(ignore_index=True) # 중복된 글은 제거
    posts = sparse.vstack(posts_raw)
    # post_meta_id = posts['post_meta_id'].tolist()
    post_meta_id = posts[:,0].data.tolist()
    posts = posts[:, 1:]
    # posts = posts.drop('post_meta_id', axis=1)
    user_preferences = np.hstack(user_preferences_raw)
    # idf = np.log(tfidf_generator.DF.values.squeeze()) - np.log((posts != 0).sum().values + 1e-4)
    idf = np.log(tfidf_generator.DF.values.squeeze()) - np.log((posts != 0).sum(axis=0) + 1e-4)
    break

Extracting user prefereces based on 2018100100-2019022200: 0it [00:00, ?it/s]Getting User Preferences of Batch #0...
Postprocessing...



KeyError: 'post_meta_id'

In [6]:
def filter_read_by_time(history, start, end):
    history_filtered = list(filter(lambda x: (int(start) <= int(x[0].split('_')[0])) and (int(end) >= int(x[0].split('_')[-1])), history))
    return history_filtered

def calculate_export_user_preferences(start: str=Config.train_start, end: str=Config.train_end, n_splits: int=50, save_path: str='./offline_tasks/user_preferences'):
    print('Loading tools...', end='\t')
    dev_raw = [open('./preprocessed/train', 'r').readlines()]
    dev_user_list = []
    for daily in dev_raw:
        users = [u.split()[0] for u in daily]
        dev_user_list.extend(users)

    user_time_read = load_user_time_read(root_dir='./preprocessed/user_time_read.json')
    encoder = PostIdEncoder(root_dir='./encodings')
    tfidf = TFIDFGenerator('./tfidf')
    print('loaded!')

    batch_size = len(dev_user_list) // n_splits

    for i in range(n_splits):
        print(f'Getting User Preferences of Batch #{i}...')
        if i == n_splits-1:
            dev_user_batch = dev_user_list[i*batch_size:]
        else:
            dev_user_batch = dev_user_list[i*batch_size:(i+1)*batch_size]

        posts = pd.DataFrame()
        user_preferences = np.zeros((7000, 1))

        for user_id in tqdm(dev_user_batch, desc=f'Extracting user prefereces based on {start}-{end}'):
            history = filter_read_by_time(user_time_read[user_id], start, end)
            preference = np.zeros((1, 7000))
            for h in history:
                partial_tfidf = tfidf.generate(encoder.transform(h[-1]), drop_id=False)
                preference += partial_tfidf.drop('post_meta_id', axis=1).values.sum(axis=0)
                posts = pd.concat([posts, partial_tfidf], axis=0, ignore_index=True)
            user_preferences = np.hstack([user_preferences, preference.reshape(7000, 1)])

        print('Postprocessing...')
        user_preferences = user_preferences[:, 1:]
        posts = posts.drop_duplicates(ignore_index=True)
        post_meta_id = posts['post_meta_id'].tolist()
        posts = posts.drop('post_meta_id', axis=1)
        idf = np.log(tfidf.DF.values.squeeze()) - np.log((posts != 0).sum().values + 1e-4)

        print('Saving...')
        batch_name = f'({start}-{end})batch{i+1:0>2d}'
        os.mkdir(os.path.join(save_path, batch_name))
        save_npz(os.path.join(save_path, batch_name, f'posts{i+1:0>2d}.npz'), csr_matrix(posts.values))
        save_as_pickle(post_meta_id, os.path.join(save_path, batch_name, f'post_meta_id{i:0>2d}.pkl'))
        np.save(os.path.join(save_path, batch_name, f'idf{i+1:0>2d}.npy'), idf)
        np.save(os.path.join(save_path, batch_name, f'user_preferences{i+1:0>2d}.npy'), user_preferences)

Extracting user prefereces based on 2018100100-2019022200: 100%|██████████| 100/100 [46:26<00:00, 27.87s/it]
Postprocessing...


In [None]:
os.

In [7]:
recommend_output = np.matmul((posts * idf).values, user_preferences)
enc = PostIdEncoder(root_dir='../encodings')

In [10]:
np.save('test_recommend.npy', recommend_output)

In [8]:
recommend_output_user = pd.DataFrame(dict(value=recommend_output[:, 1], id=post_meta_id)).sort_values(by='value', ascending=False)

In [52]:
recommend_output_user['id'] = recommend_output_user['id'].apply(lambda x: enc.inverse_transform(x))

In [53]:
recommend_output_user.head(100)

Unnamed: 0,value,id
39,322457.075150,@englishspeaking_66
56,313143.167282,@linecard_45
60,313143.167282,@linecard_43
57,313143.167282,@linecard_42
98,312260.888493,@linecard_77
...,...,...
103,188480.532344,@hygo92_85
82,184787.498120,@muncoach_25
206,184734.588007,@thepiano_114
131,184527.611553,@thepiano_106
