In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
train_df = pd.read_pickle('../data/train.pkl')
test_df = pd.read_pickle('../data/test.pkl')
movies = pd.read_csv('../data/movie.csv')

### CB features

In [3]:
movies['year'] = movies.title.str.rstrip('" ').str[-5:-1]

years = []
# 좋지 않은 코딩
for i in movies.year: 
    try:
        years.append(int(i))
    except:
        years.append(None)

movies['years'] = years

movies['ages'] = (int(train_df.timestamp.max()[:4]) - movies['years']).fillna(-1).astype(int)

In [4]:
movie_genre = movies.set_index('movieId')
movie_genre = movie_genre.loc[movie_genre.genres != '(no genres listed)', 'genres'].apply(lambda row: row.split("|")).explode().reset_index()
movie_genre = movie_genre.assign(value=1).pivot(index='movieId', columns='genres', values='value').fillna(0).astype(np.int8)

In [5]:
CB_df = movies[['movieId', 'ages']].merge(movie_genre.reset_index(), on ='movieId', how='left')

In [6]:
for col in CB_df.columns.drop(['movieId', 'ages']):
    CB_df[col] = CB_df[col].fillna(0).astype(np.int8)

### Load CF features

In [7]:
user_factor_df = pd.read_pickle("../data/user_factor.pkl")
item_factor_df = pd.read_pickle("../data/item_factor.pkl")

In [8]:
user_factor_df.columns = [f"user_{col}" for col in user_factor_df.columns]
user_factor_df = user_factor_df.reset_index().rename(columns={'index':'userId'})

In [9]:
item_factor_df.columns = [f"item_{col}" for col in item_factor_df.columns]
item_factor_df = item_factor_df.reset_index().rename(columns={'index':'movieId'})

### Hybrid

- CB features: year, genre
- CF features: ALS로부터 뽑은 embedding vector

- $y_{i, j}$ = i번 유저가, j번 영화를 볼 확률(classfication)  
- $X_{i, j}$ = i번 유저의 20차원 embedding feature, j번 영화의 embedding feature, year, genre

**Negative sampling**
- y=1인 케이스에 대비하여, y=0인 케이스로 학습할 관측치를 뽑는 것
- y=1인 경우: 인기가 많은 작품은 자주 등장하고, 인기가 적은 작품은 조금 등장할 것
- y=0인 경우: 전체 아이템에 대해 같은 확률로 샘플링하게되면 -> 인기와 무관하게 등장빈도가 동일하겠죠
  - 전체 아이템의 인기도를 고려하여 샘플을 해야함 -> 셔플 후 위부터 사용

In [10]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18058339 entries, 37 to 19999980
Data columns (total 4 columns):
userId       int32
movieId      int32
rating       int16
timestamp    object
dtypes: int16(1), int32(2), object(1)
memory usage: 447.8+ MB


In [11]:
train_df.shape

(18058339, 4)

In [12]:
18058339*4

72233356

In [13]:
res = []
negative_df = train_df.drop(['timestamp', 'userId'], axis=1).sample(frac=1)

for userId, positive_samples in tqdm(train_df.drop('timestamp', axis=1).groupby('userId')):
    n = len(positive_samples)
    if len(negative_df) > 3*n:
        res.append(pd.concat([positive_samples.assign(y=1), negative_df[:n*3].assign(userId=userId, y=0)], sort=False).drop_duplicates('movieId'))
        negative_df = negative_df[n*3:]
    else:
        negative_df = train_df.drop(['timestamp', 'userId'], axis=1).sample(frac=1)

100%|██████████| 138493/138493 [10:21<00:00, 222.72it/s]


### Merge CF / CB features

In [14]:
df = pd.concat(res)

In [15]:
df_merged = df.merge(CB_df, on='movieId')
assert len(df) == len(df_merged)
df = df_merged

In [16]:
df_merged = df.merge(item_factor_df, on='movieId', how='left')
assert len(df) == len(df_merged)
df = df_merged

In [17]:
df_merged = df.merge(user_factor_df, on='userId', how='left')
assert len(df) == len(df_merged)
df = df_merged

In [26]:
df.userId = df.userId.astype(np.int32)
df.rating = df.rating.astype(np.int8)
df.y = df.y.astype(np.int8)
df.ages = df.ages.astype(np.int16)

In [28]:
df.to_pickle('../data/feature_df.pkl')

In [38]:
df.shape

(54053430, 64)

### Merge to test

In [34]:
candidate = pd.read_pickle('../data/als_candidate_150.pkl')

candidate = candidate.explode().reset_index()
candidate.columns = ['userId', 'movieId']

In [35]:
candidate_merged = candidate.merge(CB_df, on='movieId')
assert len(candidate) == len(candidate_merged)
candidate = candidate_merged

candidate_merged = candidate.merge(item_factor_df, on='movieId', how='left')
assert len(candidate) == len(candidate_merged)
candidate = candidate_merged

candidate_merged = candidate.merge(user_factor_df, on='userId', how='left')
assert len(candidate) == len(candidate_merged)
candidate = candidate_merged

In [37]:
candidate.to_pickle("../data/als_candidate_merged.pkl")