# 모델 기반 협업 필터링

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm import tqdm

데이터 살펴보기

In [2]:
df = pd.read_csv('events.csv')

In [3]:
df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [4]:
print('Num of unique visitors:', df['visitorid'].nunique())
print('Num of unique items:', df['itemid'].nunique())

Num of unique visitors: 1407580
Num of unique items: 235061


한 사람이 여러 방문자로 찍혔는지는 알 수 없다<br>
모든 visitorid는 독립적인 방문자로 가정한다

이벤트의 종류

In [5]:
df['event'].unique()

array(['view', 'addtocart', 'transaction'], dtype=object)

한번이라도 물건을 구입한 방문자의 수

In [6]:
df[df['transactionid'].notnull()]['visitorid'].nunique()

11719

구매 기록만 모은다

In [7]:
transaction_df = df[df['transactionid'].notnull()]
transaction_df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0
418,1433193915008,552148,transaction,81345,5444.0
814,1433176736375,102019,transaction,150318,13556.0
843,1433174518180,189384,transaction,310791,7244.0
...,...,...,...,...,...
2755294,1438377176570,1050575,transaction,31640,8354.0
2755349,1438379878779,861299,transaction,456602,3643.0
2755508,1438357730123,855941,transaction,235771,4385.0
2755603,1438355560300,548772,transaction,29167,13872.0


이 실습에서는 구매 여부만 예측한다<br>
따라서 중복 구매 기록은 지운다

In [8]:
deduplicated_df = transaction_df.drop_duplicates(subset=['visitorid', 'itemid']).copy()
deduplicated_df

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
130,1433222276276,599528,transaction,356475,4000.0
304,1433193500981,121688,transaction,15335,11117.0
418,1433193915008,552148,transaction,81345,5444.0
814,1433176736375,102019,transaction,150318,13556.0
843,1433174518180,189384,transaction,310791,7244.0
...,...,...,...,...,...
2755294,1438377176570,1050575,transaction,31640,8354.0
2755349,1438379878779,861299,transaction,456602,3643.0
2755508,1438357730123,855941,transaction,235771,4385.0
2755603,1438355560300,548772,transaction,29167,13872.0


구매를 일정 수준 이하로 한 방문자는 필터링한다

In [9]:
min_visitor_transactions = 1
filter_visitors = deduplicated_df['visitorid'].value_counts() > min_visitor_transactions
filter_visitors = filter_visitors[filter_visitors].index.tolist()

구매를 일정 수준 이하로 받은 상품은 필터링한다

In [10]:
min_item_transactions = 1
filter_items = deduplicated_df['itemid'].value_counts() > min_item_transactions
filter_items = filter_items[filter_items].index.tolist()

In [11]:
df_purchased = deduplicated_df[lambda x: (x['visitorid'].isin(filter_visitors)) & (x['itemid'].isin(filter_items))][['visitorid', 'itemid']]
df_purchased

Unnamed: 0,visitorid,itemid
814,102019,150318
1215,350566,284871
1234,404403,150100
2315,911093,277119
2316,911093,251130
...,...,...
2753661,1150086,360825
2755156,861299,22969
2755349,861299,456602
2755603,548772,29167


필터링 전 구매 기록 총 수

In [12]:
len(deduplicated_df)

21270

필터링 후 구매 기록 총 수

In [13]:
len(df_purchased)

6968

구매하지 않은 경우의 데이터를 생성한다 (네거티브 샘플)

In [14]:
not_purchased_data = {
    'visitorid': [],
    'itemid': []
}

random.seed(0)

while len(not_purchased_data['visitorid']) < len(df_purchased):
    random_visitor = random.choice(filter_visitors)
    random_item = random.choice(filter_items)
    random_purchased = df_purchased[lambda x: (x['visitorid'] == random_visitor) & (x['itemid'] == random_item)]
    
    if len(random_purchased) > 0:
        continue
    
    not_purchased_data['visitorid'].append(random_visitor)
    not_purchased_data['itemid'].append(random_item)

In [15]:
df_not_purchased = pd.DataFrame(not_purchased_data)
df_not_purchased

Unnamed: 0,visitorid,itemid
0,118450,396281
1,128727,178274
2,459290,282820
3,587126,142510
4,937463,121835
...,...,...
6963,888537,381265
6964,1062987,200180
6965,815905,193488
6966,468466,391728


구매한 경우 1, 구매하지 않은 경우 0인 purchased 컬럼을 만든다.

In [16]:
df_balanced = pd.concat([df_purchased.assign(purchased=1), df_not_purchased.assign(purchased=0)])
df_balanced

Unnamed: 0,visitorid,itemid,purchased
814,102019,150318,1
1215,350566,284871,1
1234,404403,150100,1
2315,911093,277119,1
2316,911093,251130,1
...,...,...,...
6963,888537,381265,0
6964,1062987,200180,0
6965,815905,193488,0
6966,468466,391728,0


학습 데이터와 테스트 데이터로 나눈다

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_train, df_test = train_test_split(df_balanced, random_state=0)

In [19]:
df_train.shape, df_test.shape

((10452, 3), (3484, 3))

구매 데이터를 피봇해서 (방문자 수 X 상품 수)의 행렬로 만든다.<br>
원소의 값은 구매했는지 여부가 된다.

In [20]:
df_p = pd.pivot_table(df_train, index='visitorid', columns='itemid', values='purchased')
df_p

itemid,25,496,546,829,869,1022,1152,1255,1261,1377,...,465616,465751,465833,465951,466008,466109,466114,466135,466319,466614
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
172,,,,,,,,,,,...,,,,,,,,,,
264,,,,,,,,,,,...,,,,,,,,,,
2019,,,,,,,,,,,...,,,,,,,,,,
3104,,,,,,,,,,,...,,,,,,,,,,
3258,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402220,,,,,,,,,,,...,,,,,,,,,,
1403769,,,,,,,,,,,...,,,,,,,,,,
1404265,,,,,,,,,,,...,,,,,,,,,,
1404991,,,,,,,,,,,...,,,,,,,,,,


In [21]:
df_p.sum().sum()

5248.0

2129 rows × 3688 columns에 5248개의 데이터가 있다

케라스를 불러온다

In [22]:
from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout
from keras.models import Model
from tensorflow.keras import optimizers
from tensorflow.keras import activations
from sklearn.metrics import mean_squared_error

visitorid와 itemid는 연속적이지 않다<br>
인덱스로 쓰기 위해 매핑을 만든다

In [23]:
visitor_id_mapping = {visitorid: i for i, visitorid in enumerate(df_balanced['visitorid'].unique())}
item_id_mapping = {itemid: i for i, itemid in enumerate(df_balanced['itemid'].unique())}

In [24]:
[(k, v) for k, v in visitor_id_mapping.items() if v <= 10]

[(102019, 0),
 (350566, 1),
 (404403, 2),
 (911093, 3),
 (273406, 4),
 (1233140, 5),
 (1161163, 6),
 (189384, 7),
 (286616, 8),
 (1235292, 9),
 (1236753, 10)]

학습 데이터와 테스트 데이터를 인덱스로 매핑한다

In [25]:
train_visitor_data = df_train['visitorid'].map(visitor_id_mapping)
train_item_data = df_train['itemid'].map(item_id_mapping)

test_visitor_data = df_test['visitorid'].map(visitor_id_mapping)
test_item_data = df_test['itemid'].map(item_id_mapping)

사이즈를 구한다

In [26]:
num_visitors = len(visitor_id_mapping)
num_items = len(item_id_mapping)

print('num_visitors', num_visitors)
print('num_items', num_items)

num_visitors 2165
num_items 3854


## 모델 만들기

입력 레이어를 만든다

In [27]:
visitor_id_input = Input(shape=[1], name='visitor')
item_id_input = Input(shape=[1], name='item')
visitor_id_input, item_id_input

(<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'visitor')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'item')>)

임베딩 레이어를 만든다

In [28]:
embedding_size = 10

In [29]:
visitor_embedding = Embedding(
    output_dim=embedding_size,
    input_dim=num_visitors,
    input_length=1,
    name='user_embedding'
)(visitor_id_input)

item_embedding = Embedding(
    output_dim=embedding_size,
    input_dim=num_items,
    input_length=1,
    name='item_embedding'
)(item_id_input)

visitor_embedding, item_embedding

(<KerasTensor: shape=(None, 1, 10) dtype=float32 (created by layer 'user_embedding')>,
 <KerasTensor: shape=(None, 1, 10) dtype=float32 (created by layer 'item_embedding')>)

임베딩 레이어의 차원을 바꾼다

In [30]:
user_vector = Reshape([embedding_size])(visitor_embedding)
item_vector = Reshape([embedding_size])(item_embedding)

user_vector, item_vector

(<KerasTensor: shape=(None, 10) dtype=float32 (created by layer 'reshape')>,
 <KerasTensor: shape=(None, 10) dtype=float32 (created by layer 'reshape_1')>)

차원을 조정한 임베딩 레이어 사이를 내적한다

In [31]:
y = Dot(1, normalize=False)([user_vector, item_vector])
y

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dot')>

모델을 구성한다

In [32]:
model = Model(inputs=[visitor_id_input, item_id_input], outputs=y)
opt = optimizers.Adam(learning_rate=0.01)
model.compile(loss='mse', optimizer=opt)

In [33]:
model

<tensorflow.python.keras.engine.functional.Functional at 0x1c9aee105b0>

모델을 학습시킨다

In [34]:
model.fit(
    [train_visitor_data, train_item_data],
    df_train['purchased'],
    batch_size=1024,
    epochs=10,
    validation_split=0.01,
    shuffle=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c9c1d629a0>

모델 테스트하고 RMSE 계산하기

In [36]:
y_pred = model.predict([test_visitor_data, test_item_data])
y_true = df_test['purchased'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('RMSE: {:4f}'.format(rmse))

RMSE: 0.698127


# 행렬 분해 기법 확장하기

모델 구조를 데이터셋의 특징에 맞추면 정확도를 더 올릴 수 있다

구하고자 하는 모델은 0 또는 1인데, 현재 모델은 -inf ~ inf의 숫자이고, loss는 mse이므로 학습이 잘 안된다

In [37]:
y

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'dot')>

In [38]:
binary_y = activations.sigmoid(y)

In [39]:
binary_y

<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'tf.math.sigmoid')>

모델을 구성한다

In [40]:
model = Model(inputs=[visitor_id_input, item_id_input], outputs=binary_y)
opt = optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=opt)

모델을 학습시킨다

In [41]:
model.fit(
    [train_visitor_data, train_item_data],
    df_train['purchased'],
    batch_size=1024,
    epochs=10,
    validation_split=0.01,
    shuffle=True
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1c9c37d2070>

모델 테스트하고 RMSE 계산하기

In [42]:
y_pred = model.predict([test_visitor_data, test_item_data])
y_true = df_test['purchased'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('RMSE: {:4f}'.format(rmse))

RMSE: 0.512550
