# Neural Collaborative Filtering

    - recommendation system에서 쓰이던 Matrix Factorization 의 한계점( 기존의 MF가 linear하고 fixed 해서,
    user-item의 complexity한 관계를 표현하지 못함)을 보완해서 MLP를 앙상블해서 만듦
    - MF에 non-linear activation function을 더해 표현력을 높힌 Generalized Matrix Factorization(GMF)와
    user와 item을 받아 유저가 다른 item을 선호할지에 대해 예측하는 Multi-Layter-Perceptron (MLP) 사용

## (1) Data Load

    - 음악 추천 데이터 (Music Recommendation Datasets for Research) 360K users 사용
    
http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
os.listdir('./data/lastfm-dataset-360K/lastfm-dataset-360K/')

['mbox_sha1sum.py',
 'README.txt',
 'usersha1-artmbid-artname-plays.tsv',
 'usersha1-profile.tsv']

In [3]:
base_path = './data/lastfm-dataset-360K/lastfm-dataset-360K/'

In [4]:
d1 = pd.read_csv(base_path+'usersha1-artmbid-artname-plays.tsv', delimiter='\t', header=None)
d1

Unnamed: 0,0,1,2,3
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
...,...,...,...,...
17535650,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,turbostaat,12
17535651,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,cuba missouri,11
17535652,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,little man tate,11
17535653,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,sigur rós,10


In [5]:
d2 = pd.read_csv(base_path+'usersha1-profile.tsv', delimiter='\t', header=None)
d2

Unnamed: 0,0,1,2,3,4
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007"
1,00001411dc427966b17297bf4d69e7e193135d89,f,,Canada,"Dec 4, 2007"
2,00004d2ac9316e22dc007ab2243d6fcb239e707d,,,Germany,"Sep 1, 2006"
3,000063d3fe1cf2ba248b9e3c3f0334845a27a6bf,m,19.0,Mexico,"Apr 28, 2008"
4,00007a47085b9aab8af55f52ec8846ac479ac4fe,m,28.0,United States,"Jan 27, 2006"
...,...,...,...,...,...
359342,fffe7823f67b433b45f22056467db921c1d3d7d0,m,25.0,Germany,"Jun 24, 2006"
359343,fffe8637bd8234309e871409c7ebef99a720afc1,m,25.0,Brazil,"Sep 9, 2007"
359344,fffe8c7f952d9b960a56ed4dcb40a415d924b224,m,20.0,United States,"Aug 8, 2007"
359345,ffff9af9ae04d263dae91cb838b1f3a6725f5ffb,m,20.0,Russian Federation,"Dec 3, 2005"


In [6]:
df = d1.drop(d1.columns[2] , axis=1)
df.columns = ['user','item','plays']
df = df.dropna()
df = df.loc[df['plays']!=0]
df

Unnamed: 0,user,item,plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,706
...,...,...,...
17535650,"sep 20, 2008",7ffd711a-b34d-4739-8aab-25e045c246da,12
17535651,"sep 20, 2008",9201190d-409f-426b-9339-9bd7492443e2,11
17535652,"sep 20, 2008",e7cf7ff9-ed2f-4315-aca8-bcbd3b2bfa71,11
17535653,"sep 20, 2008",f6f2326f-6b25-4170-b89d-e235b25508e8,10


In [7]:
df.shape

(17309518, 3)

In [8]:
print(f" user -> {len(pd.unique(df.user))}")
print(f" item - > {len(pd.unique(df.item))}")

 user -> 358858
 item - > 160112


## (2) user 10,000 random sampling

In [9]:
test = df.copy()

unique_user_lst = list(np.unique(df['user']))
sample_user_idx = np.random.choice(len(unique_user_lst), 10000, replace=False) 
sample_user_lst = [unique_user_lst[u_idx] for u_idx in sample_user_idx]

test = df[df['user'].isin(sample_user_lst)]
test = test.reset_index(drop=True)

test

Unnamed: 0,user,item,plays
0,0002fa6e25794597126fa44529105cedb31a75aa,b755836a-0cb6-4577-a678-55a5a6402d70,8
1,0002fa6e25794597126fa44529105cedb31a75aa,d78def22-ed9c-44c0-89c7-c198a4c15e05,6
2,0002fa6e25794597126fa44529105cedb31a75aa,7b2f87f6-db90-464e-a27a-deb4f7219e90,6
3,0002fa6e25794597126fa44529105cedb31a75aa,d8354b38-e942-4c89-ba93-29323432abc3,5
4,0002fa6e25794597126fa44529105cedb31a75aa,ba147ea0-60c4-4a40-9da6-0d5c2b2e7b03,5
...,...,...,...
483080,fffa4c6989304128f53a1643c0515277435f81dd,83d91898-7763-47d7-b03b-b92132375c47,48
483081,fffa4c6989304128f53a1643c0515277435f81dd,f9114439-1662-4415-b761-05a4170c9579,46
483082,fffa4c6989304128f53a1643c0515277435f81dd,6faa7ca7-0d99-4a5e-bfa6-1fd5037520c6,46
483083,fffa4c6989304128f53a1643c0515277435f81dd,0d812ef2-8697-4331-ac60-6dfe7c8fa7aa,45


In [10]:
test.groupby('user').count()

Unnamed: 0_level_0,item,plays
user,Unnamed: 1_level_1,Unnamed: 2_level_1
0002fa6e25794597126fa44529105cedb31a75aa,47,47
0003906ab668111f2cd332962cb09f8e3b795c6c,49,49
001081f0dfe6295f20f1746a448076d71be091e2,48,48
00141206c6d82c1f53fe895b99064318be656ee0,60,60
0017830996628c477a0b86a71e08072592eaa0e0,44,44
...,...,...
ffe17c82b915f6a2b6ed261414102d6a0e42bad6,49,49
ffe3c38f9ac58d35521256159e5d63577a192d6f,44,44
ffed2024f670da276bc519c54e13cc78af927c29,47,47
fff9dc65e7f2763a7e8bce8d99cc1491c2ae4c6f,45,45


In [11]:
test['count'] = test.groupby('user')['user'].transform('count')
test = test[test['count']>1]
test

Unnamed: 0,user,item,plays,count
0,0002fa6e25794597126fa44529105cedb31a75aa,b755836a-0cb6-4577-a678-55a5a6402d70,8,47
1,0002fa6e25794597126fa44529105cedb31a75aa,d78def22-ed9c-44c0-89c7-c198a4c15e05,6,47
2,0002fa6e25794597126fa44529105cedb31a75aa,7b2f87f6-db90-464e-a27a-deb4f7219e90,6,47
3,0002fa6e25794597126fa44529105cedb31a75aa,d8354b38-e942-4c89-ba93-29323432abc3,5,47
4,0002fa6e25794597126fa44529105cedb31a75aa,ba147ea0-60c4-4a40-9da6-0d5c2b2e7b03,5,47
...,...,...,...,...
483080,fffa4c6989304128f53a1643c0515277435f81dd,83d91898-7763-47d7-b03b-b92132375c47,48,53
483081,fffa4c6989304128f53a1643c0515277435f81dd,f9114439-1662-4415-b761-05a4170c9579,46,53
483082,fffa4c6989304128f53a1643c0515277435f81dd,6faa7ca7-0d99-4a5e-bfa6-1fd5037520c6,46,53
483083,fffa4c6989304128f53a1643c0515277435f81dd,0d812ef2-8697-4331-ac60-6dfe7c8fa7aa,45,53


In [12]:
test['user_id'] = test['user'].astype('category').cat.codes
test['item_id'] = test['item'].astype('category').cat.codes

In [13]:
test

Unnamed: 0,user,item,plays,count,user_id,item_id
0,0002fa6e25794597126fa44529105cedb31a75aa,b755836a-0cb6-4577-a678-55a5a6402d70,8,47,0,35136
1,0002fa6e25794597126fa44529105cedb31a75aa,d78def22-ed9c-44c0-89c7-c198a4c15e05,6,47,0,41362
2,0002fa6e25794597126fa44529105cedb31a75aa,7b2f87f6-db90-464e-a27a-deb4f7219e90,6,47,0,23530
3,0002fa6e25794597126fa44529105cedb31a75aa,d8354b38-e942-4c89-ba93-29323432abc3,5,47,0,41487
4,0002fa6e25794597126fa44529105cedb31a75aa,ba147ea0-60c4-4a40-9da6-0d5c2b2e7b03,5,47,0,35666
...,...,...,...,...,...,...
483080,fffa4c6989304128f53a1643c0515277435f81dd,83d91898-7763-47d7-b03b-b92132375c47,48,53,9999,25254
483081,fffa4c6989304128f53a1643c0515277435f81dd,f9114439-1662-4415-b761-05a4170c9579,46,53,9999,47694
483082,fffa4c6989304128f53a1643c0515277435f81dd,6faa7ca7-0d99-4a5e-bfa6-1fd5037520c6,46,53,9999,21357
483083,fffa4c6989304128f53a1643c0515277435f81dd,0d812ef2-8697-4331-ac60-6dfe7c8fa7aa,45,53,9999,2594


## (3) train/test data & negative_data 생성

In [42]:
def train_test_split(df):
    """
    train, test data로 나눔"""
    
    df_train = df.copy(deep=True)
    df_test = df.copy(deep=True)
    
    # df_train
    mask = df.groupby(['user_id'])['user_id'].transform(mask_first).astype(bool)
    df_train = df.loc[mask]
    
    #df_test
    df_test = df_test.groupby(['user_id']).first()
    df_test['user_id'] = df_test.index
    df_test = df_test[['user_id','item_id','plays']]
    df_test = df_test.reset_index(drop=True)
    
    return df_train, df_test
    

def mask_first(x):
    result = np.ones_like(x)
    result[0] = 0
    return result

In [43]:
def get_negative(uids, iids, items, df_test):
    """
    negative item 생성
    """
    negativeList = []
    test_u = df_test['user_id'].values.tolist()
    test_i = df_test['item_id'].values.tolist()
    
    test_ratings = list(zip(test_u, test_i))
    zipped = set(zip(uids, iids))
    
    for (u,i) in test_ratings:
        negatives =[]
        negatives.append((u,i))
        for t in range(100):
            j = np.random.randint(len(items)) # negative item 1개 샘플링
            while (u,j) in zipped: # j가 train에 있으면 다시 뽑고 없으면 선택
                j = np.random.randint(len(items))
            negatives.append(j)
        negativeList.append(negatives)  # [(0,pos), neg, neg, ... ]
        
    df_neg = pd.DataFrame(negativeList)
    
    return df_neg
    

In [44]:
def prepare_dataset(df):
    
    """
    data load
    
    uids : train user
    iid : train item
    users : total user
    items : total item
    df_train : train data
    df_test : test data
    
    """

    unique_user_lst = list(np.unique(df['user']))
    sample_user_idx = np.random.choice(len(unique_user_lst), 10000, replace=False) 
    sample_user_lst = [unique_user_lst[u_idx] for u_idx in sample_user_idx]

    df = df[df['user'].isin(sample_user_lst)]
    df = df.reset_index(drop=True)
    
    # 1곡이상 들은 user만 포함
    df_cnt = df.groupby(['user']).count()
    df['count'] = df.groupby('user')['user'].transform('count')
    df = df[df['count']>1]
    
    # user,item ID 부여
    df['user_id'] = df['user'].astype('category').cat.codes
    df['item_id'] = df['item'].astype('category').cat.codes
    
    # lookup table
    item_lookup = df[['item_id','item']].drop_duplicates()
    item_lookup['item_id'] = item_lookup.item_id.astype(str)
    
    # train, test data
    df = df[['user_id' ,'item_id', 'plays']]
    df_train, df_test = train_test_split(df)
    
    # all user, item 
    users = list(np.sort(df['user_id'].unique()))
    items = list(np.sort(df['item_id'].unique()))

    # train user/item
    rows = df_train['user_id'].astype(int)
    cols = df_train['item_id'].astype(int)
    values = list(df_train['plays'])
    
    uids = np.array(rows.tolist())
    iids = np.array(cols.tolist())

    # 각 user마다 negative item 생성
    df_neg = get_negative(uids, iids, items, df_test)
    
    return uids, iids, df_train, df_test, df_neg, users, items, item_lookup
    
    

In [45]:
def get_train_instance(uids, iids, num_neg, num_items):
    """
    model에 사용할 train 데이터 생성 함수
    """
    
    user_input, item_input, labels = [], [], []
    zipped = set(zip(uids, iids)) 
    
    for (u,i) in zip(uids, iids):
        
        #postive item
        user_input.append(u)
        item_input.append(i)
        labels.append(1)

        #negatvie item
        
        for t in range(num_neg):
            j = np.random.randint(num_items)
            while (u,j) in zipped:                 # u가 j를 이미 선택했으면 resampling
                j = np.random.randint(num_items)
                
            user_input.append(u)
            item_input.append(j)
            labels.append(0)
            
    return user_input, item_input, labels

In [19]:
uids, iids, df_train, df_test, df_neg, users, items, item_lookup = prepare_dataset(df)

### `train data`

In [20]:
display(df_train)

Unnamed: 0,user_id,item_id,plays
1,0,9555,125
2,0,5083,115
3,0,45128,107
4,0,46060,85
5,0,26604,77
...,...,...,...
482173,9999,4741,151
482174,9999,8370,148
482175,9999,19709,128
482176,9999,38262,126


    - df_train의 각 row(user_id, item_id_ 당 negatvie item을 num_neg 개씩 랜덤으로 선택

In [21]:
user_input, item_input, labels = get_train_instance(uids, iids, num_neg=4, num_items=len(items))

In [22]:
print(uids[0], iids[0])
print(uids[1], iids[1])

0 9555
0 5083


In [23]:
for i, (user_id, item_id, label) in enumerate(zip(user_input[0:10], item_input[0:10], labels[0:10])):
    if i==0 or i==5:
        print(f"user_id, postive_item_id, label -> {(user_id, item_id, label)}")
    else:
        print(f"user_id, negative_item_id, label -> {(user_id, item_id, label)}")

user_id, postive_item_id, label -> (0, 9555, 1)
user_id, negative_item_id, label -> (0, 20568, 0)
user_id, negative_item_id, label -> (0, 18068, 0)
user_id, negative_item_id, label -> (0, 20529, 0)
user_id, negative_item_id, label -> (0, 10725, 0)
user_id, postive_item_id, label -> (0, 5083, 1)
user_id, negative_item_id, label -> (0, 26433, 0)
user_id, negative_item_id, label -> (0, 43965, 0)
user_id, negative_item_id, label -> (0, 15586, 0)
user_id, negative_item_id, label -> (0, 40697, 0)


### `test data` ###

In [24]:
display(df_test)

Unnamed: 0,user_id,item_id,plays
0,0,18566,134
1,1,17218,315
2,2,20064,1727
3,3,31201,517
4,4,1507,1745
...,...,...,...
9995,9995,3930,72
9996,9996,37794,2016
9997,9997,38503,323
9998,9998,10946,1836


### `df_neg` 데이터
   
       - 각 user별로 negative item(user가 플레이 하지 않은 item(artist)) 100개를 랜덤으로 선택
       - column 0 : df_test 데이터의 (user_id, item_id)
       - column 1~100 : negatvie item
       
       
    => df_neg 데이터는 모델 평가시 Top-k metric 계산시 사용
       

In [25]:
display(df_neg)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,"(0, 18566)",19862,31522,39754,32298,8049,6175,15896,12445,45317,...,15262,27356,37641,7557,33651,26940,48535,33334,33822,44936
1,"(1, 17218)",46089,40637,46059,23671,33626,28443,28860,23035,31056,...,42870,28722,48837,30445,36181,17873,13784,19295,12251,9181
2,"(2, 20064)",39083,7256,21149,46691,36778,14638,2116,6568,28213,...,37381,21613,43629,16219,37867,48796,37570,36511,31203,12263
3,"(3, 31201)",1482,10841,16650,44613,21871,20208,30299,20268,39559,...,24667,22175,7813,23954,43254,32134,9187,32105,44923,47325
4,"(4, 1507)",15756,17119,32699,5964,38759,7422,48047,46227,15799,...,19027,12027,31419,26470,14738,29389,23464,46463,5522,9107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,"(9995, 3930)",34751,17376,9541,32789,21904,2366,47634,15902,15164,...,13223,9567,18048,2030,8954,42675,16378,14636,47266,424
9996,"(9996, 37794)",42772,5081,48642,46326,38345,39369,19029,42648,13448,...,7682,2947,29340,32057,29349,17221,2043,21562,32346,6696
9997,"(9997, 38503)",44631,36357,31664,29780,8418,30403,27650,22999,37545,...,20742,47562,32641,37474,161,5396,41014,15516,46130,11242
9998,"(9998, 10946)",46444,8908,5509,26037,24206,36365,13277,15636,46888,...,37685,19078,22310,13807,8733,1325,11478,9481,3824,5717


In [26]:
class Loader():

    def __init__(self):
        pass

    def load_dataset(self):
        """
        데이터 로드 함수

        uids: train user
        iids: train item
        users: 전체 user
        items: 전체 item
        df_train
        df_test
        """
        # 데이터 로드
        file_path = './data/lastfm-dataset-360K/lastfm-dataset-360K/'
        df = pd.read_csv(file_path + '/usersha1-artmbid-artname-plays.tsv', delimiter='\t', header=None)
        df = df.drop(df.columns[2], axis=1)
        df.columns = ['user', 'item', 'plays']
        df = df.dropna()
        df = df.loc[df.plays != 0]

        # user 샘플링
        sample_num = 100000
        unique_user_lst = list(np.unique(df['user']))  # 358857명
        sample_user_idx = np.random.choice(len(unique_user_lst), sample_num, replace=False)
        sample_user_lst = [unique_user_lst[idx] for idx in sample_user_idx]
        df = df[df['user'].isin(sample_user_lst)]
        df = df.reset_index(drop=True)

        # 1명 이상의 artist 데이터가 있는 user 만 사용
        df_count = df.groupby(['user']).count()
        df['count'] = df.groupby('user')['user'].transform('count')
        df = df[df['count'] > 1]

        # user, item 아이디 부여
        df['user_id'] = df['user'].astype("category").cat.codes
        df['item_id'] = df['item'].astype("category").cat.codes

        # lookup 테이블 생성
        item_lookup = df[['item_id', 'item']].drop_duplicates()
        item_lookup['item_id'] = item_lookup.item_id.astype(str)

        # train, test 데이터 생성
        df = df[['user_id', 'item_id', 'plays']]
        df_train, df_test = self.train_test_split(df)

        # 전체 user, item 리스트 생성
        users = list(np.sort(df.user_id.unique()))
        items = list(np.sort(df.item_id.unique()))

        # train user, item 리스트 생성
        rows = df_train['user_id'].astype(int)
        cols = df_train['item_id'].astype(int)
        values = list(df_train.plays)

        uids = np.array(rows.tolist())
        iids = np.array(cols.tolist())

        # 각 user 마다 negative item 생성
        df_neg = self.get_negatives(uids, iids, items, df_test)

        return uids, iids, df_train, df_test, df_neg, users, items, item_lookup

    def get_negatives(self, uids, iids, items, df_test):
        """
        negative item 리스트 생성함수
        """
        negativeList = []
        test_u = df_test['user_id'].values.tolist()
        test_i = df_test['item_id'].values.tolist()

        test_ratings = list(zip(test_u, test_i))  # test (user, item)세트
        zipped = set(zip(uids, iids))             # train (user, item)세트

        for (u, i) in test_ratings:

            negatives = []
            negatives.append((u, i))
            for t in range(100):
                j = np.random.randint(len(items))     # neg_item j 1개 샘플링
                while (u, j) in zipped:               # j가 train에 있으면 다시뽑고, 없으면 선택
                    j = np.random.randint(len(items))
                negatives.append(j)
            negativeList.append(negatives) # [(0,pos), neg, neg, ...]

        df_neg = pd.DataFrame(negativeList)

        return df_neg

    def mask_first(self, x):

        result = np.ones_like(x)
        result[0] = 0  # [0,1,1,....]

        return result

    def train_test_split(self, df):
        """
        train, test 나누는 함수
        """
        df_test = df.copy(deep=True)
        df_train = df.copy(deep=True)

        # df_test
        # user_id와 holdout_item_id(user가 플레이한 아이템 중 1개)뽑기
        df_test = df_test.groupby(['user_id']).first()
        df_test['user_id'] = df_test.index
        df_test = df_test[['user_id', 'item_id', 'plays']]
        df_test = df_test.reset_index(drop=True)

        # df_train
        # user_id 리스트에 make_first()적용
        mask = df.groupby(['user_id'])['user_id'].transform(self.mask_first).astype(bool)
        df_train = df.loc[mask]

        return df_train, df_test

    def get_train_instances(self, uids, iids, num_neg, num_items):
        """
        모델에 사용할 train 데이터 생성 함수
        """
        user_input, item_input, labels = [],[],[]
        zipped = set(zip(uids, iids)) # train (user, item) 세트

        for (u, i) in zip(uids, iids):

            # pos item 추가
            user_input.append(u)  # [u]
            item_input.append(i)  # [pos_i]
            labels.append(1)      # [1]

            # neg item 추가
            for t in range(num_neg):

                j = np.random.randint(num_items)      # neg_item j num_neg 개 샘플링
                while (u, j) in zipped:               # u가 j를 이미 선택했다면
                    j = np.random.randint(num_items)  # 다시 샘플링

                user_input.append(u)  # [u1, u1,  u1,  ...]
                item_input.append(j)  # [pos_i, neg_j1, neg_j2, ...]
                labels.append(0)      # [1, 0,  0,  ...]

        return user_input, item_input, labels

## (4) Matrix Factorization / Multi-layer Perceptron / Nerual CF

### `(4)-1. GMF (Generalized Matrix Factorization)`

In [27]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [28]:
class GMP:

    def __init__(self, user_num, item_num):

        latent_features = 8

        # User embedding
        user = Input(shape=(1,), dtype='int32')
        user_embedding = Embedding(user_num, latent_features, input_length=user.shape[1])(user)
        user_embedding = Flatten()(user_embedding)

        # Item embedding
        item = Input(shape=(1,), dtype='int32')
        item_embedding = Embedding(item_num, latent_features, input_length=item.shape[1])(item)
        item_embedding = Flatten()(item_embedding)

        # Merge
        concatenated = Multiply()([user_embedding, item_embedding])

        # Output
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(concatenated) # 1,1 / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer='adam', loss='binary_crossentropy')

    def get_model(self):
        model = self.model
        return model


### `(4)-2. MLP (Multi-Layer Peceptron)`

In [29]:
class MLP:

    def __init__(self, user_num, item_num):

        # User embedding
        user = Input(shape=(1,), dtype='int32')
        user_embedding = Embedding(user_num, 32, input_length=user.shape[1])(user)
        user_embedding = Flatten()(user_embedding)

        # Item embedding
        item = Input(shape=(1,), dtype='int32')
        item_embedding = Embedding(item_num, 32, input_length=item.shape[1])(item)
        item_embedding = Flatten()(item_embedding)

        # Merge
        concatenated = Concatenate()([user_embedding, item_embedding])
        dropout = Dropout(rate=0.2)(concatenated)

        # Layer1
        layer_1 = Dense(units=64, activation='relu', name='layer1')(dropout)  # (64,1)
        dropout1 = Dropout(rate=0.2, name='dropout1')(layer_1)                # (64,1)
        batch_norm1 = BatchNormalization(name='batch_norm1')(dropout1)        # (64,1)

        # Layer2
        layer_2 = Dense(units=32, activation='relu', name='layer2')(batch_norm1)  # (32,1)
        dropout2 = Dropout(rate=0.2, name='dropout2')(layer_2)                    # (32,1)
        batch_norm2 = BatchNormalization(name='batch_norm2')(dropout2)            # (32,1)

        # Layer3
        layer_3 = Dense(units=16, activation='relu', name='layer3')(batch_norm2)  # (16,1)

        # Layer4
        layer_4 = Dense(units=8, activation='relu', name='layer4')(layer_3)  # (8,1)

        # Output
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(layer_4)  # (1,1) / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer='adam', loss='binary_crossentropy')

    def get_model(self):
        model = self.model
        return model

### `(4)-3. GMF + MLP = Nueral MF`

In [30]:
class NeuMF:

    def __init__(self, user_num, item_num):

        latent_features = 8

        # Input
        user = Input(shape=(1,), dtype='int32')
        item = Input(shape=(1,), dtype='int32')

        # User embedding for GMF
        gmf_user_embedding = Embedding(user_num, latent_features, input_length=user.shape[1])(user)
        gmf_user_embedding = Flatten()(gmf_user_embedding)

        # Item embedding for GMF
        gmf_item_embedding = Embedding(item_num, latent_features, input_length=item.shape[1])(item)
        gmf_item_embedding = Flatten()(gmf_item_embedding)

        # User embedding for MLP
        mlp_user_embedding = Embedding(user_num, 32, input_length=user.shape[1])(user)
        mlp_user_embedding = Flatten()(mlp_user_embedding)

        # Item embedding for MLP
        mlp_item_embedding = Embedding(item_num, 32, input_length=item.shape[1])(item)
        mlp_item_embedding = Flatten()(mlp_item_embedding)

        # GMF layers
        gmf_mul =  Multiply()([gmf_user_embedding, gmf_item_embedding])

        # MLP layers
        mlp_concat = Concatenate()([mlp_user_embedding, mlp_item_embedding])
        mlp_dropout = Dropout(0.2)(mlp_concat)

        # Layer1
        mlp_layer_1 = Dense(units=64, activation='relu', name='mlp_layer1')(mlp_dropout)  # (64,1)
        mlp_dropout1 = Dropout(rate=0.2, name='dropout1')(mlp_layer_1)                    # (64,1)
        mlp_batch_norm1 = BatchNormalization(name='batch_norm1')(mlp_dropout1)            # (64,1)

        # Layer2
        mlp_layer_2 = Dense(units=32, activation='relu', name='mlp_layer2')(mlp_batch_norm1)  # (32,1)
        mlp_dropout2 = Dropout(rate=0.2, name='dropout2')(mlp_layer_2)                        # (32,1)
        mlp_batch_norm2 = BatchNormalization(name='batch_norm2')(mlp_dropout2)                # (32,1)

        # Layer3
        mlp_layer_3 = Dense(units=16, activation='relu', name='mlp_layer3')(mlp_batch_norm2)  # (16,1)

        # Layer4
        mlp_layer_4 = Dense(units=8, activation='relu', name='mlp_layer4')(mlp_layer_3)       # (8,1)

        # merge GMF + MLP
        merged_vector = tf.keras.layers.concatenate([gmf_mul, mlp_layer_4])

        # Output layer
        output_layer = Dense(1, kernel_initializer='lecun_uniform', name='output_layer')(merged_vector) # 1,1 / h(8,1)초기화

        # Model
        self.model = Model([user, item], output_layer)
        self.model.compile(optimizer= 'adam', loss= 'binary_crossentropy')

    def get_model(self):
        model = self.model
        return model

# (5) Measure - metric

In [31]:
import numpy as np
import heapq

In [32]:
class Metric:

    def __init__(self):
        pass

    def get_hits(self, k_ranked, holdout):
        """
        hit 생성 함수
        hit := holdout(df_test의 item)이 K순위 내에 있는지 여부
        """
        for item in k_ranked:
            if item == holdout:
                return 1
        return 0

    def eval_rating(self, idx, test_ratings, test_negatives, K, model):
        """
        holdout(df_test의 item)이 K순위 내에 있는지 평가하는 함수
        """
        items = test_negatives[idx]      # negative items [neg_item_id, ... ] (1,100)
        user_idx = test_ratings[idx][0]  # [user_id, item_id][0]
        holdout = test_ratings[idx][1]   # [user_id, item_id][1]
        items.append(holdout)            # holdout 추가 [neg_item_id, ..., holdout] (1,101)

        # prediction
        predict_user = np.full(len(items), user_idx, dtype='int32').reshape(-1, 1)  # [[user_id], ...], (101, 1)
        np_items = np.array(items).reshape(-1, 1)                                   # [[item_id], ... ], (101, 1)

        predictions = model.predict([predict_user, np_items])
        predictions = predictions.flatten().tolist()
        item_to_pre_score = {item:pre for item, pre in zip(items, predictions)}

        # 점수가 높은 상위 k개 아이템 리스트
        k_ranked = heapq.nlargest(K, item_to_pre_score, key=item_to_pre_score.get)

        # holdout이 상위 K 순위에 포함 되는지 체크
        # {1:포함, 0:포함x}
        hits = self.get_hits(k_ranked, holdout)

        return hits

    def evaluate_top_k(self, df_neg, df_test, model, K=10):
        """
        TOP-K metric을 사용해 모델을 평가하는 함수
        """
        hits = []
        test_u = df_test['user_id'].values.tolist()
        test_i = df_test['item_id'].values.tolist()

        test_ratings = list(zip(test_u, test_i))
        df_neg = df_neg.drop(df_neg.columns[0], axis=1)
        test_negatives = df_neg.values.tolist()  # [[(user_id, item_id=holdout)], neg_item,... ] (1,100)

        # user 샘플링
        sample_idx_lst = np.random.choice(len(test_ratings), int(len(test_ratings) * 0.3))
        for user_idx in sample_idx_lst:  # 전체 사용: range(len(test_ratings))

            hitrate = self.eval_rating(user_idx, test_ratings, test_negatives, K, model)
            hits.append(hitrate)  # ex. [1,0,1,1,0,...] (1, df_test.shape[0])

        return hits

----

In [33]:
from sklearn.utils import shuffle

# (6) Run

In [34]:
user_data_shuff, item_data_shuff, label_data_shuff = shuffle(user_input, item_input, labels)
user_data_shuff = np.array(user_data_shuff).reshape(-1,1)
item_data_shuff = np.array(item_data_shuff).reshape(-1,1)
label_data_shuff = np.array(label_data_shuff).reshape(-1,1)

In [35]:
nmf = NeuMF(len(users), len(items))  # Neural Collaborative Filtering
model = nmf.get_model()
model.fit([user_data_shuff, item_data_shuff], label_data_shuff, epochs=20,
               batch_size=256, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x248a8c47490>

In [36]:
def calculate_top_k_metric(df_neg, df_test, model):
    metric = Metric()
    hit_lst = metric.evaluate_top_k(df_neg, df_test, model, K=10)
    hit = np.mean(hit_lst)
    
    return hit

In [37]:
# top-k metric
top_k_metric = calculate_top_k_metric(df_neg, df_test, model)
print('metric:', top_k_metric)



















metric: 0.8646666666666667


In [38]:

# user 한 명에 대한 prediction 예시
user_id = 0
user_candidate_item = np.array([134, 6783, 27888, 8362, 25]).reshape(-1, 1)
user_input = np.full(len(user_candidate_item), user_id, dtype='int32').reshape(-1, 1)

predictions = model.predict([user_input, user_candidate_item])
predictions = predictions.flatten().tolist()
item_to_pre_score = {item[0]: pre for item, pre in zip(user_candidate_item, predictions)}  # 후보 아이템 별 예측값
item_to_pre_score = dict(sorted(item_to_pre_score.items(), key=lambda x: x[1], reverse=True))

recommend_item_lst = list(item_to_pre_score.keys())
print('recommend:', recommend_item_lst)

recommend: [6783, 134, 25, 27888, 8362]


In [39]:
filepath = './data/lastfm-dataset-360K/lastfm-dataset-360K/'

In [40]:
# import numpy as np
# from sklearn.utils import shuffle
# # from Loader import Loader
# # from Metric import Metric
# # from model.NeuMF import NeuMF

# class Run:

#     def __init__(self):

#         # data 로드
#         loader = Loader()

#         print('start data load..')

#         num_neg = 4
#         uids, iids, self.df_train, self.df_test, \
#         self.df_neg, self.users, self.items, item_lookup = loader.load_dataset()
#         user_input, item_input, labels = loader.get_train_instances(uids, iids, num_neg, len(self.items))

#         print('end data load..')

#         # input data 준비
#         user_data_shuff, item_data_shuff, label_data_shuff = shuffle(user_input, item_input, labels)
#         self.user_data_shuff = np.array(user_data_shuff).reshape(-1,1)
#         self.item_data_shuff = np.array(item_data_shuff).reshape(-1,1)
#         self.label_data_shuff = np.array(label_data_shuff).reshape(-1,1)

#     def run(self):

#         nmf = NeuMF(len(self.users), len(self.items))  # Neural Collaborative Filtering
#         self.model = nmf.get_model()
#         self.model.fit([self.user_data_shuff, self.item_data_shuff], self.label_data_shuff, epochs=20,
#                        batch_size=256, verbose=1)

#         return self.model

#     def calculate_top_k_metric(self):
#         metric = Metric()
#         hit_lst = metric.evaluate_top_k(self.df_neg, self.df_test, self.model, K=10)
#         hit = np.mean(hit_lst)

#         return hit

# if __name__ == '__main__':

#     ncf = Run()
#     model = ncf.run()

#     # top-k metric
#     top_k_metric = ncf.calculate_top_k_metric()
#     print('metric:', top_k_metric)

#     # user 한 명에 대한 prediction 예시
#     user_id = 0
#     user_candidate_item = np.array([134, 6783, 27888, 8362, 25]).reshape(-1, 1)
#     user_input = np.full(len(user_candidate_item), user_id, dtype='int32').reshape(-1, 1)

#     predictions = model.predict([user_input, user_candidate_item])
#     predictions = predictions.flatten().tolist()
#     item_to_pre_score = {item[0]: pre for item, pre in zip(user_candidate_item, predictions)}  # 후보 아이템 별 예측값
#     item_to_pre_score = dict(sorted(item_to_pre_score.items(), key=lambda x: x[1], reverse=True))

#     recommend_item_lst = list(item_to_pre_score.keys())
#     print('recommend:', recommend_item_lst)