# 추천 시스템

## 추천 알고리즘의 진화
1. 협업 필터링    
    기존 사용자의 행동 정보를 분석해 해당 사용자와 비슷한 성향의 사용자들이 기존에 좋아했던 항목을 추천하는 기술.
    * 단점
        - 새로운 항목이 추가되는 경우 초기 정보가 부족해 문제가 생긴다.
        - 계산량이 많아 사용자 수가 많은 경우 효율적으로 추천할 수 없다.
        - 롱테일 문제. 사용자들의 관심이 적은 다수의 항목은 추천을 위한 정보가 부족하다.
2. 콘텐츠 기반 필터링    
    항목 자체를 분석하여 추천을 구현한다.

http://ocelma.net/MusicRecommendationDataset/lastfm-360K.html

#### 컬럼의 의미
1. user mboxsha1 : user id
2. musicbrainz artist id : artist mbid
3. artist name
4. plays

In [34]:
import pandas as pd
import os

# Build the dataset path under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME') because getenv
# returns None when HOME is unset, which made the string concatenation raise
# a TypeError; os.path.join also avoids hand-written path separators.
fname = os.path.join(
    os.path.expanduser('~'),
    'aiffel', 'recommendata_iu', 'data', 'lastfm-dataset-360K',
    'usersha1-artmbid-artname-plays.tsv',
)
# Column labels are supplied explicitly via names=, so the first line of the
# tab-separated file is read as data rather than as a header.
col_names = ['user_id', 'artist_MBID', 'artist', 'play']
data = pd.read_csv(fname, sep='\t', names=col_names)
data.head(10)

Unnamed: 0,user_id,artist_MBID,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,6531c8b1-76ea-4141-b270-eb1ac5b41375,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,21f3573f-10cf-44b3-aeaa-26cccd8448b5,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,0639533a-0402-40ba-b6e0-18b067198b73,lunachicks,403


In [35]:
# artist_MBID is not needed, so keep only the remaining three columns.
use_col = ['user_id', 'artist', 'play']
data = data[use_col]
data.head()

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


In [36]:
# Lower-case the artist names so that they are easier to search for.
data['artist'] = data['artist'].str.lower()
data.head()

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706


## EDA

In [37]:
# Number of distinct users.
data['user_id'].nunique()

358868

In [38]:
# Number of distinct artists.
data['artist'].nunique()

291346

In [39]:
# Popular artists: count the user_id entries recorded for each artist, then
# show positions 50-99 of the descending ranking (the 51st to 100th entries).
artist_count = data.groupby('artist')['user_id'].count()
artist_count.sort_values(ascending=False).iloc[50:100]

artist
nightwish                   24222
blink-182                   23226
the offspring               22641
gorillaz                    22336
incubus                     21824
r.e.m.                      21592
the smiths                  21275
belle and sebastian         21231
feist                       21134
koяn                        21124
the strokes                 21106
britney spears              20816
modest mouse                20772
tool                        20741
interpol                    20536
snow patrol                 20445
pearl jam                   20381
evanescence                 20354
fall out boy                20316
queens of the stone age     20243
sufjan stevens              20084
röyksopp                    20070
pixies                      19982
tom waits                   19976
rage against the machine    19919
the kooks                   19851
bob marley                  19792
in flames                   19780
marilyn manson              19654
arcade 

In [40]:
# How many artists each user listens to: count the artist entries per user
# and show the 30 users with the highest counts.
user_count = data.groupby('user_id')['artist'].count()
user_count.sort_values(ascending=False).head(30)

user_id
2040b3d3cb47caace19e728090a88d6b1087aa8e    166
13c0df8a58cd3ca9e37d8e90662d2ad3c062eaa5    166
52bf0682648b44e6cc27ebe512dd4fa3455a11c2    150
7679da839e1a6c2d299c7fd6834fb9bf6e229224    139
8d8643cded0ac18e66a3c5a2b74de21fbca0e6b7    132
ce52b233f731dda7325aae6242440bd2a7095fca    131
0a7c493718902fe4590cad2f76db9abb6dce6fae    131
209b31e32790bdaa80e76ce9bb97e13e3980e1a5    128
e710c9f69c2d07f914e7b1fa533b3e9f27890d62    128
e85a4c61ca10501e09c979d411e984afc0a2393e    124
a0e128cc755f7a2d4404f0c2cb6a573d5c68f224    124
544719ea0ba39d378d2961453efcd8d57d54e616    123
5a07ab9831351b7658540a914ab56820168046ae    122
7075b80254d6280a46486fde3f224d66e06def12    122
45d990ea5813d9d7280be38e516eef43ce46e7fa    121
317d6010df721448103070d97a75b84164f3730d    121
c79fe89cc6c1bdc52a8f4dd350a31f5ac87828db    121
d483a856b7c113081d16ef48c9236f7957415059    120
cf7099ce5c5037f5cd7aef2770144219518e39bc    119
134ca93856c9ec74c6e1ecc95b1d1aadf185e53f    118
b4609162c3772ea49a89ecd2610424ef

In [41]:
# Summary statistics of the per-user artist counts.
user_count.describe()

count    358868.000000
mean         48.863234
std           8.524272
min           1.000000
25%          46.000000
50%          49.000000
75%          51.000000
max         166.000000
Name: artist, dtype: float64

In [42]:
# Statistics of each user's median play count.
user_median=data.groupby('user_id')['play'].median()
user_median.describe()

count    358868.000000
mean        142.187676
std         213.089902
min           1.000000
25%          32.000000
50%          83.000000
75%         180.000000
max       50142.000000
Name: play, dtype: float64

## 초기 정보 세팅

In [43]:
# My five favorite artists. Each name must match an artist string in `data`
# exactly (those were lower-cased earlier); a name that matches nothing is
# simply treated as a new, separate artist when the indices are built.
# NOTE(review): 'maroon5' may be spelled 'maroon 5' in the dataset - verify.
my_favorite = ['muse', 'ed sheeran', 'maroon5', 'nicki minaj', 'taylor swift']

# One row per favorite artist for the synthetic user 'hyelim', each with a
# play count of 30.
my_playlist = pd.DataFrame({'user_id': ['hyelim']*5, 'artist': my_favorite, 'play':[30]*5})

# Guard so that re-running the cell does not add the rows a second time.
# Only the user_id column is compared; the previous DataFrame.isin call built
# a boolean frame over every column just to read this one.
if not (data['user_id'] == 'hyelim').any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement. Row labels are kept as they are (no ignore_index), which
    # matches what append did.
    data = pd.concat([data, my_playlist])

data.tail(10)

Unnamed: 0,user_id,artist,play
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10
17535654,"sep 20, 2008",the smiths,10
0,hyelim,muse,30
1,hyelim,ed sheeran,30
2,hyelim,maroon5,30
3,hyelim,nicki minaj,30
4,hyelim,taylor swift,30


## 전처리

user와 artist에 번호를 붙인다 = indexing

In [44]:
# Distinct users and artists, in the order pandas' unique() returns them.
user_unique = data['user_id'].unique()
artist_unique = data['artist'].unique()

# Map every distinct value to its position in the corresponding unique array.
user_to_idx = {name: position for position, name in enumerate(user_unique)}
artist_to_idx = {name: position for position, name in enumerate(artist_unique)}

In [45]:
# Integer index assigned to the user 'hyelim'.
user_to_idx['hyelim']

358868

In [46]:
# Replace the user_id strings with their integer indices. Values that have no
# entry in the mapping come back as missing and are dropped, so a length
# mismatch means at least one value could not be indexed.
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) != len(data):
    print('user_id indexing fail')
else:
    print('user_id indexing succes')
    data['user_id'] = temp_user_data

# Same procedure for the artist column.
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) != len(data):
    print('artist indexing fail')
else:
    print('artist indexing succes')
    data['artist'] = temp_artist_data

user_id indexing succes
artist indexing succes


## 명시적/암묵적 평가
#### 1. Explicit Dataset (명시적)
* 유저가 선호도를 직접 표현.
* 유저들의 선호도를 명확하게 알 수 있으나 데이터를 얻기가 힘듦.

#### 2. Implicit Dataset (암묵적)
* 간접적으로 선호, 취향을 나타냄.
* 알아둬야할 특징
    1. 부정적인 피드백이 없다.
    2. 잡음이 많다.
    3. 수치는 신뢰도를 의미한다.
    4. 평가는 적절한 방법을 고민해야 한다.   


In [47]:
# Share of rows that were played only once (rows whose play count is below 2).
only_one_play = data.loc[data['play'] < 2]
one = len(only_one_play)
all_data = len(data)
print(f'{one}, {all_data}')
print(f'ratio: {one/all_data:.2%}')

147740, 17535660
ratio: 0.84%


## Matrix Factorization(MF)
평가 행렬의 행과 열의 수가 많아지면 메모리가 많이 필요하게 된다.    
이에 좋은 해결방안은 CSR Matrix를 사용하는 것이다.    
https://lovit.github.io/nlp/machine%20learning/2018/04/09/sparse_mtarix_handling/#csr-matrix

In [48]:
from scipy.sparse import csr_matrix
# The matrix has one row per distinct user and one column per distinct artist.
num_user = data['user_id'].nunique()
num_artist = data['artist'].nunique()

# Build the user x artist play-count matrix from (value, (row, column))
# triplets: the play count is the value, user_id the row, artist the column.
csr_data = csr_matrix(
    (data['play'], (data['user_id'], data['artist'])),
    shape=(num_user, num_artist),
)
csr_data

<358869x291347 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535578 stored elements in Compressed Sparse Row format>

In [49]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the learning
# content itself.
# NOTE(review): these variables are set after numpy and implicit have already
# been imported. BLAS libraries commonly read such variables when they are
# loaded, so the assignments may have no effect here - confirm, and consider
# setting them before the first numpy import.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [50]:
# Declare the ALS model (CPU, 100 factors, 15 iterations, float32).
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [51]:
# The ALS model takes an (item x user) matrix as input, so transpose the
# (user x item) matrix first.
csr_data_transpose = csr_data.T
csr_data_transpose

<291347x358869 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535578 stored elements in Compressed Sparse Column format>

In [52]:
# Train the model on the transposed (item x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [53]:
# Integer indices of the user 'hyelim' and the artist 'muse'.
hyelim = user_to_idx['hyelim']
muse = artist_to_idx['muse']
# Rows of the trained model's user and item factor arrays at those indices.
hyelim_vector = als_model.user_factors[hyelim]
muse_vector = als_model.item_factors[muse]

In [54]:
# Shapes of the two factor vectors.
hyelim_vector.shape, muse_vector.shape

((100,), (100,))

In [55]:
# Dot product of the user vector and the 'muse' item vector.
np.dot(hyelim_vector, muse_vector)

0.27387065

In [56]:
# Same dot product for the artist 'queen', which is not in my_favorite.
queen = artist_to_idx['queen']
queen_vector = als_model.item_factors[queen]
np.dot(hyelim_vector, queen_vector)

0.2367901

In [57]:
# Find artists similar to a given one: look up its index and ask the model
# for the 15 most similar items.
favorite_artist = 'ed sheeran'
artist_id = artist_to_idx[favorite_artist]
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

[(157993, 1.0000001),
 (157994, 0.9803823),
 (104282, 0.97246194),
 (159710, 0.97245663),
 (156798, 0.97235817),
 (156799, 0.97160715),
 (148349, 0.9713299),
 (280347, 0.9712644),
 (112914, 0.97114736),
 (122556, 0.97114),
 (283403, 0.97112435),
 (280345, 0.9710865),
 (93609, 0.97108084),
 (218417, 0.971054),
 (280346, 0.97098076)]

In [58]:
# Reverse lookup table: artist index -> artist name.
idx_to_artist = {index: name for name, index in artist_to_idx.items()}
# Each result entry carries the artist index first; turn it back into a name.
[idx_to_artist[entry[0]] for entry in similar_artist]

['ed sheeran',
 'george, inara',
 'dick prall',
 'dayna manning',
 'pet engine',
 'chris shaffer',
 'flere farver',
 'g-bach',
 'continental drifters',
 'alliancen',
 'nancy tucker',
 'under mistanke',
 'rose cousins',
 'brian wright',
 'hustlerne']

In [59]:
def get_similar_artist(artist_name: str):
    """Return the names of the artists the ALS model rates as most similar.

    Args:
        artist_name: Artist to look up. The name is lower-cased before the
            lookup because the artist column was lower-cased before the index
            was built, so 'Muse' and 'muse' resolve to the same artist.

    Returns:
        A list of artist names, in the order returned by
        ``als_model.similar_items``.

    Raises:
        KeyError: If the lower-cased name is not a key of ``artist_to_idx``.
    """
    artist_id = artist_to_idx[artist_name.lower()]
    similar_artist = als_model.similar_items(artist_id)
    # Each result entry carries the artist index first; map it back to a name.
    return [idx_to_artist[entry[0]] for entry in similar_artist]

In [60]:
# Artists the model considers similar to 'nicki minaj'.
get_similar_artist('nicki minaj')

['nicki minaj',
 'teairra mari',
 'chrishan',
 'shareefa',
 'drag-on',
 'adina howard',
 'remy ma',
 'electrik red',
 'young gunz',
 'tha riot squad']

In [61]:
# Recommend artists to a user.
user = user_to_idx['hyelim']
# recommend takes the (user x item) CSR matrix.
# Ask for 20 items, filtering out the ones the user already has.
artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
artist_recommended

[(62, 0.2812308),
 (247, 0.27966008),
 (5, 0.26867846),
 (217, 0.2590966),
 (28, 0.25396386),
 (910, 0.25253657),
 (268, 0.24542755),
 (418, 0.24530497),
 (473, 0.24214755),
 (490, 0.23941675),
 (352, 0.23819721),
 (694, 0.23706716),
 (75, 0.23679012),
 (773, 0.22893448),
 (503, 0.2265356),
 (409, 0.2212239),
 (279, 0.21597335),
 (55, 0.21524605),
 (1170, 0.21517622),
 (1018, 0.21176943)]

In [62]:
# Convert the recommended artist indices back to artist names.
[idx_to_artist[i[0]] for i in artist_recommended]

['coldplay',
 'the beatles',
 'red hot chili peppers',
 'radiohead',
 'the killers',
 'nirvana',
 'pink floyd',
 'u2',
 'placebo',
 'oasis',
 'linkin park',
 'foo fighters',
 'queen',
 'nine inch nails',
 'green day',
 'amy winehouse',
 'led zeppelin',
 'arctic monkeys',
 'daft punk',
 'the smashing pumpkins']

In [63]:
# Ask the model to explain the 'coldplay' recommendation for this user.
coldplay = artist_to_idx['coldplay']
explain = als_model.explain(user, csr_data, itemid=coldplay)

# The second element of the result is read as (artist index, value) pairs;
# presumably each value is that artist's contribution to the score - confirm
# against the implicit documentation.
[(idx_to_artist[i[0]], i[1]) for i in explain[1]]

[('muse', 0.26107202172669375),
 ('taylor swift', 0.029306265639813263),
 ('nicki minaj', 0.00235511599904646),
 ('maroon5', -0.0009112972936581024),
 ('ed sheeran', -0.0038653179910136544)]