# 목표
- 추천시스템 개념과 목적
- implicit 라이브러리를 활용하여 matrix factorization(MF)기반의 추천 모델
- 음악 감상 기록을 활용하여 비슷한 아티스트를 찾고 아티스트 추천
- 추천 시스템에서 자주 사용되는 데이터 구조인 CSR Matrix
- Explicit data와 implicit data의 차이점을 익힌다
- 새로운 데이터셋으로 직접 추천 모델을 만든다.

### 데이터 열어보기

In [1]:
import pandas as pd
import os
fname = os.getenv("HOME")+'/aiffel/recommendata_iu/data/lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv'
col_names = ['user_id','artist_MBID','artist','play']
data = pd.read_csv(fname, sep='\t', names = col_names)
data.head(10)

Unnamed: 0,user_id,artist_MBID,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,8bfac288-ccc5-448d-9573-c33ea2aa5c30,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,6531c8b1-76ea-4141-b270-eb1ac5b41375,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,21f3573f-10cf-44b3-aeaa-26cccd8448b5,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,c5db90c4-580d-4f33-b364-fbaa5a3a58b5,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,0639533a-0402-40ba-b6e0-18b067198b73,lunachicks,403


### artist_MBID 제거

In [2]:
using_cols = ['user_id','artist','play']
data = data[using_cols]
data.head(10)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


### artist문자열 소문자로

In [3]:
data['artist']=data['artist'].str.lower()
data.head(10)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


### 첫번째 유저의 플레이리스트

In [4]:
condition = (data['user_id']==data.loc[0, 'user_id'])
data.loc[condition]

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


# 데이터 탐색
### 유저수, 아티스트수, 인기많은 아티스트
### 유저들이 몇 명의 아티스트를 듣고 있는지 통계
### 유저 play 횟수 중앙값에 대한 통계
- ``` pandas.DataFrame.nunique() ``` : 특정 컬럼에 포함된 유니크한 데이터 개수

### 유저 수

In [5]:
data['user_id'].nunique()

358868

### 아티스트 수

In [6]:
data['artist'].nunique()

291346

### 인기많은 아티스트

In [7]:
artist_count = data.groupby('artist')['user_id'].count()
artist_count.sort_values(ascending=False).head(50)

artist
radiohead                77254
the beatles              76245
coldplay                 66658
red hot chili peppers    48924
muse                     46954
metallica                45233
pink floyd               44443
the killers              41229
linkin park              39773
nirvana                  39479
system of a down         37267
queen                    34174
u2                       33206
daft punk                33001
the cure                 32624
led zeppelin             32295
placebo                  32072
depeche mode             31916
david bowie              31862
bob dylan                31799
death cab for cutie      31482
arctic monkeys           30348
foo fighters             30144
air                      29795
the rolling stones       29754
nine inch nails          28946
sigur rós                28901
green day                28732
massive attack           28691
moby                     28232
amy winehouse            28210
portishead               28072
r

### 유저별 몇 명의 아티스트를 듣고 있는지

In [8]:
user_count = data.groupby('user_id')['artist'].count()
user_count.describe()

count    358868.000000
mean         48.863234
std           8.524272
min           1.000000
25%          46.000000
50%          49.000000
75%          51.000000
max         166.000000
Name: artist, dtype: float64

### 유저별 play횟수 중앙값에 대한 통계

In [9]:
user_median = data.groupby('user_id')['play'].median()
user_median.describe()

count    358868.000000
mean        142.187676
std         213.089902
min           1.000000
25%          32.000000
50%          83.000000
75%         180.000000
max       50142.000000
Name: play, dtype: float64

# 모델 검증을 위한 사용자 초기 정보 세팅
- 추천시스템의 추후 검증 과정을 위해 무엇을 좋아하는지 정보를 데이터셋에 5개이상 추가해야한다.

In [10]:
# 본인이 좋아하는 아티스트
my_favorite=['coldplay','maroon5','daft punk','oasis','queen']

# 새로운 사람이 아티스트노래30회씩
my_playlist = pd.DataFrame({'user_id':['yong']*5, 'artist':my_favorite, 'play':[30]*5})
if not data.isin({'user_id':['yong']})['user_id'].any():
    data = data.append(my_playlist)
data.tail(10)

Unnamed: 0,user_id,artist,play
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10
17535654,"sep 20, 2008",the smiths,10
0,yong,coldplay,30
1,yong,maroon5,30
2,yong,daft punk,30
3,yong,oasis,30
4,yong,queen,30


# 데이터 전처리
- 데이터 관리를 쉽게 하기 위하여 번호를 붙인다.
- user,artist에 각각 번호를 붙인다.(indexing)
- ``` pandas.DataFrame.unique() ``` : 특정 컬럼에 포함된 유니크한 데이터만 모아 준다.

In [11]:
# 유저, 아티스트를 찾는 코드
user_unique = data['user_id'].unique()
artist_unique = data['artist'].unique()

# 유저, 아티스트 indexing
user_to_idx = {v:k for k,v in enumerate(user_unique)}
artist_to_idx = {v:k for k,v in enumerate(artist_unique)}

In [12]:
# 인덱싱확인
print(user_to_idx['yong'])
print(artist_to_idx['queen'])

358868
75


In [13]:
# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# user_to_idx.get으로 user_id컬럼의 모든 값을 인덱싱값으로

# user_to_idx 결측치 제거
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) == len(data):
    print('artist column indexing OK!!')
    data['artist'] = temp_artist_data
else:
    print('artist column indexing Fail!!')

data

user_id column indexing OK!!
artist column indexing OK!!


Unnamed: 0,user_id,artist,play
0,0,0,2137
1,0,1,1099
2,0,2,897
3,0,3,717
4,0,4,706
...,...,...,...
0,358868,62,30
1,358868,270115,30
2,358868,1170,30
3,358868,490,30


# 사용자의 명시적 암묵적 평가

In [14]:
# 1회만 플레이한 데이터 비율
only_one = data[data['play']==1]
one, all_data = len(only_one), len(data)
print('{},{}'.format(one, all_data))
print('Ratio of only_one over all data is {:.2%}'.format(one/all_data))

147739,17535660
Ratio of only_one over all data is 0.84%


# CSR(Compressed Sparse Row) Matrix

- 거대한 행렬을 조그마한 메모리에 올려놓는것은 불가능하므로 CSR을 사용한다.
- Sparse한 matrix에서 0이 아닌 데이터로 채워지는 데이터의 값과 좌표 정보만으로 구성하여 메모리 사용량을 최소화하고 전체데이터와 동일한 행렬을 표현할 수 있도록 하는 데이터 구조

![%E1%84%89%E1%85%B3%E1%84%8F%E1%85%B3%E1%84%85%E1%85%B5%E1%86%AB%E1%84%89%E1%85%A3%E1%86%BA%202021-11-02%2016.32.31.png](attachment:%E1%84%89%E1%85%B3%E1%84%8F%E1%85%B3%E1%84%85%E1%85%B5%E1%86%AB%E1%84%89%E1%85%A3%E1%86%BA%202021-11-02%2016.32.31.png)

In [16]:
from scipy.sparse import csr_matrix
num_user = data['user_id'].nunique()
num_artist = data['artist'].nunique()

csr_data = csr_matrix((data.play,(data.user_id, data.artist)), shape=(num_user, num_artist))
csr_data

<358869x291347 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535578 stored elements in Compressed Sparse Row format>

# MF 모델 학습
### implicit패키지활용
- implicit 패키지는 암묵적 데이터셋을 사용하는 모델을 빠르게 학습할 수 있는 패키지
- ```als```모델을 사용

In [17]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

### AlternatingLeastSquares
- 1. factors : 유저와 아이템의 벡터를 몇 차원으로
- 2. regularization : 과적합 방지를 위해 정규화 값을 얼마나 사용하는지
- 3. use_gpu : GPU사용하는지
- 4. iterations: epochs

In [18]:
# 모델선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [19]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<291347x358869 sparse matrix of type '<class 'numpy.longlong'>'
	with 17535578 stored elements in Compressed Sparse Column format>

In [20]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### 두 벡터의 곱

In [22]:
yong, maroon5 = user_to_idx['yong'], artist_to_idx['maroon5']
yong_vector, maroon5_vector = als_model.user_factors[yong], als_model.item_factors[maroon5]

In [23]:
yong_vector

array([-1.3293152 , -0.53870493, -0.25375843, -0.46984172, -1.0050867 ,
        0.7488759 , -0.73325974,  1.5992128 ,  0.26788104, -0.6425824 ,
        0.94047374, -0.38389552,  0.9766642 ,  0.7465302 ,  0.1429866 ,
        0.17457859,  2.3758366 , -1.235189  ,  1.7049726 ,  0.01022277,
        1.2930738 ,  0.4556823 , -1.3234217 ,  1.2751338 ,  1.1121773 ,
       -0.7007179 , -0.39152354,  0.7201761 ,  0.02575181,  0.18190391,
        1.4648052 ,  1.1828033 ,  0.06965382, -1.6003385 ,  0.12759186,
       -0.74724233, -0.08103454,  0.39196408, -0.57374465, -1.1272417 ,
       -0.05720612,  1.5173415 , -0.41622752, -0.6668502 ,  0.7074256 ,
       -0.28986424,  0.33625495,  0.05615618, -0.47049674,  1.2968152 ,
       -0.08997826, -1.1646594 ,  0.9775686 ,  0.33289024, -1.3626711 ,
        0.21147731,  0.24545835, -0.6523513 ,  0.5913997 , -0.19655965,
        0.8744882 ,  0.31352067,  1.1539861 , -0.41238132, -0.7384179 ,
       -0.16837297,  0.7840831 , -0.18939033, -1.143171  , -0.06

In [24]:
maroon5_vector

array([0.00510025, 0.00509578, 0.0044086 , 0.00426884, 0.00486897,
       0.00414159, 0.00454335, 0.00466499, 0.00492888, 0.00400133,
       0.00515772, 0.00529287, 0.00401989, 0.00528362, 0.00471784,
       0.00359867, 0.00440048, 0.00509222, 0.0038756 , 0.00479221,
       0.0043562 , 0.00458725, 0.00548376, 0.00416573, 0.00452606,
       0.00475858, 0.0060093 , 0.00333826, 0.00449468, 0.00513978,
       0.00326284, 0.00387609, 0.00468378, 0.00589765, 0.00519247,
       0.00585251, 0.00315931, 0.00470523, 0.00581049, 0.00422341,
       0.0039963 , 0.00280326, 0.00481117, 0.00599739, 0.00439988,
       0.00441042, 0.00438404, 0.00494295, 0.00433262, 0.00414203,
       0.00451828, 0.00414409, 0.00497036, 0.00608997, 0.00437229,
       0.0049862 , 0.00459914, 0.00477757, 0.00484842, 0.00405429,
       0.00308478, 0.00309919, 0.00461385, 0.00540596, 0.00570166,
       0.00485315, 0.00425812, 0.00433903, 0.00593985, 0.00436103,
       0.00611049, 0.00566968, 0.0051437 , 0.0054912 , 0.00573

In [25]:
np.dot(yong_vector, maroon5_vector)

-0.0013867511

In [26]:
queen = artist_to_idx['queen']
queen_vector = als_model.item_factors[queen]
np.dot(yong_vector, queen_vector)

0.5282145

### 비슷한 아티스트 찾기
- ``` AlternatingLeastSquares```의 ```similar_items``` 메소드로 비슷한 아티스트 찾기

In [28]:
artist_id = artist_to_idx['coldplay']
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

[(62, 1.0),
 (277, 0.9867721),
 (28, 0.97780466),
 (5, 0.9734462),
 (473, 0.9729594),
 (217, 0.9692197),
 (247, 0.96164393),
 (490, 0.9500382),
 (694, 0.9486332),
 (1018, 0.9441598),
 (910, 0.943473),
 (418, 0.9428705),
 (782, 0.9392625),
 (268, 0.9356048),
 (531, 0.93090904)]

In [29]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성합니다. 
idx_to_artist = {v:k for k,v in artist_to_idx.items()}
[idx_to_artist[i[0]] for i in similar_artist]

['coldplay',
 'muse',
 'the killers',
 'red hot chili peppers',
 'placebo',
 'radiohead',
 'the beatles',
 'oasis',
 'foo fighters',
 'the smashing pumpkins',
 'nirvana',
 'u2',
 'the white stripes',
 'pink floyd',
 'depeche mode']

In [30]:
# to idx 반복하기 위한 함수
def get_similar_artist(artist_name: str):
    artist_id = artist_to_idx[artist_name]
    similar_artist = als_model.similar_items(artist_id)
    similar_artist = [idx_to_artist[i[0]] for i in similar_artist]
    return similar_artist

In [33]:

get_similar_artist('jackson')

['jackson',
 'mike oconnel',
 'franz & shape',
 'steve fisk & benjamin gibbard',
 'jackson & his computer band',
 'dj shadow & coldcut',
 'eighteenth street lounge soundtracks',
 'the disk orchestra',
 'erik satin',
 'tommy hools']

### 아티스트 추천
- ``` AlternatingLeastSquares``` 클래스의 ```recommend``` 메소드를 통하여 아티스트 추천
- ``` filter_already_liked_items``` 는 유저가 이미 평가한 아이템은 제외

In [35]:
user=user_to_idx['yong']

artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
artist_recommended

[(247, 0.5979566),
 (277, 0.5769769),
 (217, 0.57416916),
 (5, 0.57133204),
 (268, 0.5515888),
 (28, 0.54896134),
 (910, 0.5422063),
 (418, 0.53937805),
 (473, 0.5297363),
 (531, 0.51114583),
 (694, 0.51017255),
 (776, 0.49651554),
 (1018, 0.4930751),
 (782, 0.49257118),
 (55, 0.48595804),
 (773, 0.484257),
 (279, 0.48241213),
 (1500, 0.46713462),
 (457, 0.4622238),
 (1406, 0.45753008)]

In [36]:
[idx_to_artist[i[0]] for i in artist_recommended]

['the beatles',
 'muse',
 'radiohead',
 'red hot chili peppers',
 'pink floyd',
 'the killers',
 'nirvana',
 'u2',
 'placebo',
 'depeche mode',
 'foo fighters',
 'the cure',
 'the smashing pumpkins',
 'the white stripes',
 'arctic monkeys',
 'nine inch nails',
 'led zeppelin',
 'gorillaz',
 'franz ferdinand',
 'the doors']

### 추천 기여한 정도 확인

In [37]:
beatles = artist_to_idx['beatles']
explain = als_model.explain(user, csr_data, itemid=beatles)

In [38]:
[(idx_to_artist[i[0]], i[1]) for i in explain[1]]

[('queen', 0.004301023863192517),
 ('daft punk', 0.0029766700293032515),
 ('coldplay', 0.0009760684992351375),
 ('maroon5', 1.4972575158953023e-05),
 ('oasis', -0.0012497770950446688)]