# Item based collaborative filtering

In [165]:
import pandas as pd
from collections import defaultdict
%matplotlib inline

- 데이터가 있는 디렉토리 경로 설정

In [166]:
home_dir = './music-ds/'

## CSV파일을 읽어 데이터프레임 만들기

In [167]:
## music streaming log data
streamings = pd.read_csv(home_dir+'streaming_ds.csv', 
                                names = ['MEM_UNO', 'SONG_ID', 'ARTIST_ID', 'GENRE_CODE', 'REG_DATE'], index_col = 1)
#streamings = streamings.convert_objects(convert_numeric=True).dropna()
genre = pd.read_csv(home_dir+'genre.csv', 
                              names = ['GENRE_CODE', 'GENRE_NAME'], index_col = 0)
artist = pd.read_csv(home_dir+'artist.csv', 
                              names = ['ARTIST_ID', 'ARTIST_NAME'], index_col = 0)

In [169]:
streamings.head()

Unnamed: 0_level_0,MEM_UNO,ARTIST_ID,GENRE_CODE,REG_DATE
SONG_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
15961709,302774898,14948305,L0602,20150901081803
15961710,305823986,14948305,L0602,20150901084922
15962335,303507290,14945136,L0902,20150901081553
15962336,303519720,14945123,L0902,20150901080234
15962336,301688860,14945123,L0902,20150901080406


In [170]:
genre.head()

Unnamed: 0_level_0,GENRE_NAME
GENRE_CODE,Unnamed: 1_level_1
L0001,유선
L0002,무선
L0003,내부
L0004,외부
L0005,WEB


In [171]:
artist.head()

Unnamed: 0_level_0,ARTIST_NAME
ARTIST_ID,Unnamed: 1_level_1
14935950,방실이
14935951,이예린
14935952,신윤정
14935953,제이 (J.ae)
14935954,장혜진


## 유저별 음원 set 만들기

In [172]:
#music_user_dict
music_user_dict = defaultdict(set)
for idx, row in streamings[:].iterrows():
    music_user_dict[idx].add(int(row.MEM_UNO))

## 유사한 음원 찾기

In [173]:
def nearest_neighbors(song_id, topn=10, threshold=3 ):
    item1 = music_user_dict[song_id]
    if len(item1) < threshold:
        return []
    nn = []
    for song_id2, item2 in music_user_dict.items():
        intersections = len(item1.intersection(item2))
        if intersections == 0 or song_id2 == song_id:
            continue
        unions = len(item1.union(item2))+1
        similarity = intersections / unions 
        nn.append([song_id2,similarity])
    return sorted(nn,key=lambda x: x[1], reverse=True)[:topn]

def print_artist(song_list):
    for song in song_list:
        try:
            if type(song) in { tuple, list} :
                song = song[0]
            song = int(song)
            artist_id = streamings.loc[song][:1].ARTIST_ID.tolist()[0]
            n = artist.loc[artist_id]['ARTIST_NAME']
            print(song, n)
        except:
            print(song, 'no-artist')

In [174]:
## 84699045 원더걸스 (Wonder Girls) 유사한 음원은?
ret = nearest_neighbors(84699045, topn=20)
print_artist([84699045]+ret)

84699045 원더걸스 (Wonder Girls)
84686902 BIGBANG
84581811 여자친구 (GFRIEND)
84568778 Apink (에이핑크)
84201067 BIGBANG
84847396 현아 (4Minute)
84369985 마마무
84168515 혁오 (hyukoh)
84372538 씨스타 (Sistar)
84372192 AOA
84686903 BIGBANG
84059724 Zion.T
84760929 Simon Dominic
84064039 백아연
82821208 Zion.T
84699325 자메즈 & 앤덥 & 송민호
84801754 소녀시대
84822793 SG워너비
84672668 유승우
84822791 SG워너비
82812257 혁오 (hyukoh)


## 유저의 관심도를 이용하여 음원 추천하기

In [175]:
def user_interests(uno):
    return streamings[streamings.MEM_UNO==uno].index.tolist()
    
def cal_score(sim, pre_score):
    return  sim if pre_score < sim else pre_score
    
def recommend_music(uno, topn=10):
    musics = defaultdict(float)
    interests = user_interests(uno)
    for song in interests:
        neighbors = nearest_neighbors(song)
        for (song2, sim) in neighbors:
            #if song2 in interests:
            #    continue
            pre = musics.get(song2, 0.0)
            musics[song2] = cal_score(sim, pre)

    return {'recommended' : sorted(musics.items(), key=lambda x: x[1], reverse=True)[:topn],
            'seens':interests}

In [176]:
ret = recommend_music(301863475, topn=20)
print('*** 추천 음악 ***')
print_artist(ret['recommended'])
print('*** 시청한 음악 ***')
print_artist(ret['seens'])

*** 추천 음악 ***
80696497 씨야 (SeeYa)
16098763 뱅크
75195354 플라이 투 더 스카이 (Fly To The Sky)
16128654 이기찬
48197709 엠씨더맥스 (M.C the MAX)
81028747 김연우
16131081 윤미래 (T)
52802787 프리스타일 (Free Style)
16174214 유엔 (UN)
16178434 SG워너비
42299486 Boyz II Men
80594467 Boyz II Men
16174405 no-artist
49619408 no-artist
49619412 no-artist
49619413 no-artist
49619440 no-artist
49619442 no-artist
49619443 no-artist
49619449 no-artist
*** 시청한 음악 ***
15965255 no-artist
16186222 테이 (Tei)
17465599 버즈 (Buzz)
40207122 하동균
49619416 Boyz II Men
82872166 비스트 (Beast)
83360622 제아 (브라운아이드걸스)
