# 추천시스템 스터디 by 김정학

##### https://github.com/lumiamitie/TIL/blob/master/python/recsys_in_python_101.md

### package import

In [13]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt

### 사용할 데이터

#### shared_articles.csv  ->  사용할 칼럼은 contentId, text, article. 나중에 컨텐츠 아이디를 추천할 때 사용할거고, TF-IDF 를 통해서 상품을 데이터화 할 때 사용할 것. 
#### users_interactions.csv  -> 사용할 칼럼은 eventType, contentId, personId
#### eventType은 어떤 활동을 했는지
#### contentId는 컨텐츠 고유 넘버, personId는 사용자 고유 넘버

In [14]:
articles_df = (
  pd.read_csv('shared_articles.csv').loc[lambda d: d['eventType'] == 'CONTENT SHARED']  
)

In [15]:
articles_df.head()

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,CONTENT SHARED,-7292285110016212249,4340306774493623681,8940341205206233829,,,,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en
3,1459194474,CONTENT SHARED,-6151852268067518688,3891637997717104548,-1457532940883382585,,,,HTML,https://cloudplatform.googleblog.com/2016/03/G...,Google Data Center 360° Tour,We're excited to share the Google Data Center ...,en
4,1459194497,CONTENT SHARED,2448026894306402386,4340306774493623681,8940341205206233829,,,,HTML,https://bitcoinmagazine.com/articles/ibm-wants...,"IBM Wants to ""Evolve the Internet"" With Blockc...",The Aite Group projects the blockchain market ...,en
5,1459194522,CONTENT SHARED,-2826566343807132236,4340306774493623681,8940341205206233829,,,,HTML,http://www.coindesk.com/ieee-blockchain-oxford...,IEEE to Talk Blockchain at Cloud Computing Oxf...,One of the largest and oldest organizations fo...,en


In [16]:
interaction_df = pd.read_csv('users_interactions.csv')

In [17]:
interaction_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


### interation_df에 칼럼을 추가해보자
#### eventType에 따라서 가중치를 준다. eventStrength라는 새로운 칼럼 생성

In [18]:
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0, 
   'BOOKMARK': 2.5, 
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,  
}

interaction_df['eventStrength'] = (
  interaction_df.loc[:, 'eventType'].apply(lambda d: event_type_strength[d])
)

In [19]:
interaction_df.head(5)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry,eventStrength
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,,1.0
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US,1.0
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,,1.0
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,,3.0
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,,1.0


#### cold start를 방지하기 위해서, (cold start란? 추천 시스템에서는 사용자에 대한 정보가 부족할 때 추천을 제공하기가 어려워진다.) 
#### 컨텐츠와 상호작용 횟수가 5이상인 사용자만 다루자.

In [20]:
interaction_df_over5 = (interaction_df
  .groupby('personId', group_keys=False)
  .apply(lambda df: df.assign(interactCnt = lambda d: d['contentId'].nunique()))
  .loc[lambda d: d['interactCnt'] >= 5]
)

#### 사용자 - 컨텐츠 pair마다 groupby를 하고 로그를 씌움으로써 분포를 평활하게 한다.

In [21]:
interaction_full_df = (
  interaction_df_over5
    .groupby(['personId', 'contentId'], as_index=False)['eventStrength']
    .sum()
    .assign(eventScore = lambda d: np.log2(1+d['eventStrength']))
)

interaction_full_df.head()

Unnamed: 0,personId,contentId,eventStrength,eventScore
0,-9223121837663643404,-8949113594875411859,1.0,1.0
1,-9223121837663643404,-8377626164558006982,1.0,1.0
2,-9223121837663643404,-8208801367848627943,1.0,1.0
3,-9223121837663643404,-8187220755213888616,1.0,1.0
4,-9223121837663643404,-7423191370472335463,8.0,3.169925


### train-test로 분리
#### 특별한건stratify 부분. stratify는 personId를 기준으로 데이터를 분리한다. 
#### 예를 들면 1번사용자가 10개의 컨텐츠와 상호작용을 하고 2번사용자가 5개의 컨텐츠와 상호작용을 했으면 1번은 8:2, 2번은 4:1로 나눈다. OK?

In [22]:
interaction_train, interaction_test = train_test_split(
    interaction_full_df,
    stratify=interaction_full_df['personId'], #persionID 기준으로 나눔
    test_size=0.2,
    random_state=42
)

In [23]:
interaction_full_indexed = interaction_full_df.set_index('personId')
interaction_train_indexed = interaction_train.set_index('personId')
interaction_test_indexed = interaction_test.set_index('personId')

### 추천시스템 모델 평가

#### 정확히 알 필요 없음. 
#### 작동하는 방식은 다음 과 같다.
1.유저가 상호작용한 적이 없는 100개의 상품을 샘플링한다.

2.테스트케이스에서 상호작용한 상품을 하나 추가해서 101개의 상품리스트를 만든다.

3.추천시스템 알고리즘에 따라서 101개의 상품의 추천 순위를 매긴다.

4.사용자와 상품에 대한 top-N 정확도를 계산한다. 우리는 Top-5와 Top-10을 고려할 것.

In [24]:
def get_items_interacted(person_id, interaction_df):
    interated_items = interaction_df.loc[person_id]['contentId']
    return set(interated_items if type(interated_items) == pd.Series else [interated_items])

In [25]:
class ModelEvaluator:
    def __init__(self, n_non_interacted=100):
        self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = n_non_interacted
        
    def get_non_interacted_items_sample(self, person_id, sample_size, seed=42):
        interacted_items = get_items_interacted(person_id, interaction_full_indexed)
        all_items = set(articles_df['contentId'])
        non_interacted_items = all_items - interacted_items
        
        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)
        
    def _verify_hit_top_n(self, item_id, recommend_items, topn):
        try:
            index = next(i for i, c in enumerate(recommend_items) if c == item_id)
        except:
            index = -1
        hit = int(index in range(0, topn))
        return hit, index
    
    def evaluate_model_for_user(self, model, person_id):
        interacted_values_testset = interaction_test_indexed.loc[person_id]
        if type(interacted_values_testset['contentId']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['contentId'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['contentId'])])
        
        interacted_items_count_testset = len(person_interacted_items_testset)
        
        # 특정 사용자에 대한 추천 순위 목록을 받아온다
        person_recs = model.recommend_items(
            person_id,
            items_to_ignore=get_items_interacted(person_id, interaction_train_indexed),
            topn=10000000000
        )
        
        hits_at_5_count = 0
        hits_at_10_count = 0
        
        # test셋에서 사용자가 상호작용한 모든 항목에 대해 반복한다
        for item_id in person_interacted_items_testset:
            
            # 사용자가 상호작용하지 않은 100개 항목을 샘플링한다
            non_interacted_items_sample = self.get_non_interacted_items_sample(
                person_id,
                sample_size=self.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id % (2**32)
            )
            
            # 현재 선택한 item_id(상호작용 있었던 항목)와 100개 랜덤 샘플을 합친다
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))
            
            # 추천 결과물 중에서 현재 선택한 item_id와 100개 랜덤 샘플의 결과물로만 필터링한다
            valid_recs_df = person_recs[person_recs['contentId'].isin(items_to_filter_recs)]
            valid_recs = valid_recs_df['contentId'].values
            
            # 현재 선택한 item_id가 Top-N 추천 결과 안에 있는지 확인한다
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10
            
        # Recall 값은 상호작용 있었던 항목들 중에서 관련없는 항목들과 섞였을 때 Top-N에 오른 항목들의 비율로 나타낼 수 있다
        recall_at_5 = hits_at_5_count / interacted_items_count_testset
        recall_at_10 = hits_at_10_count / interacted_items_count_testset
        
        person_metrics = {
            'hits@5_count': hits_at_5_count,
            'hits@10_count': hits_at_10_count,
            'interacted_count': interacted_items_count_testset,
            'recall@5': recall_at_5,
            'recall@10': recall_at_10
        }
        return person_metrics
    
    def evaluate_model(self, model):
        people_metrics = []
        for idx, person_id in enumerate(list(interaction_test_indexed.index.unique().values)):
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['_person_id'] = person_id
            people_metrics.append(person_metrics)

        print('{} users processed'.format(idx))
        
        detailed_result = (
            pd.DataFrame(people_metrics)
              .sort_values('interacted_count', ascending=False)
        )
        
        global_recall_at_5 = detailed_result['hits@5_count'].sum() / detailed_result['interacted_count'].sum()
        global_recall_at_10 = detailed_result['hits@10_count'].sum() / detailed_result['interacted_count'].sum()
        
        global_metrics = {
            'model_name': model.get_model_name(),
            'recall@5': global_recall_at_5,
            'recall@10': global_recall_at_10
        }
        
        return global_metrics, detailed_result

In [26]:
model_evaluator = ModelEvaluator(n_non_interacted=100)

### Popularity Model

personal되지 않은 가장 보편적인 추천방식. 

사람들이 많이 인터렉션을 한 컨텐츠를 기준으로 사용자에게 추천,

In [27]:
item_popularity = (interaction_full_df
 .groupby('contentId')['eventStrength'].sum()
 .sort_values(ascending=False)
 .reset_index()
)

item_popularity.head(10)

Unnamed: 0,contentId,eventStrength
0,-4029704725707465084,457.5
1,-2358756719610361882,361.0
2,-6783772548752091658,357.0
3,8657408509986329668,338.0
4,-133139342397538859,336.5
5,-8208801367848627943,327.5
6,-6843047699859121724,315.0
7,2581138407738454418,310.0
8,2857117417189640073,308.5
9,-1633984990770981161,298.0


In [30]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # 인기상품 중에서 사용자가 보지 않았던 상품을 추천한다
        recommendations = (
          self.popularity_df[~self.popularity_df['contentId'].isin(items_to_ignore)]
            .sort_values('eventStrength', ascending=False)
            .head(topn)
        )
        
        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['eventStrength', 'contentId', 'title', 'url', 'lang']]
            )
            
        return recommendations

In [31]:
popularity_model = PopularityRecommender(item_popularity, articles_df)

In [32]:
print('Popularity 추천 모형을 평가합니다')
pop_global_metrics, pop_detailed_results = model_evaluator.evaluate_model(popularity_model)
print('Global Metrics:\n{}'.format(pop_global_metrics))
pop_detailed_results.head(10)

Popularity 추천 모형을 평가합니다
1139 users processed
Global Metrics:
{'model_name': 'Popularity', 'recall@5': 0.22423932498082333, 'recall@10': 0.35067757606750194}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
76,22,45,192,0.114583,0.234375,3609194402293569455
17,13,30,134,0.097015,0.223881,-2626634673110551643
16,14,32,130,0.107692,0.246154,-1032019229384696495
10,6,11,117,0.051282,0.094017,-1443636648652872475
82,24,35,88,0.272727,0.397727,-2979881261169775358
161,11,18,80,0.1375,0.225,-3596626804281480007
65,15,26,73,0.205479,0.356164,1116121227607581999
81,14,19,69,0.202899,0.275362,692689608292948411
106,12,19,69,0.173913,0.275362,-9016528795238256703
52,20,29,68,0.294118,0.426471,3636910968448833585


### Content-based Filtering Model

유저의 개인화는 아직 포함 안되있고, 대신에 상품들간의 유사도를 기반으로 추천한다. 

여기서는 TF-IDF를 사용할 것이다. 구현은 안할 거고 제공되는 패키지를 사용할 것.

https://wikidocs.net/31698

In [33]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Baek\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [21]:
# stopwords를 제거한다
stopwords_list = stopwords.words('english') + stopwords.words('portuguese')

# 벡터의 길이는 5000, unigram과 bigram을 사용하고, stopwords를 제거하도록 학습한다
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2), #unigram과 bigram의 상위 호환, 1, 2 개 다 쓴다
    min_df=0.003, # 빈도수가 너무 낮으면 버린다
    max_df=0.5, # 빈도수가 너무 많으면 버린다.
    max_features=5000, # 단어 수를 5000개로 한정한다.
    stop_words=stopwords_list
)

item_ids = articles_df['contentId'].tolist()
tfidf_matrix = vectorizer.fit_transform(articles_df['title'] + '' + articles_df['text'])
tfidf_feature_names = vectorizer.get_feature_names()

In [22]:
tfidf_matrix

<3047x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 638928 stored elements in Compressed Sparse Row format>

In [23]:
def get_item_profile(item_id):
    idx = item_ids.index(item_id)
    item_profile = tfidf_matrix[idx:idx+1]
    return item_profile

def get_item_profiles(ids):
    item_profiles_list = [get_item_profile(x) for x in ids]
    item_profiles = scipy.sparse.vstack(item_profiles_list)
    return item_profiles

def build_user_profile(person_id, interaction_indexed_df):
    interaction_person_df = interaction_indexed_df.loc[person_id]
    user_item_profiles = get_item_profiles(interaction_person_df['contentId'])
    
    user_item_strengths = np.array(interaction_person_df['eventStrength']).reshape(-1, 1)
    
    # 상호작용 강도를 바탕으로 가중치를 부여하여 평균 계산한다
    user_item_strengths_weighted_avg = \
        np.sum(user_item_profiles.multiply(user_item_strengths), axis=0) /\
        np.sum(user_item_strengths)
        
    user_profile_norm = sklearn.preprocessing.normalize(user_item_strengths_weighted_avg)
    return user_profile_norm

def build_user_profiles():
    interaction_indexed_df = (interaction_full_df
        .loc[lambda d: d['contentId'].isin(articles_df['contentId'])]
        .set_index('personId')
    )
    user_profiles = {}
    
    for person_id in interaction_indexed_df.index.unique():
        user_profiles[person_id] = build_user_profile(person_id, interaction_indexed_df)
        
    return user_profiles

In [24]:
user_profiles = build_user_profiles()
len(user_profiles)

1140

In [25]:
sorted(zip(tfidf_feature_names,user_profiles[-1479311724257856983].flatten().tolist()),key=lambda x: -x[1])[:20]

[('learning', 0.312070436108696),
 ('machine learning', 0.26922437396742877),
 ('machine', 0.2569967199883568),
 ('data', 0.18662975476681168),
 ('google', 0.17138514831291113),
 ('ai', 0.1417324892238493),
 ('graph', 0.1140257714421146),
 ('algorithms', 0.10941951457330623),
 ('like', 0.09611338386881395),
 ('language', 0.08501016167639383),
 ('models', 0.08178517693379499),
 ('search', 0.08109558706991335),
 ('algorithm', 0.07694303717423454),
 ('people', 0.07680017907622869),
 ('deep learning', 0.07661003919620854),
 ('research', 0.07352844947822106),
 ('spark', 0.07334145084389977),
 ('deep', 0.07290453811330244),
 ('company', 0.06913717983616013),
 ('model', 0.06740433138531153)]

In [26]:
class ContentBasedRecommender:
    
    MODEL_NAME = 'Content-Based'
    
    def __init__(self, item_ids, items_df=None):
        self.item_ids = item_ids
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def _get_similar_items_to_user_profile(self, person_id, topn=1000):
        # 유저 특성과 항목 특성 사이의 코사인 유사도를 구한다
        cosine_similarities = cosine_similarity(user_profiles[person_id], tfidf_matrix)

        # 가장 유사한 항목을 찾는다
        similar_indices = cosine_similarities.argsort().flatten()[-topn:]
        
        # 유사도를 기준으로 유사한 항목을 정렬한다
        similar_items = sorted(
            [(item_ids[i], cosine_similarities[0, i]) for i in similar_indices],
            key=lambda x: -x[1]
        )
        
        return similar_items
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        similar_items = self._get_similar_items_to_user_profile(user_id)
        
        # 기존에 상호작용했던 항목은 제거한다
        similar_items_filtered = list(filter(lambda x: x[0] not in items_to_ignore, similar_items))
        
        recommendations = (
            pd.DataFrame(similar_items_filtered, columns=['contentId', 'recStrength'])
              .head(topn)
        )
        
        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrength', 'contentId', 'title', 'url', 'lang']]
            )
        
        return recommendations

In [27]:
content_based_model = ContentBasedRecommender(item_ids, articles_df)

In [28]:
print('콘텐츠 기반 추천 모형을 평가합니다')
cb_global_metrics, cb_detailed_results = model_evaluator.evaluate_model(content_based_model)
print('Global Metrics:\n{}'.format(cb_global_metrics))
cb_detailed_results.head(10)

콘텐츠 기반 추천 모형을 평가합니다
1139 users processed
Global Metrics:
{'model_name': 'Content-Based', 'recall@5': 0.3834057785732549, 'recall@10': 0.4945026847353618}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
76,17,29,192,0.088542,0.151042,3609194402293569455
17,23,32,134,0.171642,0.238806,-2626634673110551643
16,21,35,130,0.161538,0.269231,-1032019229384696495
10,37,57,117,0.316239,0.487179,-1443636648652872475
82,8,15,88,0.090909,0.170455,-2979881261169775358
161,16,25,80,0.2,0.3125,-3596626804281480007
65,10,16,73,0.136986,0.219178,1116121227607581999
81,11,20,69,0.15942,0.289855,692689608292948411
106,5,14,69,0.072464,0.202899,-9016528795238256703
52,5,10,68,0.073529,0.147059,3636910968448833585


### Collaborative Filtering Model

이제 마지막으로 유저간의 특성을 이용해서, 상품을 추천합니다. 유저와 가장 유사한 유저들을 골라서 가장 좋아할만한 상품을 추천할거에요.

In [29]:
users_items_pivot_df = (interaction_train
  .pivot(index='personId', columns='contentId', values='eventStrength')
  .fillna(0)
)
users_items_pivot_df.iloc[:5, :5]

contentId,-9222795471790223670,-9216926795620865886,-9194572880052200111,-9192549002213406534,-9190737901804729417
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-9223121837663643404,0.0,0.0,0.0,0.0,0.0
-9212075797126931087,0.0,0.0,0.0,0.0,0.0
-9207251133131336884,0.0,3.0,0.0,0.0,0.0
-9199575329909162940,0.0,0.0,0.0,0.0,0.0
-9196668942822132778,0.0,0.0,0.0,0.0,0.0


In [30]:
users_items_pivot_matrix = users_items_pivot_df.as_matrix()
users_items_pivot_matrix[:10]

  """Entry point for launching an IPython kernel.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 3., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [31]:
user_ids = list(users_items_pivot_df.index)
user_ids[:10]

[-9223121837663643404,
 -9212075797126931087,
 -9207251133131336884,
 -9199575329909162940,
 -9196668942822132778,
 -9188188261933657343,
 -9172914609055320039,
 -9156344805277471150,
 -9120685872592674274,
 -9109785559521267180]

#### 사실 위와 같이 유저간의 코사인 유사도를 통해서 추천 시스템을 구현할 수 있지만, 이번에는 svd 분해를 통해서 해보도록 하겠습니다.

https://darkpgmr.tistory.com/106

In [32]:
# User-Item matrix에서 요인의 개수를 정한다
NUMBER_OF_FACTORS_MF = 15

# User-Item Matrix을 분해한다
U, sigma, Vt = svds(users_items_pivot_matrix, k=NUMBER_OF_FACTORS_MF)

In [33]:
U.shape # (1140, 15)
Vt.shape # (15, 2926)

sigma_mat = np.diag(sigma)
sigma_mat.shape # (15, 15)

(15, 15)

In [34]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma_mat), Vt)
all_user_predicted_ratings.shape

(1140, 2926)

In [35]:
cf_preds_df = (
  pd.DataFrame(all_user_predicted_ratings, 
               columns=users_items_pivot_df.columns, 
               index=user_ids)
    .transpose()
)

In [36]:
cf_preds_df.iloc[:5, :5]

Unnamed: 0_level_0,-9223121837663643404,-9212075797126931087,-9207251133131336884,-9199575329909162940,-9196668942822132778
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-9222795471790223670,-0.003597,9.7e-05,-0.025275,0.049606,-0.014388
-9216926795620865886,0.001136,8.8e-05,0.004071,-0.00069,0.001928
-9194572880052200111,0.012088,0.003933,-0.013601,-0.006128,0.035771
-9192549002213406534,0.02834,-0.003469,-0.025762,-0.006675,0.011599
-9190737901804729417,0.015525,-0.001208,0.008202,0.001825,-0.0004


In [37]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
    
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # 사용자에 대한 예측값을 가져와서 정렬한다
        sorted_user_prediction = (self.cf_predictions_df
            .loc[:, user_id]
            .sort_values(ascending=False)
            .reset_index()
            .rename(columns={user_id: 'recStrength'})
        )
        
        recommendations = (sorted_user_prediction
            .loc[lambda d: ~d['contentId'].isin(items_to_ignore)]
            .sort_values('recStrength', ascending=False)
            .head(topn)
        )
        
        if verbose:
            if self.item_df is None:
                raise Exception('"items_df" is required in verbose mode')
            
            recommendations = (recommendations
                .merge(self.items_df, how='left', left_on='contentId', right_on='contentId')
                .loc[:, ['recStrength', 'contentId', 'title', 'url', 'lang']]
            )
            
        return recommendations

In [38]:
cf_recommender_model = CFRecommender(cf_preds_df, articles_df)

In [39]:
print('협업 필터링(SVD 행렬분해) 모형을 평가합니다')
cf_global_metrics, cf_detailed_results = model_evaluator.evaluate_model(cf_recommender_model)
print('Global Metrics:\n{}'.format(cf_global_metrics))
cf_detailed_results.head(10)

협업 필터링(SVD 행렬분해) 모형을 평가합니다
1139 users processed
Global Metrics:
{'model_name': 'Collaborative Filtering', 'recall@5': 0.2689849143441575, 'recall@10': 0.39682945538225517}


Unnamed: 0,hits@5_count,hits@10_count,interacted_count,recall@5,recall@10,_person_id
76,22,42,192,0.114583,0.21875,3609194402293569455
17,22,35,134,0.164179,0.261194,-2626634673110551643
16,26,36,130,0.2,0.276923,-1032019229384696495
10,33,48,117,0.282051,0.410256,-1443636648652872475
82,36,47,88,0.409091,0.534091,-2979881261169775358
161,17,28,80,0.2125,0.35,-3596626804281480007
65,15,25,73,0.205479,0.342466,1116121227607581999
81,14,21,69,0.202899,0.304348,692689608292948411
106,16,24,69,0.231884,0.347826,-9016528795238256703
52,17,29,68,0.25,0.426471,3636910968448833585


# 끝