# 추천 알고리즘 (Recommendation Algorithm)

In [22]:
from numpy import dot
from math import sqrt
from collections import defaultdict, Counter

In [23]:
# Sample data set: each inner list holds the interests declared by one user.
# A user's id is simply the index of that user's list within users_interests.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["Hadoop", "Big Data", "HBase", "Java"],
    ["Hadoop", "Big Data", "HBase", "Java", "Postgres", "pandas", "MySQL"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

## 1. 인기도를 활용한 추천

In [25]:
# Count how many users list each interest, then rank the interests from
# most to least frequent as (interest, count) pairs.
interest_counts = Counter(
    interest
    for one_user_interests in users_interests
    for interest in one_user_interests
)
popular_interests = interest_counts.most_common()

In [26]:
def most_popular_new_interests(user_interests, max_results=5):
    """Suggest globally popular interests that the user has not listed yet.

    Args:
        user_interests: Collection of interests the user already has.
        max_results: Upper bound on the number of suggestions returned.

    Returns:
        A list of ``(interest, frequency)`` pairs taken from
        ``popular_interests`` in its existing order, skipping every interest
        already present in ``user_interests`` and cut to ``max_results``.
    """
    unseen = []
    for interest, frequency in popular_interests:
        if interest in user_interests:
            continue  # the user already has this one
        unseen.append((interest, frequency))
    return unseen[:max_results]

In [27]:
if __name__ == "__main__":
    # Demo: show the global popularity ranking, then the popularity-based
    # suggestions for two example users.
    print("Popular Interests")
    print(popular_interests)
    print()

    print("Most Popular New Interests")
    for known_interests in (
        ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
        ["R", "Python", "statistics", "regression", "probability"],
    ):
        print("already like:", known_interests)
        print(most_popular_new_interests(known_interests))
        print()

Popular Interests
[('Big Data', 6), ('HBase', 6), ('Java', 6), ('Hadoop', 5), ('Python', 4), ('R', 4), ('Cassandra', 3), ('Postgres', 3), ('pandas', 3), ('statistics', 3), ('regression', 3), ('probability', 3), ('Spark', 2), ('Storm', 2), ('MySQL', 2), ('MongoDB', 2), ('scikit-learn', 2), ('statsmodels', 2), ('machine learning', 2), ('libsvm', 2), ('C++', 2), ('neural networks', 2), ('deep learning', 2), ('artificial intelligence', 2), ('NoSQL', 1), ('scipy', 1), ('numpy', 1), ('decision trees', 1), ('Haskell', 1), ('programming languages', 1), ('mathematics', 1), ('theory', 1), ('Mahout', 1), ('MapReduce', 1), ('databases', 1), ('support vector machines', 1)]

Most Popular New Interests
already like: ['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
[('Big Data', 6), ('Java', 6), ('Hadoop', 5), ('Python', 4), ('R', 4)]

already like: ['R', 'Python', 'statistics', 'regression', 'probability']
[('Big Data', 6), ('HBase', 6), ('Java', 6), ('Hadoop', 5), ('Cassandra', 3)]



## 2. 사용자 기반 협업 필터링 (user-based filtering)

특정 사용자 A와 유사한 다른 사용자 B를 찾은 후, B의 관심사를 추천

In [5]:
# Alphabetically sorted list of every distinct interest across all users.
# The position of an interest in this list serves as its interest id.
unique_interests = sorted(set().union(*users_interests))

In [6]:
def make_user_interest_vector(user_interests):
    """Encode a user's interests as a 0/1 vector over ``unique_interests``.

    Element ``i`` of the result is 1 when ``unique_interests[i]`` is present
    in ``user_interests`` and 0 otherwise.
    """
    return [int(interest in user_interests) for interest in unique_interests]

# One interest vector per user: row = user id, column = interest id.
user_interest_matrix = [make_user_interest_vector(interests)
                        for interests in users_interests]

In [7]:
def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors ``v`` and ``w``.

    Args:
        v: Sequence of numbers.
        w: Sequence of numbers with the same length as ``v``.

    Returns:
        ``dot(v, w) / sqrt(dot(v, v) * dot(w, w))``. When either vector has
        zero magnitude the angle is undefined; 0.0 is returned in that case
        so callers never receive nan from a division by zero.
    """
    squared_norm_product = dot(v, v) * dot(w, w)
    if squared_norm_product == 0:
        # At least one vector is all zeros: treat it as "no similarity".
        return 0.0
    return dot(v, w) / sqrt(squared_norm_product)

cosine similarity는 두 벡터 v, w 사이의 각도를 잰다. 완전히 같은 방향이면 1, 완전히 반대 방향이면 -1이고, 두 벡터가 직교하면(예: v가 0이 아닌 모든 위치에서 w가 0이고 그 반대도 성립하면) dot(v, w)가 0이 되어 유사도는 0이 된다.

In [8]:
# Pairwise user similarity matrix: user_similarities[i][j] is the cosine
# similarity between the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row, other_row) for other_row in user_interest_matrix]
    for row in user_interest_matrix
]

In [9]:
def most_similar_users_to(user_id):
    """Rank the other users by how similar they are to ``user_id``.

    Returns:
        A list of ``(other_user_id, similarity)`` pairs sorted by descending
        similarity. The user itself and users whose similarity is not
        greater than zero are left out.
    """
    candidates = []
    for other_user_id, similarity in enumerate(user_similarities[user_id]):
        # Skip the user itself and anyone with no positive similarity.
        if other_user_id == user_id or not similarity > 0:
            continue
        candidates.append((other_user_id, similarity))

    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates

각각의 관심사에 대해 해당 관심사에 관심이 있는 다른 사용자와의 유사도를 모두 더해 새로운 관심사를 추천한다

In [10]:
def user_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests for ``user_id`` based on what similar users like.

    Every interest of every similar user is scored with the sum of the
    similarities of the users who have it.

    Args:
        user_id: Index of the user in ``users_interests``.
        include_current_interests: When true, interests the user already has
            are kept in the result; otherwise they are filtered out.

    Returns:
        A list of ``(interest, score)`` pairs sorted by descending score.
    """
    scores = defaultdict(float)
    for neighbour_id, similarity in most_similar_users_to(user_id):
        for interest in users_interests[neighbour_id]:
            scores[interest] += similarity

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    if include_current_interests:
        return ranked

    # Drop whatever the user already lists as an interest.
    already_known = users_interests[user_id]
    return [(interest, score)
            for interest, score in ranked
            if interest not in already_known]

In [11]:
if __name__ == "__main__":
    # Demo of user-based collaborative filtering for user 0.
    print("User based similarity")
    print("most similar to 0")
    nearest_users = most_similar_users_to(0)
    print(nearest_users)

    print("Suggestions for 0")
    recommendations = user_based_suggestions(0)
    print(recommendations, end="\n\n")

User based similarity
most similar to 0
[(1, 1.0), (2, 0.7559289460184544), (3, 0.5714285714285714), (12, 0.5669467095138409), (4, 0.3380617018914066), (11, 0.1889822365046136), (16, 0.1690308509457033), (8, 0.1543033499620919)]
Suggestions for 0
[('Postgres', 1.0785211242656814), ('MySQL', 0.7404594223742746), ('pandas', 0.5714285714285714), ('MapReduce', 0.5669467095138409), ('MongoDB', 0.50709255283711), ('NoSQL', 0.3380617018914066), ('neural networks', 0.1889822365046136), ('deep learning', 0.1889822365046136), ('artificial intelligence', 0.1889822365046136), ('databases', 0.1690308509457033), ('Python', 0.1543033499620919), ('R', 0.1543033499620919), ('C++', 0.1543033499620919), ('Haskell', 0.1543033499620919), ('programming languages', 0.1543033499620919)]



차원이 아주 커지면, 대부분의 벡터는 서로 상당히 다른 방향을 가리키게 된다.<br/>
즉, 관심사의 수가 아주 많아지면 특정 사용자와 '가장 유사한' 사용자조차 실제로는 거의 유사하지 않을 가능성이 있다.

## 3. 상품 기반 협업 필터링 (Item-Based Collaborative Filtering)

사용자의 현재 관심사와 가장 유사한 관심사들을 직접 추천

In [12]:
# Transpose of user_interest_matrix: row = interest id, column = user id.
# interest_user_matrix[j][i] is 1 when user i has interest j, else 0.
interest_user_matrix = [
    [row[interest_id] for row in user_interest_matrix]
    for interest_id in range(len(unique_interests))
]

In [13]:
# Pairwise cosine similarity between interests, computed on their user vectors.
# The value is 1 when exactly the same set of users has both interests and
# 0 when no user has both.
interest_similarities = [
    [cosine_similarity(users_of_i, users_of_j)
     for users_of_j in interest_user_matrix]
    for users_of_i in interest_user_matrix
]

In [14]:
def most_similar_interests_to(interest_id):
    """Rank the other interests by their similarity to ``interest_id``.

    Returns:
        A list of ``(interest_name, similarity)`` pairs sorted by descending
        similarity. The interest itself and interests whose similarity is
        not greater than zero are left out.
    """
    ranked = []
    for other_interest_id, similarity in enumerate(
            interest_similarities[interest_id]):
        # Skip the interest itself and anything with no positive similarity.
        if other_interest_id == interest_id or not similarity > 0:
            continue
        ranked.append((unique_interests[other_interest_id], similarity))

    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

In [15]:
# Interest id 0 is 'Big Data', the first entry of the sorted unique_interests.
print(most_similar_interests_to(0))

[('Hadoop', 0.9128709291752769), ('Java', 0.8333333333333334), ('HBase', 0.6666666666666666), ('Spark', 0.5773502691896258), ('Storm', 0.5773502691896258), ('Cassandra', 0.47140452079103173), ('MapReduce', 0.4082482904638631), ('MySQL', 0.2886751345948129), ('artificial intelligence', 0.2886751345948129), ('deep learning', 0.2886751345948129), ('neural networks', 0.2886751345948129), ('Postgres', 0.23570226039551587), ('pandas', 0.23570226039551587)]


In [17]:
def item_based_suggestions(user_id, include_current_interests=False):
    """Suggest interests similar to the ones ``user_id`` already has.

    For each interest the user has, the similarities of its related
    interests are added up; the totals form the suggestion scores.

    Args:
        user_id: Index of the user in ``user_interest_matrix``.
        include_current_interests: When true, interests the user already has
            are kept in the result; otherwise they are filtered out.

    Returns:
        A list of ``(interest, score)`` pairs sorted by descending score.
    """
    scores = defaultdict(float)
    for interest_id, flag in enumerate(user_interest_matrix[user_id]):
        if flag != 1:
            continue  # the user does not have this interest
        for interest, similarity in most_similar_interests_to(interest_id):
            scores[interest] += similarity

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    if include_current_interests:
        return ranked

    # Drop whatever the user already lists as an interest.
    already_known = users_interests[user_id]
    return [(interest, score)
            for interest, score in ranked
            if interest not in already_known]

In [18]:
if __name__ == "__main__":
    # Demo of item-based collaborative filtering: interests related to
    # interest id 0, then the suggestions for user 0.
    print("Item based similarity")
    print("most similar to 'Big Data'")
    related_interests = most_similar_interests_to(0)
    print(related_interests, end="\n\n")

    print("suggestions for user 0")
    recommendations = item_based_suggestions(0)
    print(recommendations)

Item based similarity
most similar to 'Big Data'
[('Hadoop', 0.9128709291752769), ('Java', 0.8333333333333334), ('HBase', 0.6666666666666666), ('Spark', 0.5773502691896258), ('Storm', 0.5773502691896258), ('Cassandra', 0.47140452079103173), ('MapReduce', 0.4082482904638631), ('MySQL', 0.2886751345948129), ('artificial intelligence', 0.2886751345948129), ('deep learning', 0.2886751345948129), ('neural networks', 0.2886751345948129), ('Postgres', 0.23570226039551587), ('pandas', 0.23570226039551587)]

suggestions for user 0
[('Postgres', 1.7700435250580735), ('MySQL', 1.4709283043960897), ('MapReduce', 1.2637101764276841), ('NoSQL', 0.9855985596534889), ('MongoDB', 0.9855985596534889), ('pandas', 0.9653056709337087), ('databases', 0.4082482904638631), ('Haskell', 0.4082482904638631), ('programming languages', 0.4082482904638631), ('artificial intelligence', 0.2886751345948129), ('deep learning', 0.2886751345948129), ('neural networks', 0.2886751345948129), ('C++', 0.2886751345948129), ('