In [38]:
# Each inner list holds the interests of one user; a user's position in the
# outer list is the index used to refer to that user in the cells below.
users_interests = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["Hadoop", "Big Data", "HBase", "Java"],
    ["Hadoop", "Big Data", "HBase", "Java", "Postgres", "pandas", "MySQL"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [39]:
# Every distinct interest across all users, in sorted order (plain string
# comparison, so capitalized names sort before lowercase ones).
unique_interests = sorted(set().union(*users_interests))
unique_interests

['Big Data',
 'C++',
 'Cassandra',
 'HBase',
 'Hadoop',
 'Haskell',
 'Java',
 'Mahout',
 'MapReduce',
 'MongoDB',
 'MySQL',
 'NoSQL',
 'Postgres',
 'Python',
 'R',
 'Spark',
 'Storm',
 'artificial intelligence',
 'databases',
 'decision trees',
 'deep learning',
 'libsvm',
 'machine learning',
 'mathematics',
 'neural networks',
 'numpy',
 'pandas',
 'probability',
 'programming languages',
 'regression',
 'scikit-learn',
 'scipy',
 'statistics',
 'statsmodels',
 'support vector machines',
 'theory']

In [40]:
def make_user_interest_vector(user_interests):
    """Build a 0/1 indicator vector aligned with `unique_interests`.

    The i-th element is 1 if unique_interests[i] is present in
    `user_interests`, and 0 otherwise.
    """
    return [int(known_interest in user_interests)
            for known_interest in unique_interests]

# Matrix of user interests: one indicator vector (row) per user.
user_interest_matrix = [make_user_interest_vector(one_user_interests)
                        for one_user_interests in users_interests]

In [41]:
# Display the interest indicator vector built for the first user (index 0).
user_interest_matrix[0]

[1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [42]:
from numpy import dot
from math import sqrt

def cosine_similarity(v, w):
    """Return the cosine of the angle between vectors v and w.

    Computed as dot(v, w) / sqrt(dot(v, v) * dot(w, w)).

    Args:
        v: numeric sequence.
        w: numeric sequence of the same length as v.

    Returns:
        The cosine similarity. When either vector is all zeros the measure
        is undefined; 0.0 is returned in that case instead of the nan that
        dividing by a zero norm would produce.
    """
    norm_product = sqrt(dot(v, v) * dot(w, w))
    if norm_product == 0:
        # At least one vector has zero length (e.g. a user with no
        # interests), so treat the pair as having no similarity.
        return 0.0
    return dot(v, w) / norm_product


cosine_similarity([0, 1, 0], [1, 1, 0])

0.7071067811865475

In [43]:
# Pairwise similarity matrix: user_similarities[i][j] is the cosine
# similarity between the interest vectors of user i and user j.
user_similarities = [
    [cosine_similarity(row_vector, column_vector)
     for column_vector in user_interest_matrix]
    for row_vector in user_interest_matrix
]
user_similarities[0][9]

0.0

In [44]:
# Recompute the similarity between users 0 and 9 directly from their interest
# vectors; the displayed value can be compared with user_similarities[0][9].
_user_similarities = cosine_similarity(user_interest_matrix[0], user_interest_matrix[9])
_user_similarities

0.0

In [45]:
def most_similar_users_to(user_id):
    """Rank the other users by their similarity to `user_id`.

    Reads row `user_id` of the module-level `user_similarities` matrix,
    skips the user themself and anyone whose similarity is not positive,
    and returns (other_user_id, similarity) pairs ordered from most to
    least similar.
    """
    ranked = []
    for candidate_id, score in enumerate(user_similarities[user_id]):
        # Guard clause: ignore the user themself and non-positive scores.
        if candidate_id == user_id or not score > 0:
            continue
        ranked.append((candidate_id, score))

    # Stable sort, highest similarity first.
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    return ranked

In [46]:
# Display the ranked (user_id, similarity) pairs for user 0.
most_similar_users_to(0)

[(1, 1.0),
 (2, 0.7559289460184544),
 (3, 0.5714285714285714),
 (12, 0.5669467095138409),
 (4, 0.3380617018914066),
 (11, 0.1889822365046136),
 (16, 0.1690308509457033),
 (8, 0.1543033499620919)]