In [1]:
import numpy as np
import pandas as pd

In [2]:
# Sample integer vectors; vector_1 and vector_2 feed the Euclidean-distance
# walk-through in the next cell.
vector_1 = np.arange(1, 6)        # [1, 2, 3, 4, 5]
vector_2 = np.arange(5, 0, -1)    # [5, 4, 3, 2, 1]
vector_3 = np.arange(2, 7)        # [2, 3, 4, 5, 6]

In [3]:
# Euclidean distance between vector_1 and vector_2, built up one step per line.
difference = vector_1 - vector_2
squared_difference = difference ** 2

print(difference)                            # element-wise difference
print(squared_difference)                    # squared differences
print(squared_difference.sum())              # sum of the squares
print(np.sqrt(squared_difference.sum()))     # square root of that sum

[-4 -2  0  2  4]
[16  4  0  4 16]
40
6.32455532034


In [4]:
from scipy import spatial

In [5]:
# Redefine the sample vectors; vector_1 and vector_2 keep their earlier values,
# vector_3 now holds larger numbers.
vector_1 = np.array(range(1, 6))
vector_2 = np.array(range(5, 0, -1))
vector_3 = np.array((11, 19, 28, 32, 47))

In [6]:
# Inner product of vector_1 and vector_2 computed two ways:
# multiply element-wise then sum, and ndarray.dot.
print((vector_1 * vector_2).sum())
print(vector_1.dot(vector_2))

35
35


In [7]:
# User x article score table: one row per user, one column per article.
# NOTE(review): 0 appears to mean "no score yet" -- the final cell recommends
# the articles whose value is 0 for the user; confirm.
columns = ["article_%d" % number for number in range(1, 6)]
index = ["user_%d" % number for number in range(1, 5)]

data = np.array([
    [5, 3, 0, 0, 2],
    [2, 0, 0, 1, 4],
    [0, 0, 4, 3, 1],
    [4, 0, 4, 5, 0],
])

sample_df = pd.DataFrame(data=data, index=index, columns=columns)
sample_df


Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_1,5,3,0,0,2
user_2,2,0,0,1,4
user_3,0,0,4,3,1
user_4,4,0,4,5,0


In [8]:
def cosine_smimilarity(vector_1, vector_2):
    """Cosine similarity of two vectors over the positions where vector_1 is non-zero.

    Positions where ``vector_1`` is 0 are dropped from BOTH vectors before the
    similarity is computed, so the result is not symmetric:
    ``cosine_smimilarity(a, b)`` can differ from ``cosine_smimilarity(b, a)``.

    (The misspelled name is kept as-is because the notebook calls the function
    under this name.)

    Parameters
    ----------
    vector_1, vector_2 : array-like (ndarray, pandas Series, list, ...)
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine`` of the filtered vectors, or
        ``0.0`` when the similarity is undefined (``vector_1`` has no non-zero
        entry, or ``vector_2`` is all zeros at the kept positions).
    """
    # Convert first: pandas Series (which similarity_matrix passes in) has no
    # .nonzero() method in current pandas, and plain lists never had one.
    vector_1 = np.asarray(vector_1)
    vector_2 = np.asarray(vector_2)

    # Indices where vector_1 is non-zero; keep only those positions in both vectors.
    idx = vector_1.nonzero()[0]
    vector_1, vector_2 = vector_1[idx], vector_2[idx]

    # A zero-length or all-zero vector has no direction; the cosine would be
    # 0/0 (NaN plus a warning), so report "no similarity" instead.
    if idx.size == 0 or not vector_2.any():
        return 0.0

    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [16]:
def similarity_matrix(sample_df, similarity_func):
    """Build a user-by-user similarity table from a user-by-article frame.

    Every ordered pair of rows of ``sample_df`` is handed to
    ``similarity_func(row_user_values, column_user_values)`` and the result is
    stored at ``[row_user, column_user]`` of the returned frame.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        One row per user; the row labels become both the index and the
        columns of the result.
    similarity_func : callable
        Takes two pandas Series (one per user) and returns a score.

    Returns
    -------
    pandas.DataFrame
        Square frame of pairwise scores, labelled by ``sample_df.index``.
    """
    users = sample_df.index

    # After transposing, each column is one user, so .items() yields
    # (user label, that user's values) pairs.
    per_user = sample_df.T

    # Outer comprehension -> rows of the result, inner -> columns.
    scores = [
        [similarity_func(left, right) for _, right in per_user.items()]
        for _, left in per_user.items()
    ]

    return pd.DataFrame(scores, index=users, columns=users)

In [17]:
# Pairwise user similarities. The table is not symmetric because
# cosine_smimilarity only keeps the positions where its first argument is non-zero.
sm_df = similarity_matrix(sample_df=sample_df, similarity_func=cosine_smimilarity)
sm_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_1,1.0,0.652929,0.324443,0.811107
user_2,0.729397,1.0,0.483046,0.443039
user_3,0.196116,0.332956,1.0,0.949474
user_4,0.529813,0.770054,0.82121,1.0


In [19]:
user = "user_1"
closer_count = 2

# Drop the user's own row, order the remaining users by their value in the
# `user` column (highest first), and keep the top `closer_count` rows.
ms_df = sm_df.drop(user).sort_values(user, ascending=False)[:closer_count]

ms_df


Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0


In [14]:
# Rows of the original table for the users selected in ms_df.
sample_df.loc[ms_df.index, :]


Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_2,2,0,0,1,4
user_4,4,0,4,5,0


In [21]:
user = "user_1"
closer_count = 2

# Same selection as the earlier cell: remove the user's own row, rank the
# other users by the `user` column in descending order, keep the first
# `closer_count` of them.
ms_df = sm_df.drop(user).sort_values(user, ascending=False)[:closer_count]
ms_df

Unnamed: 0,user_1,user_2,user_3,user_4
user_2,0.729397,1.0,0.483046,0.443039
user_4,0.529813,0.770054,0.82121,1.0


In [22]:
# Rows of the original table for the users selected in ms_df.
sample_df.loc[ms_df.index, :]

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user_2,2,0,0,1,4
user_4,4,0,4,5,0


In [23]:
# Unweighted, column-by-column average of the selected neighbours' rows.
# DataFrame.mean() does the sum-and-divide in one step, the same way
# mean_score computes this value.
mean = sample_df.loc[ms_df.index].mean()

# Two-row comparison table: the user's own values next to the neighbour mean.
pred_df = pd.DataFrame(columns=sample_df.columns)
pred_df.loc["user"] = sample_df.loc[user]
pred_df.loc["mean"] = mean
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [24]:
def mean_score(sample_df, sm_df, target, closer_count):
    """Compare a user's own values with the mean of their most similar users.

    Parameters
    ----------
    sample_df : pandas.DataFrame
        User-by-article table (one row per user).
    sm_df : pandas.DataFrame
        User-by-user similarity table; the column labelled ``target`` is used
        to rank the other users.
    target : label
        The user to build the comparison for.
    closer_count : int
        How many of the highest-ranked other users to average.

    Returns
    -------
    pandas.DataFrame
        Two rows over ``sample_df.columns``: ``"user"`` holds the target's own
        row and ``"mean"`` the column-wise mean of the selected users' rows.
    """
    # Labels of the `closer_count` users with the highest value in the
    # `target` column, the target's own row excluded.
    neighbours = (
        sm_df.drop(target)
        .sort_values(target, ascending=False)[target][:closer_count]
        .index
    )

    # Original rows of those users.
    neighbour_rows = sample_df.loc[neighbours]

    # Assemble the result row by row on an initially empty frame.
    pred_df = pd.DataFrame(columns=sample_df.columns)
    pred_df.loc["user"] = sample_df.loc[target]
    pred_df.loc["mean"] = neighbour_rows.mean()

    return pred_df

In [25]:
target = "user_1"
closer_count = 2

# The target's own values next to the mean of the `closer_count` most similar users.
pred_df = mean_score(sample_df, sm_df, target=target, closer_count=closer_count)
pred_df

Unnamed: 0,article_1,article_2,article_3,article_4,article_5
user,5,3,0,0,2
mean,3,0,2,3,2


In [26]:
# One row per article, with "user" and "mean" as columns.
by_article = pred_df.T

# Keep the articles whose "user" value is 0 and list them with the highest
# neighbour mean first.
has_zero_user_value = by_article["user"] == 0
recommand_df = by_article[has_zero_user_value].sort_values("mean", ascending=False)

print(recommand_df.index.tolist())
recommand_df

['article_4', 'article_3']


Unnamed: 0,user,mean
article_4,0,3
article_3,0,2
