In [1]:
import random
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
genders = ['남성', '여성']
pets = ['강아지', '고양이', '없음']
smokes = [True, False]
personalities = ['외향적', '내향적', '낙관적', '비관적', '사교적', '내성적', '활동적', '수동적',
                '유머 감각 있음', '진지한', '지적', '감성적', '예술적', '실용적', '책임감 있음',
                '자유로운', '독립적', '의존적인']

def generate_person():
  return {
      '성별': random.choice(genders),
      '나이': random.randint(20, 60),
      '성격': sorted(random.sample(personalities, random.randint(1, len(personalities)))),
      '애완동물': random.choice(pets),
      '흡연여부': random.choice(smokes)
  }

In [24]:
def calculate_weighted_cosine_similarity(data1, data2, weights=[]):
  """
  각 벡터의 요소에 가중치를 계산한 후에, 코사인 유사도를 구한다.
  만약, 가중치 값이 주어지지 않는다면 모든 값을 1로 설정하고,
  길이가 다르다면 에러를 던진다.
  """

  if len(weights) == 0:
    weights = np.ones_like(data1)
  elif len(weights) != len(data1):
    raise Exception("가중치와 데이터의 크기가 일치하지 않습니다.")

  weighted_data1 = data1.multiply(weights, axis=1)
  weighted_data2 = data2.multiply(weights, axis=1)

  weighted_data1_magnitude1 = np.sqrt((weighted_data1 ** 2).sum(axis=1))
  weighted_data2_magnitude2 = np.sqrt((weighted_data2 ** 2).sum(axis=1))

  result = (weighted_data1.dot(weighted_data2.T) / (weighted_data1_magnitude1.values.reshape(-1, 1) * weighted_data2_magnitude2.values.reshape(1, -1)))
  return result.to_numpy()

In [25]:
user_data = [generate_person() for _ in range(30)]
user_post_data = [generate_person() for _ in range(30)]

In [26]:
columns = ['애완동물', '흡연여부', '성격']

user_data_df = pd.DataFrame(user_data, columns=columns)
user_post_data_df = pd.DataFrame(user_post_data, columns=columns)

for personality in personalities:
  mask = user_data_df['성격'].apply(lambda p: personality in p).astype(int)
  user_data_df[personality] = mask

  mask = user_post_data_df['성격'].apply(lambda p: personality in p).astype(int)
  user_post_data_df[personality] = mask

user_data_df.drop('성격', axis=1, inplace=True)
user_post_data_df.drop('성격', axis=1, inplace=True)

user_data_df['흡연여부'] = user_data_df['흡연여부'].astype(int)
user_post_data_df['흡연여부'] = user_post_data_df['흡연여부'].astype(int)

for pet in pets:
  mask = user_data_df['애완동물'].apply(lambda p: p == pet).astype(int)
  user_data_df[pet] = mask

  mask = user_post_data_df['애완동물'].apply(lambda p: p == pet).astype(int)
  user_post_data_df[pet] = mask

user_data_df.drop('애완동물', axis=1, inplace=True)
user_post_data_df.drop('애완동물', axis=1, inplace=True)

result = cosine_similarity(user_data_df, user_post_data_df)
result

array([[0.61237244, 0.91925472, 0.88388348, 0.40824829, 0.63960215,
        0.74535599, 0.97332853, 0.78446454, 0.13608276, 0.69293487,
        0.82495791, 0.27216553, 0.13608276, 0.31622777, 0.62853936,
        0.75592895, 0.84983659, 0.27216553, 0.53452248, 0.31622777,
        0.85201287, 0.23570226, 0.76603235, 0.88388348, 0.80032673,
        0.73029674, 0.82495791, 0.75592895, 0.54997194, 0.5       ],
       [0.30618622, 0.64888568, 0.53033009, 0.        , 0.85280287,
        0.2236068 , 0.56777497, 0.39223227, 0.20412415, 0.56694671,
        0.53033009, 0.20412415, 0.40824829, 0.31622777, 0.35355339,
        0.66143783, 0.49029034, 0.        , 0.26726124, 0.15811388,
        0.54772256, 0.1767767 , 0.61871843, 0.53033009, 0.60024505,
        0.54772256, 0.53033009, 0.37796447, 0.58925565, 0.5       ],
       [0.51639778, 0.30779351, 0.4472136 , 0.        , 0.40451992,
        0.14142136, 0.30779351, 0.12403473, 0.51639778, 0.47809144,
        0.55901699, 0.        , 0.25819889, 0.

In [27]:
def must_similar_age_range(data1, data2):
  return abs(data1['나이'] - data2['나이']) <= 10


def must_same_gender(data1, data2):
  return data1['성별'] == data2['성별']


def must_smoker(data1, data2):
  return data1['흡연여부'] == data2['흡연여부']


def must_same_pet(data1, data2):
  """애완 동물을 안키운다면, 안키우는 사람끼리, 키운다면 키우는 사람끼리"""
  if data1['애완동물'] == '없음' and data2['애완동물'] == '없음':
    return True
  if data1['애완동물'] != '없음' and data2['애완동물'] != '없음':
    return True
  return False


conditions = [
    must_same_gender,
    must_smoker,
    must_same_pet,
    must_similar_age_range
]


# 사용자 0번에게 추천되는 글 목록들
post_for_user0 = sorted(enumerate(result[0]), key=lambda x: x[1], reverse=True)

print('######### USER')
print(user_data[0])
for index, score in post_for_user0:
  for condition in conditions:
    if not condition(user_data[0], user_post_data[index]):
      break
  else:
    print(f'{score:.5f},', user_post_data[index])

# 글 0번에게 추천되는 사용자 목록들
user_for_post0 = sorted(enumerate(result[:, 0]), key=lambda x: x[1], reverse=True)

print()
print('######### POST')
print(user_post_data[0])
for index, score in user_for_post0:
  for condition in conditions:
    if not condition(user_post_data[0], user_data[index]):
      break
  else:
    print(f'{score:.5f},', user_data[index])


######### USER
{'성별': '여성', '나이': 31, '성격': ['감성적', '낙관적', '내성적', '내향적', '비관적', '사교적', '수동적', '실용적', '예술적', '외향적', '유머 감각 있음', '의존적인', '자유로운', '지적', '진지한', '책임감 있음', '활동적'], '애완동물': '강아지', '흡연여부': False}
0.74536, {'성별': '여성', '나이': 26, '성격': ['낙관적', '내향적', '사교적', '실용적', '외향적', '유머 감각 있음', '의존적인', '자유로운', '활동적'], '애완동물': '강아지', '흡연여부': False}

######### POST
{'성별': '남성', '나이': 22, '성격': ['감성적', '내성적', '내향적', '독립적', '수동적', '예술적', '유머 감각 있음', '자유로운', '지적', '활동적'], '애완동물': '고양이', '흡연여부': True}


In [28]:
# 만약, 데이터베이스에서 없는 사람이 등장하면?

new_user = generate_person()
new_user_df = pd.DataFrame([new_user], columns=columns)

for personality in personalities:
  mask = new_user_df['성격'].apply(lambda p: personality in p).astype(int)
  new_user_df[personality] = mask

for pet in pets:
  mask = new_user_df['애완동물'].apply(lambda p: p == pet).astype(int)
  new_user_df[pet] = mask

new_user_df.drop('성격', axis=1, inplace=True)
new_user_df['흡연여부'] = new_user_df['흡연여부'].astype(int)

new_user_df.drop('애완동물', axis=1, inplace=True)

similar_users = cosine_similarity(new_user_df, user_pd_data)[0]
similar_users = sorted(enumerate(similar_users), key=lambda x: x[1], reverse=True)
similar_users = list(filter(lambda x: user_data[x[0]]['성별'] == new_user['성별'], similar_users))
similar_user = similar_users[0] if len(similar_users) else None

# 가장 비슷한 사람을 찾아주고 그 사람으로부터 추천 결과를 보여줌.
print(f'{new_user=}', f'{user_data[similar_user[0]]=}', sep='\n')
post_for_new_user = sorted(enumerate(result[similar_user[0]]), key=lambda x: x[1], reverse=True)
for index, score in post_for_new_user:
  for condition in conditions:
    if not condition(user_post_data[0], user_data[index]):
      break
  else:
    print(f'{score:.5f},', user_data[index])

new_user={'성별': '여성', '나이': 23, '성격': ['감성적', '낙관적', '내성적', '내향적', '독립적', '비관적', '사교적', '수동적', '실용적', '외향적', '유머 감각 있음', '의존적인', '자유로운', '지적', '진지한', '책임감 있음', '활동적'], '애완동물': '강아지', '흡연여부': False}
user_data[similar_user[0]]={'성별': '여성', '나이': 34, '성격': ['낙관적', '내향적', '독립적', '사교적', '예술적', '외향적'], '애완동물': '없음', '흡연여부': True}
