In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import zipfile
from PIL import Image
from sklearn.metrics import pairwise_distances
from sklearn.metrics import jaccard_score
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from torchvision import models, transforms
import torch
import torchvision.models as models
import torch.nn as nn
import warnings
from sklearn.metrics.pairwise import cosine_similarity
from io import BytesIO
from tqdm import tqdm
import itertools
import random


pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)
pd.set_option('display.max_columns', 25)

# **미션 3**

## **미션 3-1**
- 2-1에서 구한 유효한 라벨링 데이터만 따로 분리하여 100명 응답자의 “스타일 선호 정보표”를 구한다.

협업 필터링이란 기존에 가지고 있는 데이터를 평가하여 새로운 데이터에 대해 유사한 user, item을 기반으로 추천을 해주는 방법이다. 이때 사용자 기반 협업 필터링은 새로운 평가를 내리려는 사용자와 유사한 사용자의 아이템 평가를 통해서 추천하는 방법이다. 사용자-아이템 상호작용 행렬이 있다고 할 때 row를 기준으로 평가를 진행한다. 아이템 기반 협업 필터링은 새로운 평가를 내리려는 아이템과 유사한 아이템의 사용자 평가를 통해서 추천하는 방법이다.

### **1) 스타일 선호 정보표를 이용한 유틸리티 행렬 계산**

In [3]:
# 스타일 선호 정보표 불러오기
df = pd.read_csv('/content/drive/MyDrive/DCC/DCC_스타일_선호_정보표.csv')
df.head()

Unnamed: 0,id,스타일 선호_x,스타일 비선호_x,스타일 선호_y,스타일 비선호_y
0,64747.0,W_03194_50_classic_W.jpg,W_02247_50_classic_W.jpg,W_20598_70_military_W.jpg,W_02498_50_feminine_W.jpg
1,64747.0,W_30434_60_minimal_W.jpg,W_02498_50_feminine_W.jpg,W_37491_70_military_W.jpg,W_14102_50_feminine_W.jpg
2,64747.0,W_30454_60_minimal_W.jpg,W_13904_50_feminine_W.jpg,W_22510_80_powersuit_W.jpg,W_27828_60_minimal_W.jpg
3,64747.0,W_35674_60_minimal_W.jpg,W_14102_50_feminine_W.jpg,W_46907_80_powersuit_W.jpg,W_47169_70_hippie_W.jpg
4,64747.0,W_20598_70_military_W.jpg,W_18951_50_feminine_W.jpg,W_30988_90_kitsch_W.jpg,W_11610_90_grunge_W.jpg


In [4]:
def imageID(x):
    if pd.isna(x): return x
    return x[2:7]

df_t = df.copy()
df_t[['스타일 선호_x', '스타일 비선호_x', '스타일 선호_y', '스타일 비선호_y']] = df_t[['스타일 선호_x', '스타일 비선호_x', '스타일 선호_y', '스타일 비선호_y']].applymap(imageID)

In [5]:
train = df_t[['id','스타일 선호_x','스타일 비선호_x']]
valid = df_t[['id','스타일 선호_y','스타일 비선호_y']]

print(f'훈련 데이터에 존재하는 고유한 이미지 종류 : {train["스타일 선호_x"].nunique()}')
print(f'훈련 데이터에 존재하는 고유한 응답자 수 : {train.id.nunique()}')

train['스타일 선호_x'] = train['스타일 선호_x'].fillna(0).astype(str)
train['스타일 비선호_x'] = train['스타일 비선호_x'].fillna(0).astype(str)

#**선호 응답 데이터 이진화**#
pref_train = train[['id', '스타일 선호_x']].rename(columns={'스타일 선호_x': 'item'})
pref_train['prefer'] = 1

not_pref_train = train[['id', '스타일 비선호_x']].rename(columns={'스타일 비선호_x': 'item'})
not_pref_train['prefer'] = 0

훈련 데이터에 존재하는 고유한 이미지 종류 : 1730
훈련 데이터에 존재하는 고유한 응답자 수 : 100


In [6]:
# 선호_x에는 1을 배정
pref_train = train[['id', '스타일 선호_x']].rename(columns={'스타일 선호_x': 'item'})
pref_train['prefer'] = 1

# 선호_x에는 0을 배정
not_pref_train = train[['id', '스타일 비선호_x']].rename(columns={'스타일 비선호_x': 'item'})
not_pref_train['prefer'] = 0

# 각 user, item별로 선호, 비선호 결합
format_data = pd.concat([pref_train, not_pref_train]).reset_index(drop=True)

# 유틸리티 행렬을 계산하고 결측치는 0으로 대체
ut_matrix = format_data.pivot_table(index='id', columns='item', values='prefer', fill_value = 0)
ut_matrix_fill_zero = ut_matrix.drop(columns='0')
ut_matrix_fill_zero

item,00004,00007,00017,00023,00026,00027,00028,00031,00032,00034,00036,00038,...,67909,67958,68175,68199,71920,71921,71922,71923,71933,71934,71935,71936
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
368.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
837.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7658.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7905.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9096.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66469.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66513.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66592.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
66731.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **2) Jaccard 유사도 행렬 계산**

In [7]:
# 자카드 유사도 계산
user_sim_ = 1 - pairwise_distances(np.array((ut_matrix_fill_zero)), metric='jaccard')
user_sim = pd.DataFrame(user_sim_, index=ut_matrix_fill_zero.index, columns=ut_matrix_fill_zero.index)

item_sim_ = 1 - pairwise_distances(np.array((ut_matrix_fill_zero.T)), metric='jaccard')
item_sim = pd.DataFrame(item_sim_, index = ut_matrix_fill_zero.columns, columns = ut_matrix_fill_zero.columns)

# 아이템간의 자카드 유사도 행렬
print(user_sim.shape) # 100 X 100
print(item_sim.shape) # 4066 X 4066



(100, 100)
(4066, 4066)


### **3) 단일 사용자-아이템 쌍에 대한 협업 필터링 수행**

In [8]:
def predict_user_preference(user_id, item_id, k=3, th=0.5):
  '''
  사용자의 자카드 유사도 행렬에서 입력한 user_id와 유사한 k개의 user_id 선택
  선택된 user_id를 이용해서 입력한 item_id의 선호도 평점 예측
  지정한 threshold보다 높으면 선호, 낮으면 비선호로 예측한다.

  user_id : 사용자 식별 id
  item_id : 아이템 식별 id
  k : 유사도 선택 개수
  th : 선호도 판단 기준

  ut_matrix_fill_zero : 유틸리티 행렬
  user_sim : 사용자 유사도 행렬
  similar_users : 유사도가 가장 높은 k명의 유저
  '''
  us = user_sim[user_id]
  similar_users = us.drop(index=user_id).nlargest(k).index # 자기 자신과의 유사도는 제외
  pref = ut_matrix_fill_zero.loc[similar_users, str(item_id)]
  avg_pref = pref.mean() # 유사도가 가장 높은 k user의 아이템 선호도의 평균

  prediction = 1 if avg_pref >= th else 0

  # 추천의 근거
  print(f'사용자 기반 추천의 근거 : 유사도가 높은 {similar_users.values}의 응답이 반영')

  return prediction

def predict_item_preference(user_id, item_id, k=3, th=0.5):
  '''
  아이템의 자카드 유사도 행렬에서 입력한 item_id와 유저가 평가한 유사한 k개의 item_id 선택
  선택된 kr개의 item_id에 대한 선호도 평가를 이용해서 입력한 item_id의 선호도 평점 예측
  지정한 threshold보다 높으면 선호, 낮으면 비선호로 예측한다.

  user_id : 사용자 식별 id
  item_id : 아이템 식별 id
  k : 유사도 선택 개수
  th : 선호도 판단 기준

  ut_matrix_fill_zero : 유틸리티 행렬
  item_sim : 아이템 유사도 행렬
  similar_items : 유사도가 가장 높은 k명의 아이템
  '''
  item_simil = item_sim[item_id]
  similar_items = item_simil.drop(index=item_id).nlargest(k).index
  pref = ut_matrix_fill_zero.loc[user_id, similar_items]
  avg_pref = pref.mean()

  prediction = 1 if avg_pref >= th else 0

  # 추천의 근거
  print(f'아이템 기반 추천의 근거 : 유사도가 높은 {similar_items.values}에 대한 선호도가 반영')

  return prediction

In [9]:
## user_id가 837인 사용자의 item_id : 68175 선호도 예측
user_preds = predict_user_preference(user_id=837, item_id='68175', k=5, th=0.5)
print(f'사용자 기반 협업 필터링 결과 : {user_preds}')

item_preds = predict_item_preference(user_id=837, item_id='68175', k=3, th=0.5)
print(f'아이템 기반 협업 필터링 결과 : {item_preds}')

사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349. 63748. 64345.]의 응답이 반영
사용자 기반 협업 필터링 결과 : 0
아이템 기반 추천의 근거 : 유사도가 높은 ['01933' '08913' '13018']에 대한 선호도가 반영
아이템 기반 협업 필터링 결과 : 0


이처럼 사용자 기반 협업 필터링은 추천의 이유를 익명화된 사용자의 응답을 바탕으로 제공하기 때문에 추천에 대한 신뢰도가 상대적으로 낮다. 하지만 아이템 기반 협업 필터링은 아이템이 가지는 특징을 기반으로 추천을 제시하고 유사도가 높은 아이템을 확인할 수 있기 때문에 신뢰도가 상대적으로 높다는 장점이 존재한다.

### **4) 사용자/아이템 기반 협업 필터링 수행**

In [10]:
# 훈련 데이터와 동일한 전처리 진행
valid['스타일 선호_y'] = valid['스타일 선호_y'].fillna(0).astype(str)
valid['스타일 비선호_y'] = valid['스타일 비선호_y'].fillna(0).astype(str)

pref_valid = valid[['id', '스타일 선호_y']].rename(columns={'스타일 선호_y': 'item'})
pref_valid['prefer'] = 1
pref_v = pref_valid[pref_valid['item'] != '0']

not_pref_valid = valid[['id', '스타일 비선호_y']].rename(columns={'스타일 비선호_y': 'item'})
not_pref_valid['prefer'] = 0
not_pref_v = not_pref_valid[not_pref_valid['item'] != '0']

# id와 item 관계를 딕셔너리로 표현
pref_dict = pref_v.groupby('id')['item'].apply(list).to_dict()
not_pref_dict = not_pref_v.groupby('id')['item'].apply(list).to_dict()

선호하는 아이템 중에서 예측 결과 확인 - True 값이 모두 1

In [11]:
us_cnt, it_cnt = [],[]

missing_keys = []
missing_values = []

for key,value in pref_dict.items():
  for id in value:
    try:
      user_preds = predict_user_preference(user_id=key, item_id=id, k=3, th=0.5)
      item_preds = predict_item_preference(user_id=key, item_id=id, k=3, th=0.5)
    except KeyError:
      missing_keys.append(key)
      missing_values.append(id)
      continue
    us_cnt.append(user_preds)
    it_cnt.append(item_preds)

us_result = sum(us_cnt)/(pref_v.shape[0] - len(missing_values)) * 100
it_result = sum(it_cnt)/(pref_v.shape[0] - len(missing_values)) * 100

sol = [1] * (pref_v.shape[0] - len(missing_values))

사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00799' '00886' '01703']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00799' '00886' '02804']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00495' '00500' '00540']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['28133' '00031' '00799']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['01520' '01568' '01706']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00811' '01520' '01568']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00843' '01520' '01568']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [59637.   368.   837.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['04128' '06118' '06322']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63505.   368.   837.]의 응답이 반영
아이템 기반 추천

비선호하는 아이템 중에서 예측 결과 확인 - True 값이 모두 0

In [12]:
us_cnt_v, it_cnt_v = [], []

missing_keys_v = []
missing_values_v = []

for key,value in not_pref_dict.items():
  for id in value:

    try:
      user_preds = predict_user_preference(user_id=key, item_id=id, k=3, th=0.5)
      item_preds = predict_item_preference(user_id=key, item_id=id, k=3, th=0.5)
    except KeyError:
      missing_keys_v.append(key)
      missing_values_v.append(id)
      continue
    us_cnt_v.append(user_preds)
    it_cnt_v.append(item_preds)

us_result_v = sum(us_cnt_v)/(not_pref_v.shape[0] - len(missing_values_v)) * 100
it_result_v = sum(it_cnt_v)/(not_pref_v.shape[0] - len(missing_values_v)) * 100

sol_v = [0] * (not_pref_v.shape[0] - len(missing_values_v))

사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['01532' '01658' '01804']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [63571. 28571. 63910.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [60184. 28371. 62349.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [59637.   368.   837.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [59637.   368.   837.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [59637.   368.   837.]의 응답이 반영
아이템 기반 추천의 근거 : 유사도가 높은 ['00026' '00034' '00036']에 대한 선호도가 반영
사용자 기반 추천의 근거 : 유사도가 높은 [59637.   368.   837.]의 응답이 반영
아이템 기반 추천

### **5) 최종 결과 확인 (훈련 데이터의 사용자가 응답하지 않은 아이템에 대해선 선호도를 예측할 수 없음)**

In [13]:
us_prediction = us_cnt + us_cnt_v
it_prediction = it_cnt + it_cnt_v
solution = sol + sol_v

print(f' 사용자 기반 혼동 행렬 \n {confusion_matrix(solution, us_prediction)}')
print(f' 아이템 기반 혼동 행렬 \n {confusion_matrix(solution, it_prediction)}\n')

print(f' 사용자 기반 정확도 : {accuracy_score(solution, us_prediction)}')
print(f' 아이템 기반 정확도 : {accuracy_score(solution, it_prediction)}\n')

print(f' 사용자 기반 재현율 : {recall_score(solution, us_prediction)}')
print(f' 아이템 기반 재현율 : {recall_score(solution, it_prediction)}\n')

print(f' 사용자 기반 정밀도 : {precision_score(solution, us_prediction)}')
print(f' 아이템 기반 정밀도 : {precision_score(solution, it_prediction)}\n')

print(f' 사용자 기반 F1 스코어 : {f1_score(solution, us_prediction)}')
print(f' 아이템 기반 F1 스코어 : {f1_score(solution, it_prediction)}\n')

print(f'사용자 기반 협업 필터링 누락 값 {missing_values}')

 사용자 기반 혼동 행렬 
 [[402   0]
 [241   6]]
 아이템 기반 혼동 행렬 
 [[401   1]
 [ 29 218]]

 사용자 기반 정확도 : 0.6286594761171033
 아이템 기반 정확도 : 0.9537750385208013

 사용자 기반 재현율 : 0.024291497975708502
 아이템 기반 재현율 : 0.8825910931174089

 사용자 기반 정밀도 : 1.0
 아이템 기반 정밀도 : 0.9954337899543378

 사용자 기반 F1 스코어 : 0.04743083003952569
 아이템 기반 F1 스코어 : 0.9356223175965666

사용자 기반 협업 필터링 누락 값 ['06864', '00551', '10104', '06590', '04927', '09731', '32034', '14783', '18714', '19205', '06522', '00931', '07120', '24770', '38585', '57473', '50836', '26393', '16375', '10103', '00851', '15758', '02705', '19520', '00716', '14785', '06190', '04629', '09285', '16466', '05851', '09750', '11778', '01294', '14572', '08977', '13251', '05917', '08492', '07643', '02394', '05716', '14706', '15244', '06609', '02770', '12789', '00842', '02908', '09085', '08951', '01387', '10184', '29347', '28373', '34504', '44789', '47408', '62253', '17235', '09277', '03003', '00012', '06358', '10251', '09747', '17004', '00804', '09278', '09889', '10167', 

아이템 기반 협업 필터링은 선호도를 예측하고자 하는 사용자의 응답을 반영하기 때문에 사용자 기반 협업 필터링보다 선호도를 더 정확하게 예측할 가능성이 높다.

현재 보유한 데이터는 응답자 수가 100명이고 응답한 아이템의 종류는 약 4000개 이다. 동일한 스타일 클래스로 아이템을 구분하지 않고 독립적인 아이템 id를 기준으로 구분하기 때문에 사용자의 응답에 따른 사용자 유사도를 정확하게 계산하기 구조이다. 하지만 상대적으로 아이템과 관련된 데이터는 많고 아이템 기반 협업 필터링이 가지는 장점 덕분에 아이템 기반 협업 필터링이 사용자 기반 협업 필터링 보다 우수한 성능을 보였다.

## **미션 3-2**

In [14]:
# train, valid 데이터 불러오기
df_train = pd.read_csv('/content/drive/MyDrive/DCC/df_train_response.csv')
df_valid = pd.read_csv('/content/drive/MyDrive/DCC/df_valid_response.csv')

### Rating Matrix 생성 & Valid data

#### Q1, Q5의 조합을 이용해서 응답 결과 수정

In [15]:
#train, valid 데이터에 대하여 설문 응답 결과 중 사용할 결과 데이터 불러오기

df_train_t = df_train[['E_id','R_id','imgName', 'era', 'style', 'gender', 'Q1', 'Q2', 'Q3', 'Q411', 'Q412', 'Q413', 'Q414', 'Q4201',
       'Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209',
       'Q4210', 'Q4211', 'Q4212', 'Q4213', 'Q4214', 'Q4215', 'Q4216', 'Q5']]
df_valid_t = df_valid[['E_id','R_id','imgName', 'era', 'style', 'gender', 'Q1', 'Q2', 'Q3', 'Q411', 'Q412', 'Q413', 'Q414', 'Q4201',
       'Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209',
       'Q4210', 'Q4211', 'Q4212', 'Q4213', 'Q4214', 'Q4215', 'Q4216', 'Q5']]

df_train_t['img_id'] = df_train_t['imgName'].str[2:7]
df_valid_t['img_id'] = df_valid_t['imgName'].str[2:7]

In [16]:
# 이미지에 대한 선호 여부 관련 데이터 불러오기
df_train_t1 = df_train_t[['img_id','R_id','Q1','Q5']]
df_valid_t1 = df_valid_t[['img_id','R_id','Q1','Q5']]

In [17]:
def compare_values(row):
  '''
  Q1과 Q5의 응답 결과를 조합하여 응답 결과의 강도를 조절하는 함수
  Q5의 응답의 방향을 따라가되, Q1의 응답에 따라서 강도를 동일한 간격으로 조절한다.
  이때, 극도로 응답이 변하는 경우는 향후 선호도 예측에 방해가 된다고 판단되어 제거할 계획

  start_res : Q1의 응답 결과
  last_res : Q5의 응답 결과
  new_res : 조합된 응답 결과
  '''
  start_res = row.iloc[-2]  # 응답 시작시 선호도 (Q1)
  last_res = row.iloc[-1]  # 응답 마무리시 선호도 (Q5) 선호 - 2, 비선호 - 1

  # 복잡한 조건 작성 (예시)
  if start_res == 4 and last_res == 1: #매우 선호 & 비선호 - 삭제
    return -0.25
  elif start_res == 4 and last_res == 2: #매우 선호 & 선호
    return 1
  elif start_res == 3 and last_res == 1: #선호 & 비선호
    return -0.5
  elif start_res == 3 and last_res == 2: #선호 & 선호
    return 0.75
  elif start_res == 2 and last_res == 1: #비선호 & 비선호
    return -0.75
  elif start_res == 2 and last_res == 2: #비선호 & 선호
    return 0.5
  elif start_res == 1 and last_res == 1: #매우 비선호 & 비선호
    return -1
  elif start_res == 1 and last_res == 2: #매우 비선호 & 선호 - 삭제
    return 0.25

def valid_compare(row):
  '''
  Valid에 존재하는 응답결과 1, 2를 선호도 예측을 위한 값으로 변환

  last_res : Q5의 응답 결과
  '''
  last_res = row.iloc[-1]  # 응답 마무리시 선호도 (Q5) 선호 - 2, 비선호 - 1
  if last_res == 1: #비선호
    return -1
  elif last_res == 2: #선호
    return 1

# 함수 적용
df_train_t1['new_res'] = df_train_t1.apply(compare_values, axis=1)
df_valid_t1['new_res'] = df_valid_t1.apply(valid_compare, axis=1)

결측치 제거 - (Q1 매우 비선호 & Q5 선호),(Q1 매우 선호 & Q5 비선호)

In [18]:
df_train_na1 = df_train_t1[df_train_t1.new_res != -0.25]
df_train_na2 = df_train_na1[df_train_na1.new_res != 0.25]

df_train_t1 = df_train_na2.copy()

In [19]:
id_df_train = df_train_t1.R_id.value_counts().reset_index(name='count') # 빈도 수 계산
id_df_valid = df_valid_t1.R_id.value_counts().reset_index(name='count') # 빈도 수 계산

# id_df_train와 id_df_valid id를 기준으로 결합
id_counts = id_df_train.merge(id_df_valid, on='R_id', how='outer')

# id별로 train/valid의 응답 합계 계산
id_counts['count'] = id_counts['count_x'].fillna(0) + id_counts['count_y'].fillna(0)

# 합계 컬럼만 남기고 응답 수 상위 100명의 id만 남겨 내림차순 정렬
id_counts = id_counts.drop(columns=['count_x', 'count_y']).sort_values(by='count', ascending=False).nlargest(100, 'count')
id_list = id_counts.R_id.values

df_train_t11 = df_train_t1[df_train_t1.R_id.isin(id_list)]
df_valid_t11 = df_valid_t1[df_valid_t1.R_id.isin(id_list)]

In [20]:
df_train_t11.rename(columns={'R_id':'id','new_res':'rating','img_id':'image_id'},inplace=True)
df_valid_t11.rename(columns={'R_id':'id','new_res':'rating','img_id':'image_id'},inplace=True)

df_train_t11['image_id'] = df_train_t11['image_id'].astype(str).str.zfill(5)

In [21]:
#상위 100명의 ID를 구해온다.
rating_matrix = df_train_t11.pivot_table(index='id', columns='image_id', values='rating',aggfunc='mean')
valid_df_t = df_valid_t11[['id','rating','image_id']]

### **1. Valid 설문 결과 사용하지 않고 예측**

#### 피처 벡터 & 유사도 생성

##### 이미지별 응답 결과를 최빈값 기준으로 병합
##### 이때 Valid 데이터에 존재하는 응답은 사용하지 않는다
##### Valid에만 존재하는 이미지 응답 결과를 대체하기 위해 기존의 유사도 행렬 기준으로 가장 유사한 이미지의 응답 결과를 가져온다

In [22]:
question_list = ['Q1', 'Q2', 'Q3', 'Q411', 'Q412', 'Q413', 'Q414', 'Q4201','Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209','Q4210', 'Q4211', 'Q4212', 'Q4213', 'Q4214', 'Q4215', 'Q4216']

df_train_t2 = df_train_t[['img_id'] + question_list]
df_valid_t2 = df_valid_t[['img_id'] + question_list]

df_ft = df_train_t2.copy()

In [23]:
#Train 설문지에 존재하는 이미지 리스트
img_id_list = df_train_t2.img_id.unique().tolist()
print(len(img_id_list))

#Valid 설문지에 존재하는 이미지 리스트
img_id_list_v = df_valid_t2.img_id.unique().tolist()
print(len(img_id_list_v))
print(len(set(img_id_list + img_id_list_v)))

#Valid에만 재하는 이미지 리스트존
inter_valid = list(set(img_id_list_v) - set(img_id_list))
print(len(inter_valid))

4066
951
4486
420


In [24]:
def calculate_mode(series):
  '''
  특정 이미지에 대한 응답을 최빈값을 이용해서 정리하는 함수
  만일 동일한 빈도 수의 응답이 존재하면 random으로 하나를 선택한다.
  '''
  np.random.seed(42)
  mode = series.mode()
  if len(mode) > 1: return np.random.choice(mode)
  return mode[0]


grouped = df_ft.groupby('img_id').agg(lambda x: calculate_mode(x) if x.name != 'img_id' else x.iloc[0])

응답 결과에 따른 유사도를 구하기 위해서 다양한 응답 결과가 존재하는 설문을 원-핫 인코딩을 수행. 또한 이진 분류 응답임에도 두 응답 모두 유의미한 응답이라면 원-핫 인코딩 수행

In [25]:
# Q2,Q3, Q411, Q412, Q413, Q414 원핫 인코딩 수행
grouped_one = pd.get_dummies(grouped, columns=['Q2', 'Q3','Q411','Q412','Q413','Q414'],dtype=int)

#이진 변수들 0,1 이진 변수화 진행
fin_group = grouped_one[['Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209', 'Q4210',
                   'Q4211','Q4212','Q4213','Q4214', 'Q4215', 'Q4216', 'Q2_1', 'Q2_2', 'Q2_3', 'Q3_1',
                   'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q411_1', 'Q411_2', 'Q411_3','Q412_1', 'Q412_2', 'Q413_1',
       'Q413_2', 'Q414_1', 'Q414_2']].applymap(lambda x: 1 if x != 0 else 0)
fin_group.index = fin_group.index.astype(str)

In [26]:
#fin_group 이미지 단어 변수 그룹화. 비슷한 의미를 가지는 단어들끼리 그룹화

fin_group_new = fin_group.copy()
fin_group_new['new_1'] =(fin_group_new['Q4202'] + fin_group_new['Q4204'] + fin_group_new['Q4215'])
fin_group_new['new_2'] = (fin_group_new['Q4205'] + fin_group_new['Q4208'] + fin_group_new['Q4216'])
fin_group_new['new_3'] = (fin_group_new['Q4206'] + fin_group_new['Q4207'] + fin_group_new['Q4209'])
fin_group_new['new_4'] = (fin_group_new['Q4210'] + fin_group_new['Q4211'] + fin_group_new['Q4212'] )
fin_group_new['new_5'] = (fin_group_new['Q4213'] + fin_group_new['Q4214'])

#그룹화 된 변수들은 drop
feature_matrix_new = fin_group_new.drop(columns= ['Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209', 'Q4210','Q4211', 'Q4212', 'Q4213', 'Q4214', 'Q4215', 'Q4216'])

Valid에 누락된 이미지 응답 결과를 Train 값으로 대체

In [27]:
sim_df = pd.read_csv('/content/drive/MyDrive/DCC/제출/sim_ori.csv')
sim_df.index = sim_df.columns
sim_df.head(3)

Unnamed: 0,02498,38421,30434,48628,22057,02247,44386,39725,38629,11610,21483,42595,...,17783,17616,32385,29693,23899,25649,24685,16732,34952,41341,20593,47967
2498,1.000001,0.675767,0.785014,0.801968,0.76037,0.78554,0.816377,0.719492,0.767426,0.735139,0.748646,0.836436,...,0.856784,0.853685,0.718745,0.849325,0.804929,0.771871,0.688192,0.79284,0.882943,0.793323,0.702265,0.789978
38421,0.675767,1.000001,0.735742,0.663379,0.732917,0.666624,0.743502,0.800364,0.689497,0.736814,0.798023,0.699084,...,0.602588,0.752113,0.797303,0.57752,0.788706,0.638761,0.718256,0.647658,0.690893,0.757572,0.834288,0.741483
30434,0.785014,0.735742,1.0,0.815705,0.757281,0.822041,0.811331,0.738502,0.825745,0.746171,0.874997,0.786455,...,0.790323,0.734472,0.656305,0.762896,0.721047,0.707614,0.721573,0.725356,0.778056,0.836816,0.79311,0.709165


Valid에만 존재하는 이미지와 가장 유사도가 높은 이미지의 ID를 딕셔너리에 저장

In [28]:
result = dict()
temp = []

for id in inter_valid:
  top_list = sim_df[id].sort_values(ascending=False).drop(id).index.tolist()
  for top_id in top_list:
    if top_id in img_id_list:
      temp.append(top_id)
      if len(temp) == 1:
        result[id] = temp
        temp = []
        break
print(len(result))

420


응답 결과가 존재하지 않는 Valid의 이미지 응답을 대체

In [29]:
for key, item in result.items():
  print(f'Valid Id : {key}')
  temp_ = feature_matrix_new.loc[item]
  #mode_vector = temp_.mode().iloc[0]
  feature_matrix_new.loc[key] = temp_.values.tolist()[0]

feature_matrix_new.shape

Valid Id : 34080
Valid Id : 02275
Valid Id : 12730
Valid Id : 06011
Valid Id : 25940
Valid Id : 42967
Valid Id : 02946
Valid Id : 14357
Valid Id : 50836
Valid Id : 08621
Valid Id : 17742
Valid Id : 15508
Valid Id : 16819
Valid Id : 12106
Valid Id : 13991
Valid Id : 13398
Valid Id : 05760
Valid Id : 15319
Valid Id : 13781
Valid Id : 00624
Valid Id : 06190
Valid Id : 15341
Valid Id : 12847
Valid Id : 47408
Valid Id : 00161
Valid Id : 17787
Valid Id : 00511
Valid Id : 12668
Valid Id : 08008
Valid Id : 11163
Valid Id : 07077
Valid Id : 28480
Valid Id : 54129
Valid Id : 06985
Valid Id : 17235
Valid Id : 07316
Valid Id : 15998
Valid Id : 34952
Valid Id : 24643
Valid Id : 24535
Valid Id : 00359
Valid Id : 27899
Valid Id : 16538
Valid Id : 03791
Valid Id : 26120
Valid Id : 29658
Valid Id : 03293
Valid Id : 06358
Valid Id : 29063
Valid Id : 16915
Valid Id : 33006
Valid Id : 17135
Valid Id : 24770
Valid Id : 01123
Valid Id : 15910
Valid Id : 15394
Valid Id : 17004
Valid Id : 00716
Valid Id : 187

(4486, 25)

##### 이미지 피처 벡터 생성

In [None]:
model_weight_path = "/content/drive/MyDrive/DCC/제출/segmentation_batch16_lr1e-4_argu_o_schedule_o_labelsmoothing_e0.05.pth"

model = models.resnet18()
model.fc = nn.Linear(model.fc.in_features, 31)
model.load_state_dict(torch.load(model_weight_path, map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
class FeatureExtractor:
  '''
  ResNet 모델에서 특정 레이어의 Feature를 추출하기 위한 클래스.

  __init__ : avgpool 레이어에 forward hook 등록
  hook_fn : avgpool 레이어 출력(feature)을 Flatten 후 저장
  close : hook 제거

  '''
  def __init__(self, model):
    self.model = model
    self.feature = None
    self.hook = self.model.avgpool.register_forward_hook(self.hook_fn)

  def hook_fn(self, module, input, output):
    self.feature = output.view(output.size(0), -1).detach().cpu().numpy()

  def close(self):
    self.hook.remove()

#**모델에 입력하기 위한 이미지 전처리**#
preprocess = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()
extractor = FeatureExtractor(model)

features_dict = {}
unique_image_id = []

with zipfile.ZipFile('/content/drive/MyDrive/DCC/2024 데이터 크리에이터 캠프 대학부 데이터셋.zip', 'r') as zip_file:
  for file_name in tqdm(zip_file.namelist()):
    if not file_name.endswith('.jpg'): continue

    #**train 파일과 valid 파일에 존재하는 중복 파일 방지**#
    if file_name.startswith('training_image/'):
      temp_image_id = file_name.split('_')[2]
      if temp_image_id in unique_image_id : continue
    else:
      temp_image_id = file_name.split('_')[2]
      if temp_image_id in unique_image_id : continue

    unique_image_id.append(temp_image_id)
    #**file_name에 해당하는 이미지를 불러와서 feature vector 추출**#
    with zip_file.open(file_name) as file:
      image = Image.open(BytesIO(file.read())).convert("RGB")
      input_tensor = preprocess(image)
      input_batch = input_tensor.unsqueeze(0).to(device)

      #**특징 추출**#
      with torch.no_grad():
        model(input_batch)
        features = extractor.feature
        features_dict[temp_image_id] = features

#**딕셔너리 형태로 저장된 객체 데이터 프레임으로 변환**#
features_dict = {k: v.squeeze() for k, v in features_dict.items()}
feature_matrix = pd.DataFrame.from_dict(features_dict, orient='index')

100%|██████████| 252754/252754 [21:53<00:00, 192.44it/s]


In [None]:
ft_mat = feature_matrix.copy()

In [30]:
#이미지 피처 벡터 불러오기
ft_mat = pd.read_csv('/content/drive/MyDrive/DCC/제출/final_feature_matrix.csv')
ft_mat.rename(columns={'Unnamed: 0':'img_id'},inplace=True)
ft_mat.img_id = ft_mat.img_id.astype(str).str.zfill(5)
ft_mat.set_index('img_id',inplace=True)

print(ft_mat.shape)

(4486, 512)


##### 응답 결과 벡터와 이미지 피처 벡터 결합

In [31]:
# 벡터별 가중치 조절. 이미지 전체의 가중치 합 = 설문 조사 전체 가중치 합
ori_pre = ft_mat.copy()
ori_pre = ori_pre *(1/512)

new_pre = feature_matrix_new.copy()
new_pre = new_pre *(1/25)

#이어붙인 feature matrix
ft_matrix_q = pd.concat([ori_pre,new_pre],axis=1)

print(ft_matrix_q.shape)

(4486, 537)


In [32]:
#이어붙인 feature matrix의 유사도 행렬 계산
sim_total = pd.DataFrame(cosine_similarity(ft_matrix_q), index=ft_matrix_q.index, columns=ft_matrix_q.index)
col = sim_total.columns.astype(str).str.zfill(5)
sim_total.columns = col
sim_total.index = col
sim_total.head(3)

img_id,02498,38421,30434,48628,22057,02247,44386,39725,38629,11610,21483,42595,...,17783,17616,32385,29693,23899,25649,24685,16732,34952,41341,20593,47967
img_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2498,1.0,0.282606,0.469933,0.575371,0.676416,0.39259,0.58747,0.19076,0.425966,0.255293,0.420334,0.217371,...,0.480149,0.310186,0.318138,0.313949,0.625697,0.34213,0.38032,0.292299,0.399747,0.467925,0.38651,0.292892
38421,0.282606,1.0,0.567371,0.110635,0.194783,0.4636,0.265155,0.775197,0.205335,0.416331,0.402775,0.593521,...,0.458427,0.634097,0.696508,0.105525,0.56769,0.501883,0.714237,0.460414,0.560974,0.47129,0.853198,0.718067
30434,0.469933,0.567371,1.0,0.404972,0.387165,0.464697,0.510482,0.478206,0.411264,0.305772,0.17648,0.689493,...,0.548476,0.349671,0.547364,0.251865,0.707885,0.39231,0.44765,0.562728,0.452779,0.545692,0.689466,0.334738


#### 성능 확인

저장된 파일을 불러올때

In [None]:
rating_matrix = pd.read_csv('/content/drive/MyDrive/DCC/Mission3_upgrade/rating_matrix_Q1Q5_deloutlier.csv')
if 'user_id' in rating_matrix.columns:
  rating_matrix.set_index('user_id',inplace=True)
elif 'id' in rating_matrix.columns:
  rating_matrix.set_index('id',inplace=True)

sim_df = pd.read_csv('/content/drive/MyDrive/DCC/Mission3_upgrade/sim_df_Novalid.csv')
if 'img_id' in sim_df.columns:
  sim_df.set_index('img_id',inplace=True)
  sim_df.index = sim_df.index.astype(str).str.zfill(5)
else:
  sim_df.index = sim_df.columns
valid_df_t = pd.read_csv('/content/drive/MyDrive/DCC/Mission3_upgrade/valid_rating.csv')
valid_df_t.image_id = valid_df_t.image_id.astype(str).str.zfill(5)

파일 불러옴 없이 곧바로 실행할때

In [33]:
#rating matrix
if 'user_id' in rating_matrix.columns:
  rating_matrix.set_index('user_id',inplace=True)
elif 'id' in rating_matrix.columns:
  rating_matrix.set_index('id',inplace=True)

#valid_df_t
valid_df_t.image_id = valid_df_t.image_id.astype(str).str.zfill(5)

#similarity matrix
sim_df = sim_total.copy()
if 'img_id' in sim_df.columns:
  sim_df.set_index('img_id',inplace=True)
  sim_df.index = sim_df.index.astype(str).str.zfill(5)
else:
  sim_df.index = sim_df.columns

##### 통계량 계산 함수

In [34]:
def predict_and_evaluate(func, rating_matrix, sim_df, valid_df_t, threshold=0, k1=1, min_similarity1=0.5, k2=5, min_similarity2=0.5, min_similarity3=0.5, min_similarity4=0.5):
  '''
  아이템 기반 협업 필터링 결과를 확인하는 함수.
  평가 지표 : 혼동 행렬, 정확도, f1-score

  func : 아이템 협업 필터링을 위한 베이스 함수
  rating_matrix : 유틸리티 행렬
  sim_df : feature 유사도 행렬
  valid_df_t : 검증 데이터
  threshold : 선호도 판단 기준
  k : 유사도 선택 개수
  '''

  # 모든 예측 수행
  if func.__name__ == 'predict':
    pred_list = [func(valid_df_t.iloc[i, 0], str(valid_df_t.iloc[i, 2]), threshold = threshold) for i in range(len(valid_df_t))]

  elif func.__name__ == 'predict_v2_1':#2번 변형
    pred_list = [func(valid_df_t.iloc[i, 0], str(valid_df_t.iloc[i, 2]), threshold = threshold, k1 = k1, min_similarity1 = min_similarity1, min_similarity2 = min_similarity2, k2 = k2) for i in range(len(valid_df_t))]


  # 예측 결과 저장
  valid_df_t['predict'] = pred_list

  # 평가 지표 계산
  conf_matrix = confusion_matrix(valid_df_t['rating'], valid_df_t['predict'])
  acc = accuracy_score(valid_df_t['rating'], valid_df_t['predict'])
  rcs = recall_score(valid_df_t['rating'], valid_df_t['predict'],average='binary')
  pcs = precision_score(valid_df_t['rating'], valid_df_t['predict'])
  f1 = f1_score(valid_df_t['rating'], valid_df_t['predict'], average='binary')

  # # 결과 출력 - 하이퍼 파라미터 튜닝 중에는 잠시 출력 정지
  print("Confusion Matrix:\n", pd.DataFrame(conf_matrix, index=['N', 'P'], columns=['N', 'P']))
  print(f"Accuracy: {acc:.4f}")
  # print(f"F1-Score: {f1:.4f}")

  return acc, rcs, pcs, f1

##### 1번 Base 함수

In [35]:
def predict(user_id, item_id, threshold):
  '''
  feature 유사도 행렬을 이용해서 아이템 기반 협업 필터링을 수행하는 기본적인 함수.

  user_id : 사용자 식별 ID
  item_id : 아이템 식별 ID
  threshold : 선호도 판단 기준

  rating_matrix : 유틸리티 행렬
  sim_df : feature 유사도 행렬
  '''
  # 유저가 평가한 아이템의 선호도 1 또는 0 또는 Nan

  user_ratings = rating_matrix.loc[user_id].dropna()

  # 해당 아이템과 평가된 아이템 간 유사도
  similarities = sim_df[item_id].loc[user_ratings.index]

  if item_id in similarities.index: similarities.drop(index=item_id)

  valid_indices = similarities.index

  # 유사도와 평점의 내적 계산
  prediction = np.dot(similarities, user_ratings) / similarities.sum()

  # 임계값 기준으로 이진 분류 (1 또는 0)
  return 1 if prediction >= threshold else -1

predict_and_evaluate(predict, rating_matrix, sim_df, valid_df_t, threshold = 0, k1=1)

Confusion Matrix:
      N    P
N  561   99
P  248  194
Accuracy: 0.6851


(0.6851179673321234,
 0.43891402714932126,
 0.6621160409556314,
 0.527891156462585)

##### 2번 함수

In [36]:
def predict_v2_1(user_id, item_id, threshold, k1, min_similarity1, min_similarity2, k2):
  '''
  위에 버전에서 min_similarity = 0.92에 대해 k1 = 1 인 아이템 찾음.
  나머지 아이템 들에 대해서는 min_similarity2, k2 = k2를 적용함.

  user_id : 사용자 식별 ID
  item_id : 아이템 식별 ID
  threshold : 선호도 판단 기준
  k1: 높은 유사도 선택 개수
  k2 : 유사도 선택 개수
  min_similarity1 : 최소 유사도 값
  min_similarity2: min_similarity를 만족시키지 못한 아이템들의 차선 similarity
  rating_matrix : 유틸리티 행렬
  sim_df : feature 유사도 행렬
      '''
  user_ratings = rating_matrix.loc[user_id].dropna()
  similarities = sim_df[item_id].loc[user_ratings.index]

  valid_indices_pri = similarities[similarities >= min_similarity1].nlargest(k1).index
  valid_indices = similarities[similarities >= min_similarity2].nlargest(k2).index

  #유사도가 0.92 이상인게 있으면,k1 = 1 사용
  if len(valid_indices_pri) > 0:
    # 유사도 0.92 이상을 만족하는 상위 1개 아이템의 유사도와 평점을 사용, 정확도 매우 좋음
    filtered_similarities = similarities[valid_indices_pri]
    filtered_ratings = user_ratings[valid_indices_pri]


  #유사도가 0.92이상인게 없을 때,min_similarity2 기준으로 상위 k개 선택
  elif len(valid_indices)>0:

    filtered_similarities = similarities[valid_indices]
    filtered_ratings = user_ratings[valid_indices]
    weighted_sum = np.dot(filtered_similarities, filtered_ratings)
    prediction = weighted_sum / filtered_similarities.sum()

  else:
    # 유사도가 충분하지 않아 예측 불가
    filtered_ratings = user_ratings.values
    filtered_similarities = similarities


  weighted_sum = np.dot(filtered_similarities, filtered_ratings)
  prediction = weighted_sum / filtered_similarities.sum()

  # 임계값 기준으로 이진 분류 (-1 또는 1)

  return 1 if prediction >= threshold else -1

predict_and_evaluate(predict_v2_1, rating_matrix, sim_df, valid_df_t, threshold= 0, k1=1, min_similarity1 = 0.92, min_similarity2 = 0.41, k2 = 34)

Confusion Matrix:
      N    P
N  602   58
P  110  332
Accuracy: 0.8475


(0.8475499092558983, 0.751131221719457, 0.8512820512820513, 0.7980769230769231)

###### Optuna 최적화

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.6-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import optuna

In [None]:
def objective_predict_v1(trial):
  '''
  주어진 함수에서 최적의 하이퍼 파라미터를 탐색하는 함수
  최적화 기준은 Accuracy로 진행. 정의된 f1 score, recall, precision과 같은 지표로도 최적화 수행 가능
  '''
  # 하이퍼파라미터 탐색 공간 정의
  threshold = trial.suggest_float("threshold", -0.5, 0.5, step=0.1)
  # 하이퍼파라미터 튜닝
  valid_df_t['predict'] = valid_df_t.apply(lambda x: predict(x[0], str(x[2]), threshold), axis=1)
  f1 = f1_score(valid_df_t['rating'], valid_df_t['predict'])
  acc = accuracy_score(valid_df_t['rating'], valid_df_t['predict'])
  rcs = recall_score(valid_df_t['rating'], valid_df_t['predict'])
  pcs = precision_score(valid_df_t['rating'], valid_df_t['predict'])

  return acc

def objective_predict_v2_1(trial):
  threshold = trial.suggest_float("threshold", -0.5, 0.5, step=0.1)
  k = trial.suggest_int("k", 1,1)
  min_similarity1 = trial.suggest_float("min_similarity1", 0.7, 0.99, step=0.01)
  min_similarity2 = trial.suggest_float("min_similarity2", 0.2, 0.6, step=0.01)
  k2 = trial.suggest_int("k2", 30, 50)
  valid_df_t['predict'] = valid_df_t.apply(lambda x: predict_v2_1(x[0], str(x[2]),threshold,k,min_similarity1,min_similarity2, k2), axis=1)

  f1 = f1_score(valid_df_t['rating'], valid_df_t['predict'])
  acc = accuracy_score(valid_df_t['rating'], valid_df_t['predict'])
  rcs = recall_score(valid_df_t['rating'], valid_df_t['predict'])
  pcs = precision_score(valid_df_t['rating'], valid_df_t['predict'])

  return acc

In [None]:
# Optuna 튜닝 실행
predict_and_evaluate(predict, rating_matrix, sim_df, valid_df_t)

study_v1 = optuna.create_study(direction="maximize")
study_v1.optimize(objective_predict_v1, n_trials=30)

print("Best parameters:", study_v1.best_params)
print("Best Accuracy:", study_v1.best_value)

[I 2024-11-26 14:22:34,352] A new study created in memory with name: no-name-e8505801-adc0-4324-b293-c0ede6c996fe


Confusion Matrix:
      N    P
N  561   99
P  248  194


[I 2024-11-26 14:22:36,530] Trial 0 finished with value: 0.6923774954627949 and parameters: {'threshold': -0.09999999999999998}. Best is trial 0 with value: 0.6923774954627949.
[I 2024-11-26 14:22:39,094] Trial 1 finished with value: 0.573502722323049 and parameters: {'threshold': -0.4}. Best is trial 0 with value: 0.6923774954627949.
[I 2024-11-26 14:22:41,472] Trial 2 finished with value: 0.6851179673321234 and parameters: {'threshold': 0.0}. Best is trial 0 with value: 0.6923774954627949.
[I 2024-11-26 14:22:45,966] Trial 3 finished with value: 0.6188747731397459 and parameters: {'threshold': 0.30000000000000004}. Best is trial 0 with value: 0.6923774954627949.
[I 2024-11-26 14:22:48,811] Trial 4 finished with value: 0.6651542649727767 and parameters: {'threshold': 0.10000000000000009}. Best is trial 0 with value: 0.6923774954627949.
[I 2024-11-26 14:22:51,097] Trial 5 finished with value: 0.6851179673321234 and parameters: {'threshold': 0.0}. Best is trial 0 with value: 0.692377495

Best parameters: {'threshold': -0.09999999999999998}
Best Accuracy: 0.6923774954627949


In [None]:
predict_and_evaluate(predict_v2_1, rating_matrix, sim_df, valid_df_t, threshold= -0.1, k1=1, min_similarity1 = 0.92, min_similarity2 = 0.6, k2 = 37)

study_v2 = optuna.create_study(direction="maximize")
study_v2.optimize(objective_predict_v2_1, n_trials=300)

print("Best parameters:", study_v2.best_params)
print("Best Accuracy:", study_v2.best_value)

[I 2024-11-26 14:55:33,523] A new study created in memory with name: no-name-68c8539f-fade-43ba-b84e-bbfc96b53c5c


Confusion Matrix:
      N    P
N  574   86
P   85  357


[I 2024-11-26 14:55:38,408] Trial 0 finished with value: 0.809437386569873 and parameters: {'threshold': -0.5, 'k': 1, 'min_similarity1': 0.83, 'min_similarity2': 0.4, 'k2': 47}. Best is trial 0 with value: 0.809437386569873.
[I 2024-11-26 14:55:43,567] Trial 1 finished with value: 0.8439201451905626 and parameters: {'threshold': 0.10000000000000009, 'k': 1, 'min_similarity1': 0.82, 'min_similarity2': 0.35, 'k2': 31}. Best is trial 1 with value: 0.8439201451905626.
[I 2024-11-26 14:55:47,833] Trial 2 finished with value: 0.8439201451905626 and parameters: {'threshold': 0.10000000000000009, 'k': 1, 'min_similarity1': 0.87, 'min_similarity2': 0.39, 'k2': 42}. Best is trial 1 with value: 0.8439201451905626.
[I 2024-11-26 14:55:50,807] Trial 3 finished with value: 0.838475499092559 and parameters: {'threshold': -0.19999999999999996, 'k': 1, 'min_similarity1': 0.8899999999999999, 'min_similarity2': 0.46, 'k2': 46}. Best is trial 1 with value: 0.8439201451905626.
[I 2024-11-26 14:55:54,037] 

Best parameters: {'threshold': -0.09999999999999998, 'k': 1, 'min_similarity1': 0.94, 'min_similarity2': 0.31, 'k2': 44}
Best Accuracy: 0.8548094373865699


In [None]:
predict_and_evaluate(predict, rating_matrix, sim_df, valid_df_t, threshold=study_v1.best_params['threshold'])

predict_and_evaluate(predict_v2_1, rating_matrix, sim_df, valid_df_t, threshold=study_v2.best_params['threshold'], k1=study_v2.best_params['k'], k2 = study_v2.best_params['k2'], min_similarity1 = study_v2.best_params['min_similarity1'], min_similarity2 = study_v2.best_params['min_similarity2'])

Confusion Matrix:
      N    P
N  499  161
P  178  264
Confusion Matrix:
      N    P
N  587   73
P   87  355


(0.8548094373865699,
 0.8031674208144797,
 0.8294392523364486,
 0.8160919540229885)

###### Grid Search 최적화

In [None]:
# 하이퍼파라미터 후보들, min_similarity2 구하는 중

min_1 = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95]
min_2 = [0.45,0.5,0.55,0.6,0.65,0.7]
k2_values = [25,30,35,40]
thresholds = [-0.1,0,0.1]
# 최적의 Accuracy와 하이퍼파라미터를 저장할 변수 초기화
best_f1 = 0
best_acc = 0
best_k2 = None
best_threshold = None

for threshold in thresholds:
  for k2 in k2_values:
      for min_similarity2 in min_2:
        for min_similarity1 in min_1:
          # 성능 평가
          acc, rcs, pcs, f1 = predict_and_evaluate(predict_v2_1, rating_matrix,
                                    sim_df, valid_df_t, threshold= threshold
                                    , k1=1, min_similarity1 = min_similarity1, min_similarity2 = min_similarity2, k2 = k2)

          # 현재 accuracy가 최고값보다 높으면 업데이트
          if acc > best_acc:
              best_f1 = f1
              best_acc = acc
              best_k2 = k2
              best_min1 = min_similarity1
              best_min2 = min_similarity2
              best_threshold = threshold


# 최적의 k2, min_similarity2, min_similarity1,threshold 출력
print(f"Best F1-Score: {best_f1}")
print(f"Best Accuracy: {best_acc}")
print(f"Best k2: {best_k2}")
print(f"Best min_similarity2: {best_min2}")
print(f"Best threshold: {best_threshold}")
print(f"Best min_similarity1: {best_min1}")

Accuracy: 0.8448
Accuracy: 0.8466
Accuracy: 0.8466
Accuracy: 0.8485
Accuracy: 0.8485
Accuracy: 0.8466
Accuracy: 0.8439
Accuracy: 0.8457
Accuracy: 0.8457
Accuracy: 0.8475
Accuracy: 0.8475
Accuracy: 0.8457
Accuracy: 0.8412
Accuracy: 0.8421
Accuracy: 0.8421
Accuracy: 0.8439
Accuracy: 0.8439
Accuracy: 0.8421
Accuracy: 0.8439
Accuracy: 0.8448
Accuracy: 0.8448
Accuracy: 0.8457
Accuracy: 0.8457
Accuracy: 0.8439
Accuracy: 0.8466
Accuracy: 0.8475
Accuracy: 0.8475
Accuracy: 0.8485
Accuracy: 0.8485
Accuracy: 0.8466
Accuracy: 0.8475
Accuracy: 0.8466
Accuracy: 0.8475
Accuracy: 0.8485
Accuracy: 0.8485
Accuracy: 0.8466
Accuracy: 0.8430
Accuracy: 0.8448
Accuracy: 0.8448
Accuracy: 0.8466
Accuracy: 0.8466
Accuracy: 0.8448
Accuracy: 0.8439
Accuracy: 0.8457
Accuracy: 0.8457
Accuracy: 0.8475
Accuracy: 0.8475
Accuracy: 0.8457
Accuracy: 0.8412
Accuracy: 0.8421
Accuracy: 0.8421
Accuracy: 0.8439
Accuracy: 0.8439
Accuracy: 0.8421
Accuracy: 0.8439
Accuracy: 0.8448
Accuracy: 0.8448
Accuracy: 0.8457
Accuracy: 0.84

### **Valid 설문 결과 사용하지 않은 최적 모델 성능**

In [37]:
predict_and_evaluate(predict_v2_1, rating_matrix, sim_df, valid_df_t,threshold= 0.1, k1 = 1, k2 = 25, min_similarity1=0.93,  min_similarity2 = 0.45)

Confusion Matrix:
      N    P
N  625   35
P  120  322
Accuracy: 0.8593


(0.8593466424682396,
 0.7285067873303167,
 0.9019607843137255,
 0.8060075093867334)

### **2. Valid 설문 결과 사용하여 예측**

#### 피처 벡터 & 유사도 생성

##### 응답 결과 피처 벡터 생성

In [38]:
# train, valid 데이터셋 모두 사용
df_concat = pd.concat([df_train_t2,df_valid_t2])

In [39]:
grouped2 = df_concat.groupby('img_id').agg(lambda x: calculate_mode(x) if x.name != 'img_id' else x.iloc[0])

In [40]:
# Q2,Q3, Q411, Q412, Q413, Q414 원핫 인코딩 수행
grouped_one = pd.get_dummies(grouped2, columns=['Q2', 'Q3','Q411','Q412','Q413','Q414'],dtype=int)

#이진 변수들 0,1 이진 변수화 진행
fin_group = grouped_one[['Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209', 'Q4210',
                   'Q4211','Q4212','Q4213','Q4214', 'Q4215', 'Q4216', 'Q2_1', 'Q2_2', 'Q2_3', 'Q3_1',
                   'Q3_2', 'Q3_3', 'Q3_4', 'Q3_5', 'Q3_6', 'Q3_7', 'Q3_8', 'Q411_1', 'Q411_2', 'Q411_3','Q412_1', 'Q412_2', 'Q413_1',
       'Q413_2', 'Q414_1', 'Q414_2']].applymap(lambda x: 1 if x != 0 else 0)
fin_group.index = fin_group.index.astype(str)

In [41]:
fin_group_new = fin_group.copy()

#fin_group 이미지 단어 변수 그룹화. 비슷한 의미를 가지는 단어들끼리 그룹화
fin_group_new['new_1'] =(fin_group_new['Q4202'] + fin_group_new['Q4204'] + fin_group_new['Q4215'])
fin_group_new['new_2'] = (fin_group_new['Q4205'] + fin_group_new['Q4208'] + fin_group_new['Q4216'])
fin_group_new['new_3'] = (fin_group_new['Q4206'] + fin_group_new['Q4207'] + fin_group_new['Q4209'])
fin_group_new['new_4'] = (fin_group_new['Q4210'] + fin_group_new['Q4211'] + fin_group_new['Q4212'] )
fin_group_new['new_5'] = (fin_group_new['Q4213'] + fin_group_new['Q4214'])

#그룹화 된 변수들은 drop
feature_matrix_new = fin_group_new.drop(columns= ['Q4202', 'Q4203', 'Q4204', 'Q4205', 'Q4206', 'Q4207', 'Q4208', 'Q4209', 'Q4210','Q4211', 'Q4212', 'Q4213', 'Q4214', 'Q4215', 'Q4216'])

In [None]:
# feature matrix 불러와서 사용할 때
#feature_matrix_new = pd.read_csv("/content/drive/MyDrive/DCC/행렬/fin_survey_feature.csv", index_col = ['Unnamed: 0'])
#feature_matrix_new.index = feature_matrix_new.index.astype(str).str.zfill(5)

##### 응답 결과 벡터와 이미지 피처 벡터 결합

In [42]:
# 벡터별 가중치 조절. 이미지 전체의 가중치 합 = 설문 조사 전체 가중치 합

#이미지 피처 벡터
ori_pre = ft_mat.copy()
ori_pre = ori_pre *(1/512)

#설문 결과 피처 벡터
new_pre = ft_mat.copy()
new_pre = feature_matrix_new *(1/25)

In [43]:
#이어붙인 feature matrix
ft_matrix_q = pd.concat([ori_pre,new_pre],axis=1)

In [None]:
ft_matrix_q

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,Q412_2,Q413_1,Q413_2,Q414_1,Q414_2,new_1,new_2,new_3,new_4,new_5
02498,0.004302,0.002615,0.000573,0.001561,0.000619,0.003800,0.003354,0.001978,0.001528,0.000266,...,0.00,0.04,0.00,0.00,0.04,0.08,0.00,0.00,0.04,0.00
38421,0.001882,0.001731,0.003102,0.001573,0.003531,0.003027,0.002773,0.003218,0.000370,0.000260,...,0.04,0.00,0.04,0.00,0.04,0.00,0.00,0.00,0.00,0.04
30434,0.002908,0.002350,0.001500,0.001100,0.002250,0.002782,0.000613,0.002049,0.002047,0.001050,...,0.04,0.00,0.04,0.00,0.04,0.08,0.00,0.00,0.04,0.00
48628,0.006144,0.005431,0.001632,0.001822,0.000827,0.004789,0.005125,0.001201,0.000314,0.000016,...,0.00,0.04,0.00,0.04,0.00,0.08,0.04,0.04,0.00,0.00
22057,0.001103,0.003216,0.000251,0.001613,0.000345,0.001056,0.002117,0.000789,0.002091,0.000437,...,0.00,0.04,0.00,0.04,0.00,0.08,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16732,0.002409,0.001336,0.000483,0.003351,0.001867,0.002403,0.000842,0.000386,0.000868,0.001149,...,0.04,0.00,0.04,0.00,0.04,0.00,0.00,0.00,0.00,0.00
34952,0.003333,0.002878,0.000303,0.003313,0.000157,0.001925,0.002301,0.001878,0.001304,0.000724,...,0.00,0.04,0.00,0.00,0.04,0.04,0.00,0.00,0.04,0.04
41341,0.000340,0.000864,0.000599,0.002048,0.002045,0.002834,0.001091,0.004434,0.001055,0.000363,...,0.04,0.00,0.04,0.00,0.04,0.08,0.04,0.00,0.04,0.00
20593,0.001508,0.001709,0.002727,0.003315,0.003652,0.003049,0.002182,0.003119,0.000659,0.001393,...,0.00,0.04,0.00,0.04,0.00,0.00,0.00,0.08,0.00,0.00


In [44]:
#이어붙인 feature matrix의 유사도 행렬 계산
sim_total = pd.DataFrame(cosine_similarity(ft_matrix_q), index=ft_matrix_q.index, columns=ft_matrix_q.index)
col = sim_total.columns.astype(str).str.zfill(5)
sim_total.columns = col
sim_total.index = col

#### 성능 확인

In [45]:
def predict_and_evaluate(func, rating_matrix, sim_df, valid_df_t, threshold=0, k1=1, min_similarity1=0.5, k2=5, min_similarity2=0.5, min_similarity3=0.5, min_similarity4=0.5):
  '''
  아이템 기반 협업 필터링 결과를 확인하는 함수.
  평가 지표 : 혼동 행렬, 정확도, f1-score

  func : 아이템 협업 필터링을 위한 베이스 함수
  rating_matrix : 유틸리티 행렬
  sim_df : feature 유사도 행렬
  valid_df_t : 검증 데이터
  threshold : 선호도 판단 기준
  k : 유사도 선택 개수
  '''

  # 모든 예측 수행
  if func.__name__ == 'predict':
    pred_list = [func(valid_df_t.iloc[i, 0], str(valid_df_t.iloc[i, 2]), threshold = threshold) for i in range(len(valid_df_t))]

  elif func.__name__ == 'predict_v2_1':#2번 변형
    pred_list = [func(valid_df_t.iloc[i, 0], str(valid_df_t.iloc[i, 2]), threshold = threshold, k1 = k1, min_similarity1 = min_similarity1, min_similarity2 = min_similarity2, k2 = k2) for i in range(len(valid_df_t))]

  elif func.__name__ == 'predict_v3':#3번
    pred_list = [func(valid_df_t.iloc[i, 0], str(valid_df_t.iloc[i, 2]), threshold = threshold, k1 = k1, k2 = k2, min_similarity1 = min_similarity1, min_similarity2 = min_similarity2, min_similarity3 = min_similarity3, min_similarity4 = min_similarity4) for i in range(len(valid_df_t))]

  # 예측 결과 저장
  valid_df_t['predict'] = pred_list

  # 평가 지표 계산
  conf_matrix = confusion_matrix(valid_df_t['rating'], valid_df_t['predict'])
  acc = accuracy_score(valid_df_t['rating'], valid_df_t['predict'])
  rcs = recall_score(valid_df_t['rating'], valid_df_t['predict'],average='binary')
  pcs = precision_score(valid_df_t['rating'], valid_df_t['predict'])
  f1 = f1_score(valid_df_t['rating'], valid_df_t['predict'], average='binary')

  # # 결과 출력 - 하이퍼 파라미터 튜닝 중에는 잠시 출력 정지
  print("Confusion Matrix:\n", pd.DataFrame(conf_matrix, index=['N', 'P'], columns=['N', 'P']))
  print(f"Accuracy: {acc:.4f}")
  # print(f"Recall: {rcs:.4f}")
  # print(f"Precision: {pcs:.4f}")
  # print(f"F1-Score: {f1:.4f}")

  return acc, rcs, pcs, f1

##### 1번 Base 함수 (이미지 + 설문 Feature vector 사용)

In [46]:
def predict(user_id, item_id, threshold):
  '''
  feature 유사도 행렬을 이용해서 아이템 기반 협업 필터링을 수행하는 기본적인 함수.

  user_id : 사용자 식별 ID
  item_id : 아이템 식별 ID
  threshold : 선호도 판단 기준

  rating_matrix : 유틸리티 행렬
  sim_df : feature 유사도 행렬
  '''
  # 유저가 평가한 아이템의 선호도 1 또는 0 또는 Nan
  user_ratings = rating_matrix.loc[user_id].dropna()

  # 해당 아이템과 평가된 아이템 간 유사도
  similarities = sim_total[item_id].loc[user_ratings.index]

  if item_id in similarities.index: similarities.drop(index=item_id)

  valid_indices = similarities.index

  # 유사도와 평점의 내적 계산
  prediction = np.dot(similarities, user_ratings) / similarities.sum()

  # 임계값 기준으로 이진 분류 (1 또는 0)
  return 1 if prediction >= threshold else -1

predict_and_evaluate(predict, rating_matrix, sim_total, valid_df_t)

Confusion Matrix:
      N    P
N  569   91
P  239  203
Accuracy: 0.7005


(0.7005444646098004,
 0.4592760180995475,
 0.6904761904761905,
 0.5516304347826086)

##### 2번 함수 (이미지 + 설문 Feature vector 사용)

In [47]:
def predict_v2_1(user_id, item_id, threshold, k1, min_similarity1, min_similarity2, k2):
      '''
      위에 버전에서 min_similarity = 0.92에 대해 k1 = 1 인 아이템 찾음.
      나머지 아이템 들에 대해서는 min_similarity2, k2 = k2를 적용함.
       이게 현재 best

      user_id : 사용자 식별 ID
      item_id : 아이템 식별 ID
      threshold : 선호도 판단 기준
      k1: 높은 유사도 선택 개수
      k2 : 유사도 선택 개수
      min_similarity1 : 최소 유사도 값
      min_similarity2: min_similarity를 만족시키지 못한 아이템들의 차선 similarity
      rating_matrix : 유틸리티 행렬
      sim_df : feature 유사도 행렬
          '''
      user_ratings = rating_matrix.loc[user_id].dropna()
      similarities = sim_total[item_id].loc[user_ratings.index]

      valid_indices_pri = similarities[similarities >= min_similarity1].nlargest(k1).index
      valid_indices = similarities[similarities >= min_similarity2].nlargest(k2).index

      #유사도가 0.92 이상인게 있으면,k1 = 1 사용
      if len(valid_indices_pri) > 0:
          # 유사도 0.92 이상을 만족하는 상위 1개 아이템의 유사도와 평점을 사용, 정확도 매우 좋음
          filtered_similarities = similarities[valid_indices_pri]
          filtered_ratings = user_ratings[valid_indices_pri]


      #유사도가 0.92이상인게 없을 때,min_similarity2 기준으로 상위 k개 선택
      elif len(valid_indices)>0:

          filtered_similarities = similarities[valid_indices]
          filtered_ratings = user_ratings[valid_indices]
          weighted_sum = np.dot(filtered_similarities, filtered_ratings)
          prediction = weighted_sum / filtered_similarities.sum()

      else:
          # 유사도가 충분하지 않아 예측 불가
          filtered_ratings = user_ratings.values
          filtered_similarities = similarities


      weighted_sum = np.dot(filtered_similarities, filtered_ratings)
      prediction = weighted_sum / filtered_similarities.sum()

      # 임계값 기준으로 이진 분류 (-1 또는 1)

      return 1 if prediction >= threshold else -1

predict_and_evaluate(predict_v2_1, rating_matrix, sim_total, valid_df_t, threshold= -0.1, k1=1, min_similarity1 = 0.91, min_similarity2 = 0.47, k2 = 23)

Confusion Matrix:
      N    P
N  594   66
P   65  377
Accuracy: 0.8811


(0.8811252268602541,
 0.8529411764705882,
 0.8510158013544018,
 0.8519774011299435)

###### Grid Search 최적화

In [None]:
min_1 = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98]
threshold = [-0.1, 0, 0.1]
# 최적의 Accuracy와 하이퍼파라미터를 저장할 변수 초기화
best_f1 = 0
best_acc = 0
best_k2 = None
best_threshold = None

for threshold in threshold:
    for min_similarity1 in min_1:
      # 성능 평가
      acc, rcs, pcs, f1 = predict_and_evaluate(predict_v2_1, rating_matrix,
                                sim_total, valid_df_t, threshold= threshold
                                , k1=1, min_similarity1 = min_similarity1, min_similarity2 = 0.6, k2 = 37)

      # 현재 accuracy가 최고값보다 높으면 업데이트
      if acc > best_acc:
          best_f1 = f1
          best_acc = acc
          best_threshold = threshold
          best_min1 = min_similarity1


# 최적의 threshold와 min_similarity출력
print(f"Best F1-Score: {best_f1}")
print(f"Best Accuracy: {best_acc}")
print(f"Best threshold: {best_threshold}")
print(f"Best min_similarity1: {best_min1}")

Accuracy: 0.8857
Accuracy: 0.8838
Accuracy: 0.8838
Accuracy: 0.8848
Accuracy: 0.8848
Accuracy: 0.8848
Accuracy: 0.8848
Accuracy: 0.8857
Accuracy: 0.8838
Accuracy: 0.8757
Accuracy: 0.8739
Accuracy: 0.8739
Accuracy: 0.8748
Accuracy: 0.8748
Accuracy: 0.8739
Accuracy: 0.8739
Accuracy: 0.8748
Accuracy: 0.8739
Accuracy: 0.8639
Accuracy: 0.8612
Accuracy: 0.8603
Accuracy: 0.8603
Accuracy: 0.8603
Accuracy: 0.8593
Accuracy: 0.8603
Accuracy: 0.8612
Accuracy: 0.8603
Best F1-Score: 0.8393063583815029
Best Accuracy: 0.8738656987295825
Best threshold: -0.1
Best min_similarity1: 0.9


In [None]:
min_2 = [0.5,0.55,0.6,0.65,0.7,0.75]
k2_values = [10,20,30,40,50]
threshold = [-0.1,0,0.1]
# 최적의 Accuracy와 하이퍼파라미터를 저장할 변수 초기화
best_f1 = 0
best_acc = 0
best_k2 = None
best_threshold = None

for threshold in threshold:
  for k2 in k2_values:
      for min_similarity2 in min_2:
        # 성능 평가
        acc, rcs, pcs, f1 = predict_and_evaluate(predict_v2_1, rating_matrix,
                                  sim_total, valid_df_t, threshold= threshold
                                  , k1=1, min_similarity1 = 0.9, min_similarity2 = min_similarity2, k2 = k2)

        # 현재 Accuracy가 최고값보다 높으면 업데이트
        if acc > best_acc:
            best_f1 = f1
            best_acc = acc
            best_k2 = k2
            best_min2 = min_similarity2
            best_threshold = threshold


# 최적의 min_similarity2, k, threshold 출력
print(f"Best F1-Score: {best_f1}")
print(f"Best Accuracy: {best_acc}")
print(f"Best k2: {best_k2}")
print(f"Best min_similarity2: {best_min2}")
print(f"Best threshold: {best_threshold}")

Confusion Matrix:
      N    P
N  595   65
P   64  378
Accuracy: 0.8829
Recall: 0.8552
Precision: 0.8533
F1-Score: 0.8542
Confusion Matrix:
      N    P
N  596   64
P   61  381
Accuracy: 0.8866
Recall: 0.8620
Precision: 0.8562
F1-Score: 0.8591
Confusion Matrix:
      N    P
N  595   65
P   64  378
Accuracy: 0.8829
Recall: 0.8552
Precision: 0.8533
F1-Score: 0.8542
Confusion Matrix:
      N    P
N  589   71
P   63  379
Accuracy: 0.8784
Recall: 0.8575
Precision: 0.8422
F1-Score: 0.8498
Confusion Matrix:
      N    P
N  589   71
P   63  379
Accuracy: 0.8784
Recall: 0.8575
Precision: 0.8422
F1-Score: 0.8498
Confusion Matrix:
      N    P
N  584   76
P   66  376
Accuracy: 0.8711
Recall: 0.8507
Precision: 0.8319
F1-Score: 0.8412
Confusion Matrix:
      N    P
N  597   63
P   63  379
Accuracy: 0.8857
Recall: 0.8575
Precision: 0.8575
F1-Score: 0.8575
Confusion Matrix:
      N    P
N  597   63
P   57  385
Accuracy: 0.8911
Recall: 0.8710
Precision: 0.8594
F1-Score: 0.8652
Confusion Matrix:
      

In [None]:
min_2 = [0.45,0.46,0.47,0.48,0.49,0.5,0.51,0.52,0.53,0.54,0.55,0.56,0.57,0.58,0.59,0.6,0.61,0.62,0.63,0.64,0.65]
k2_values = [10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]
thresholds = [-0.1,0,0.1]

# 최적의 Accuracy와 하이퍼파라미터를 저장할 변수 초기화
best_f1 = 0
best_acc = 0
best_k2 = None
best_threshold = None

for threshold in thresholds:
  for k2 in k2_values:
      for min_similarity2 in min_2:
        # 성능 평가
        acc, rcs, pcs, f1 = predict_and_evaluate(predict_v2_1, rating_matrix,
                                  sim_total, valid_df_t, threshold= threshold
                                  , k1=1, min_similarity1 = 0.91, min_similarity2 = min_similarity2, k2 = k2)

        # 현재 accuracy가 최고값보다 높으면 업데이트
        if acc > best_acc:
            best_f1 = f1
            best_acc = acc
            best_k2 = k2
            best_min2 = min_similarity2
            best_threshold = threshold


# 최적의 threshold, k, min_similarity 출력
print(f"Best F1-Score: {best_f1}")
print(f"Best Accuracy: {best_acc}")
print(f"Best k2: {best_k2}")
print(f"Best min_similarity2: {best_min2}")
print(f"Best threshold: {best_threshold}")

Accuracy: 0.8811
Accuracy: 0.8820
Accuracy: 0.8802
Accuracy: 0.8793
Accuracy: 0.8802
Accuracy: 0.8811
Accuracy: 0.8811
Accuracy: 0.8802
Accuracy: 0.8829
Accuracy: 0.8838
Accuracy: 0.8848
Accuracy: 0.8820
Accuracy: 0.8793
Accuracy: 0.8775
Accuracy: 0.8802
Accuracy: 0.8811
Accuracy: 0.8793
Accuracy: 0.8730
Accuracy: 0.8748
Accuracy: 0.8730
Accuracy: 0.8766
Accuracy: 0.8838
Accuracy: 0.8848
Accuracy: 0.8829
Accuracy: 0.8829
Accuracy: 0.8829
Accuracy: 0.8857
Accuracy: 0.8857
Accuracy: 0.8838
Accuracy: 0.8857
Accuracy: 0.8866
Accuracy: 0.8875
Accuracy: 0.8848
Accuracy: 0.8820
Accuracy: 0.8802
Accuracy: 0.8802
Accuracy: 0.8811
Accuracy: 0.8793
Accuracy: 0.8721
Accuracy: 0.8739
Accuracy: 0.8721
Accuracy: 0.8757
Accuracy: 0.8784
Accuracy: 0.8802
Accuracy: 0.8784
Accuracy: 0.8793
Accuracy: 0.8793
Accuracy: 0.8820
Accuracy: 0.8829
Accuracy: 0.8811
Accuracy: 0.8838
Accuracy: 0.8848
Accuracy: 0.8857
Accuracy: 0.8838
Accuracy: 0.8820
Accuracy: 0.8802
Accuracy: 0.8802
Accuracy: 0.8811
Accuracy: 0.87

### **Valid 설문 결과 사용한 최적 모델 성능**

In [48]:
predict_and_evaluate(predict_v2_1, rating_matrix, sim_total, valid_df_t, threshold= -0.1, k1=1, min_similarity1 = 0.91, min_similarity2 = 0.55, k2 = 13)

Confusion Matrix:
      N    P
N  596   64
P   55  387
Accuracy: 0.8920


(0.8920145190562614,
 0.8755656108597285,
 0.8580931263858093,
 0.8667413213885778)