In [12]:
import pandas as pd
import json

# user.json 파일 읽기
with open('./user.json', 'r', encoding='utf-8') as file:
    user_data = json.load(file)
    user_ids = [user['userId'] for user in user_data]

# api.json 파일 읽기
with open('./api.json', 'r', encoding='utf-8') as file:
    api_data = json.load(file)
    restaurant_ids = [restaurant['id'] for restaurant in api_data['content']]

# 유저-레스토랑 평점 매트릭스 생성 (초기값은 NaN으로 설정)
df = pd.DataFrame(index=user_ids, columns=restaurant_ids)

# df 출력하여 구조 확인
print(df)


       19   14   20   12   26
2001  NaN  NaN  NaN  NaN  NaN
2002  NaN  NaN  NaN  NaN  NaN
2003  NaN  NaN  NaN  NaN  NaN
2004  NaN  NaN  NaN  NaN  NaN
2005  NaN  NaN  NaN  NaN  NaN
2006  NaN  NaN  NaN  NaN  NaN
2007  NaN  NaN  NaN  NaN  NaN


In [19]:
import pandas as pd
import json

# JSON 파일 읽기
with open('./reviewlog.json', 'r', encoding='utf-8') as file:
    review_data = json.load(file)

# 데이터프레임 생성
# 중첩된 구조에서 필요한 필드를 분리하여 DataFrame으로 만듦
review_df = pd.DataFrame([{
    '리뷰 ID': review['id'],
    '작성일시': review['createdAt'],
    '평점': review['rating'],
    '리뷰 내용': review['content'],
    '레스토랑 ID': review['restaurantId'],
    '작성자 ID': review['author']['id'],
    '작성자 이름': review['author']['username'],
    '작성자 이메일': review['author']['email']
} for review in review_data])

# 작성일시를 datetime 형식으로 변환
review_df['작성일시'] = pd.to_datetime(review_df['작성일시'])

review_df

Unnamed: 0,리뷰 ID,작성일시,평점,리뷰 내용,레스토랑 ID,작성자 ID,작성자 이름,작성자 이메일
0,1,2024-10-27 15:30:00,4.5,The food was great and the service was excellent.,20,2001,user121,user123@example.com
1,2,2024-10-27 15:50:00,4.5,The food was great and the service was excellent.,19,2001,user122,user123@example.com
2,3,2024-10-27 15:30:00,1.5,The food was great and the service was excellent.,20,2002,user123,user123@example.com
3,4,2024-10-27 15:30:00,2.5,The food was great and the service was excellent.,20,2003,user124,user123@example.com
4,5,2024-10-27 15:30:00,3.5,The food was great and the service was excellent.,21,2004,user125,user123@example.com


In [20]:
# 유저별 전체 평점 합계와 개수 계산
user_total = review_df.groupby('작성자 ID')['평점'].agg(['sum', 'count']).rename(columns={'sum': 'user_total_rating', 'count': 'user_total_count'})

# 유저별 레스토랑별 평점 합계와 개수 계산
user_restaurant_total = review_df.groupby(['작성자 ID', '레스토랑 ID'])['평점'].agg(['sum', 'count']).rename(columns={'sum': 'user_restaurant_rating', 'count': 'user_restaurant_count'})

# review_df에 merge
review_df = review_df.merge(user_total, on='작성자 ID')
review_df = review_df.merge(user_restaurant_total, on=['작성자 ID', '레스토랑 ID'])

# 이 유저의 다른 레스토랑에 매기는 리뷰 평점의 평균 계산
def calculate_user_other_restaurants_avg(row):
    total_rating = row['user_total_rating'] - row['user_restaurant_rating']
    total_count = row['user_total_count'] - row['user_restaurant_count']
    if total_count > 0:
        return total_rating / total_count
    else:
        return None  # 또는 np.nan

review_df['유저_다른_레스토랑_평균'] = review_df.apply(calculate_user_other_restaurants_avg, axis=1)
# 유저별 레스토랑별 최대 평점 계산
user_restaurant_max = review_df.groupby(['작성자 ID', '레스토랑 ID'])['평점'].max().rename('유저_레스토랑_최고_평점')

# review_df에 merge
review_df = review_df.merge(user_restaurant_max, on=['작성자 ID', '레스토랑 ID'])
from datetime import datetime

# 유저별 레스토랑별 가장 최근 리뷰 날짜 계산
user_restaurant_latest = review_df.groupby(['작성자 ID', '레스토랑 ID'])['작성일시'].max().rename('유저_레스토랑_최근_리뷰일')

# review_df에 merge
review_df = review_df.merge(user_restaurant_latest, on=['작성자 ID', '레스토랑 ID'])

# 현재 시간과의 차이 계산
current_time = datetime.now()
review_df['유저_레스토랑_최근_리뷰_경과시간(일)'] = (current_time - review_df['유저_레스토랑_최근_리뷰일']).dt.total_seconds() / (60 * 60 * 24)
# 이미 계산된 'user_restaurant_count' 컬럼 사용
review_df['유저_레스토랑_리뷰_개수'] = review_df['user_restaurant_count']
#작성자 ID와 레스토랑 ID를 기준으로 가장 최근의 행만 남김
review_df = review_df.sort_values('작성일시', ascending=False).drop_duplicates(subset=['작성자 ID', '레스토랑 ID'])

review_df

Unnamed: 0,리뷰 ID,작성일시,평점,리뷰 내용,레스토랑 ID,작성자 ID,작성자 이름,작성자 이메일,user_total_rating,user_total_count,user_restaurant_rating,user_restaurant_count,유저_다른_레스토랑_평균,유저_레스토랑_최고_평점,유저_레스토랑_최근_리뷰일,유저_레스토랑_최근_리뷰_경과시간(일),유저_레스토랑_리뷰_개수
1,2,2024-10-27 15:50:00,4.5,The food was great and the service was excellent.,19,2001,user122,user123@example.com,9.0,2,4.5,1,4.5,4.5,2024-10-27 15:50:00,5.321111,1
0,1,2024-10-27 15:30:00,4.5,The food was great and the service was excellent.,20,2001,user121,user123@example.com,9.0,2,4.5,1,4.5,4.5,2024-10-27 15:30:00,5.335,1
2,3,2024-10-27 15:30:00,1.5,The food was great and the service was excellent.,20,2002,user123,user123@example.com,1.5,1,1.5,1,,1.5,2024-10-27 15:30:00,5.335,1
3,4,2024-10-27 15:30:00,2.5,The food was great and the service was excellent.,20,2003,user124,user123@example.com,2.5,1,2.5,1,,2.5,2024-10-27 15:30:00,5.335,1
4,5,2024-10-27 15:30:00,3.5,The food was great and the service was excellent.,21,2004,user125,user123@example.com,3.5,1,3.5,1,,3.5,2024-10-27 15:30:00,5.335,1


In [14]:
# review_df의 작성자 ID와 레스토랑 ID를 기준으로 평점 값을 df에 입력
for _, row in review_df.iterrows():
    user_id = row['작성자 ID']
    restaurant_id = row['레스토랑 ID']
    rating = row['평점']
    
    # 해당 user_id와 restaurant_id가 df에 있는지 확인하고 평점 입력
    if user_id in df.index and restaurant_id in df.columns:
        df.at[user_id, restaurant_id] = rating


df

Unnamed: 0,19,14,20,12,26
2001,4.5,,4.5,,
2002,,,1.5,,
2003,,,2.5,,
2004,,,,,
2005,,,,,
2006,,,,,
2007,,,,,


In [5]:
import numpy as np
import pandas as pd

# 설정: 잠재 요인 수, 학습률, 정규화 파라미터, 반복 횟수
latent_features = 100
learning_rate = 0.01
regularization = 0.1
iterations = 100

# 데이터 준비: 결측값을 0으로 대체하고 MF 적용
R = df.fillna(0).values
num_users, num_items = R.shape

# 유저 및 아이템 잠재 요인 행렬 초기화
P = np.random.normal(scale=1./latent_features, size=(num_users, latent_features))
Q = np.random.normal(scale=1./latent_features, size=(num_items, latent_features))

# 손실 함수를 계산하는 함수
def calculate_loss(R, P, Q, non_zero_indices, regularization):
    loss = 0
    for i, j in non_zero_indices:
        # 현재 평점과 예측 평점의 차이 계산
        error = R[i, j] - np.dot(P[i, :], Q[j, :].T)
        loss += error ** 2 + regularization * (np.linalg.norm(P[i, :]) + np.linalg.norm(Q[j, :]))
    return loss

# 실제 평점이 있는 인덱스 추출 (NaN이 아닌 값)
non_zero_indices = [(i, j) for i in range(num_users) for j in range(num_items) if R[i, j] > 0]

# Matrix Factorization 학습 과정
for iteration in range(iterations):
    for i, j in non_zero_indices:
        # 평점 예측 및 오류 계산
        error = R[i, j] - np.dot(P[i, :], Q[j, :].T)

        # 유저 및 아이템 잠재 요인 업데이트
        P[i, :] += learning_rate * (error * Q[j, :] - regularization * P[i, :])
        Q[j, :] += learning_rate * (error * P[i, :] - regularization * Q[j, :])

    # 매 반복 후 손실 계산
    if (iteration + 1) % 10 == 0:
        loss = calculate_loss(R, P, Q, non_zero_indices, regularization)
        print(f"Iteration: {iteration + 1}, Loss: {loss:.4f}")

# 최종 예측 평점 행렬 계산
predicted_ratings = np.dot(P, Q.T)

# DataFrame으로 변환
predicted_df = pd.DataFrame(predicted_ratings, index=df.index, columns=df.columns)
print(predicted_df)


Iteration: 10, Loss: 48.8638
Iteration: 20, Loss: 48.2878
Iteration: 30, Loss: 46.2686
Iteration: 40, Loss: 39.8269
Iteration: 50, Loss: 25.1847
Iteration: 60, Loss: 9.4049
Iteration: 70, Loss: 3.2460
Iteration: 80, Loss: 1.9871
Iteration: 90, Loss: 1.6784
Iteration: 100, Loss: 1.5650
            19        14        20        12        26
2001  4.314974 -0.035376  4.475921 -0.000149  0.041042
2002  1.223669 -0.009266  1.323913 -0.001669  0.010904
2003  2.048223 -0.020503  2.212613 -0.003268  0.018667
2004  0.006930  0.000545  0.002651  0.000200 -0.000150
2005 -0.003299 -0.000345 -0.004147 -0.000680  0.000879
2006 -0.014034  0.000305 -0.017789 -0.000438 -0.001244
2007  0.006469  0.000132  0.002881  0.000834  0.000408


In [10]:
import numpy as np
import pandas as pd
from scipy.sparse.linalg import svds

# 결측값을 채운 R 행렬 생성 (평균 중심화)
R = df.fillna(0).values
user_ratings_mean = np.mean(R, axis=1)  # 유저별 평균
R_demeaned = R - user_ratings_mean.reshape(-1, 1)

# R 행렬의 크기에 따라 잠재 요인의 개수 설정
latent_features = min(10, min(R_demeaned.shape) - 1)  # 행렬의 최소 차원보다 작게 설정
U, sigma, Vt = svds(R_demeaned, k=latent_features)

# sigma를 대각 행렬로 변환
sigma = np.diag(sigma)

# 원본 행렬 예측 (평균을 더해 복원)
predicted_ratings = np.dot(np.dot(U, sigma), Vt) + user_ratings_mean.reshape(-1, 1)

# DataFrame으로 변환하여 결측값 채운 결과 확인
predicted_df = pd.DataFrame(predicted_ratings, index=df.index, columns=df.columns)
print(predicted_df)


                19            14   20            12            26
2001  4.500000e+00  1.110223e-15  4.5  2.220446e-16  8.881784e-16
2002 -4.440892e-16  2.775558e-16  1.5  5.551115e-17  1.665335e-16
2003 -8.881784e-16  7.216450e-16  2.5  3.885781e-16  5.551115e-16
2004  0.000000e+00  0.000000e+00  0.0  0.000000e+00  0.000000e+00
2005  0.000000e+00  0.000000e+00  0.0  0.000000e+00  0.000000e+00
2006  0.000000e+00  0.000000e+00  0.0  0.000000e+00  0.000000e+00
2007  0.000000e+00  0.000000e+00  0.0  0.000000e+00  0.000000e+00


In [None]:
# hybrid_predicted_df를 CSV 파일로 저장
file_path = "mf_predicted_df.csv"
predicted_df.to_csv(file_path, index=True)  # 인덱스 포함하여 저장

print(f"DataFrame이 로컬에 '{file_path}' 파일로 저장되었습니다.")

In [11]:
!pip install git+https://github.com/gbolmier/funk-svd

Collecting git+https://github.com/gbolmier/funk-svd
  Cloning https://github.com/gbolmier/funk-svd to c:\users\leeyoonseo\appdata\local\temp\pip-req-build-xkyjqil0
  Resolved https://github.com/gbolmier/funk-svd to commit fecc38ea1c2859ef6a6d9af0b7f953e1b693764e
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: funk-svd
  Building wheel for funk-svd (setup.py): started
  Building wheel for funk-svd (setup.py): finished with status 'done'
  Created wheel for funk-svd: filename=funk_svd-0.0.1.dev1-py3-none-any.whl size=9111 sha256=f7c6f6c85c058408c6a0efeef37b5d9fb7c1ae02caeb8b3a34324f3243add565
  Stored in directory: C:\Users\leeyoonseo\AppData\Local\Temp\pip-ephem-wheel-cache-_vni3ipy\wheels\f8\93\18\db4114b3fafc2eb9a319db1e3b3c3465db51d1fdc1d4f2e769
Successfully built funk-svd
Installing collected packages: funk-svd
Successfully installed funk-svd-0.0.1.dev1


  Running command git clone --filter=blob:none --quiet https://github.com/gbolmier/funk-svd 'C:\Users\leeyoonseo\AppData\Local\Temp\pip-req-build-xkyjqil0'


In [16]:
import pandas as pd
from funk_svd import SVD
from IPython.display import display


In [17]:
df

Unnamed: 0,19,14,20,12,26
2001,4.5,,4.5,,
2002,,,1.5,,
2003,,,2.5,,
2004,,,,,
2005,,,,,
2006,,,,,
2007,,,,,


In [18]:
import numpy as np
import pandas as pd

# 설정: 잠재 요인 수, 초기 학습률, 정규화 파라미터, 반복 횟수
latent_features = 10
initial_learning_rate = 0.01
regularization = 0.1
iterations = 100

# 유저-아이템 평점 행렬 R 생성
R = df.values  # 유저-아이템 평점 행렬 (결측값 포함)
num_users, num_items = R.shape

# 유저와 아이템 잠재 요인 행렬 초기화
P = np.random.normal(scale=1./latent_features, size=(num_users, latent_features))
Q = np.random.normal(scale=1./latent_features, size=(num_items, latent_features))

# FunkSVD 학습 과정
learning_rate = initial_learning_rate
for iteration in range(iterations):
    for i in range(num_users):
        for j in range(num_items):
            if not np.isnan(R[i, j]):  # 실제 평점이 있는 경우에만 업데이트
                # 예측 평점 및 오류 계산
                prediction = np.dot(P[i, :], Q[j, :].T)
                error = R[i, j] - prediction
                
                # 유저 및 아이템 잠재 요인 업데이트 (정규화 포함)
                P[i, :] += learning_rate * (error * Q[j, :] - regularization * P[i, :])
                Q[j, :] += learning_rate * (error * P[i, :] - regularization * Q[j, :])
                
    # 매 반복 후 손실 계산
    if (iteration + 1) % 10 == 0:
        # 손실 계산 (평점 예측 오류와 정규화 항)
        loss = 0
        for i in range(num_users):
            for j in range(num_items):
                if not np.isnan(R[i, j]):
                    prediction = np.dot(P[i, :], Q[j, :].T)
                    loss += (R[i, j] - prediction) ** 2 + regularization * (np.linalg.norm(P[i, :]) + np.linalg.norm(Q[j, :]))
        print(f"Iteration: {iteration + 1}, Loss: {loss:.4f}")
    
    # 학습률 점진적 감소
    learning_rate *= 0.99

# 예측 평점 행렬 계산
predicted_ratings = np.dot(P, Q.T)

# DataFrame으로 변환하여 결측값 채운 결과 확인
predicted_df = pd.DataFrame(predicted_ratings, index=df.index, columns=df.columns)
print(predicted_df)


Iteration: 10, Loss: 47.8755
Iteration: 20, Loss: 44.4710
Iteration: 30, Loss: 37.7098
Iteration: 40, Loss: 27.0800
Iteration: 50, Loss: 15.8719
Iteration: 60, Loss: 8.1952
Iteration: 70, Loss: 4.4319
Iteration: 80, Loss: 2.8652
Iteration: 90, Loss: 2.2182
Iteration: 100, Loss: 1.9275
            19        14        20        12        26
2001  4.043158 -0.361052  4.378451  0.415411 -0.480454
2002  0.852506 -0.149965  1.206265  0.199948 -0.093282
2003  1.531950 -0.214874  2.035679  0.237769 -0.171903
2004  0.007699 -0.031303  0.031790  0.042058  0.040064
2005 -0.265001  0.024583 -0.153549  0.022368  0.062933
2006 -0.006105  0.021643  0.027866  0.026825 -0.004399
2007  0.101468  0.026805 -0.081309 -0.102576 -0.041903
