In [1]:
import pandas as pd
import json

# JSON 파일 읽기
with open('./kakaolog.json', 'r', encoding='utf-8') as file:
#with open('./reviewlog.json', 'r', encoding='utf-8') as file:
    review_data = json.load(file)

with open('./kakaoapi.json', 'r', encoding='utf-8') as file:
#with open('./api.json', 'r', encoding='utf-8') as file:
    api_data = json.load(file)

# visit_data가 단일 객체가 아닌 리스트인 경우를 대비
if isinstance(review_data, dict):
    review_data = [review_data]

# 필요한 데이터를 추출하기 위해 딕셔너리 형태로 변환 (restaurantId를 키로 설정)
api_content = {item['id']: item for item in api_data['content']}

# 데이터프레임 생성 준비
final_data = []

for review in review_data:
    rest_id = review['restaurantId']
    
    # 해당 restaurantId에 대한 apilog 데이터 가져오기
    if rest_id in api_content:
        api_entry = api_content[rest_id]
        
        # 필요한 데이터 추출
        user_rating = api_entry.get('userRating')
        travel_time = api_entry.get('travelTime')
        rating = api_entry.get('rating')
        
        # 메뉴 최대 3개 추출
        menu_items = api_entry.get('menu', [])
        menu1 = menu_items[0] if len(menu_items) > 0 else None
        menu2 = menu_items[1] if len(menu_items) > 1 else None
        menu3 = menu_items[2] if len(menu_items) > 2 else None
        
        # 카테고리 최대 2개 id 추출
        category_ids = [cat['id'] for cat in api_entry.get('categories', [])]
        category_id1 = category_ids[0] if len(category_ids) > 0 else None
        category_id2 = category_ids[1] if len(category_ids) > 1 else None
        
        # visit 정보와 결합
        final_data.append({
            '작성자 ID': review['author']['id'],
            '레스토랑 ID': rest_id,
            '작성일시': review['createdAt'],
            'reviewRating': review['rating'],
            'reviewText': review['content'],
            'userRating': user_rating,
            'travelTime': travel_time,
            'rating': rating,
            'store_menu1': menu1,
            'store_menu2': menu2,
            'store_menu3': menu3,
            'food_category_id1': category_id1,
            'food_category_id2': category_id2
        })

# 데이터프레임 생성
review_df = pd.DataFrame(final_data)

# 결과 출력
review_df


Unnamed: 0,작성자 ID,레스토랑 ID,작성일시,reviewRating,reviewText,userRating,travelTime,rating,store_menu1,store_menu2,store_menu3,food_category_id1,food_category_id2
0,2001,1,2024-10-14T15:30:00,4.0,평범한 요즘수제버거 맛 파이브가이브까진 절대아님,4.6,0.4,4.6,버거,,,1,
1,2002,1,2024-10-03T15:50:00,3.0,이제는 다른 버거랑 가격차이가 크게 없어서 굳이 먹을 이유가 없어짐. 맛은 없지는 ...,4.6,0.4,4.6,버거,,,1,
2,2003,1,2024-08-25T15:50:00,5.0,여전히 이가격에 여기만한 버거 없어요 파이브가이즈 가지마시고 여기서 배터지게 드세요,4.6,0.4,4.6,버거,,,1,
3,2004,1,2024-08-01T15:50:00,1.0,요즘 초심을 잃은듯... 패티가 점점 얇아지내요...평점은 3점정도 입니다~,4.6,0.4,4.6,버거,,,1,
4,2002,2,2024-10-30T15:50:00,1.0,,2.5,1.5,2.5,피자,,,3,
5,2006,3,2024-11-01T15:50:00,5.0,The food was great and the service was excellent.,4.0,1.9,4.0,오뎅탕,국물닭발,,2,
6,2007,3,2024-10-27T15:50:00,5.0,,4.0,1.9,4.0,오뎅탕,국물닭발,,2,


In [2]:
# 유저별 전체 평점 합계와 개수 계산
user_total = review_df.groupby('작성자 ID')['reviewRating'].agg(['sum', 'count']).rename(columns={'sum': 'user_total_rating', 'count': 'user_total_count'})

# 유저별 레스토랑별 평점 합계와 개수 계산
user_restaurant_total = review_df.groupby(['작성자 ID', '레스토랑 ID'])['reviewRating'].agg(['sum', 'count']).rename(columns={'sum': 'user_restaurant_rating', 'count': 'user_restaurant_count'})

# review_df에 merge
review_df = review_df.merge(user_total, on='작성자 ID')
review_df = review_df.merge(user_restaurant_total, on=['작성자 ID', '레스토랑 ID'])

# 이 유저의 다른 레스토랑에 매기는 리뷰 평점의 평균 계산
def calculate_user_other_restaurants_avg(row):
    total_rating = row['user_total_rating'] - row['user_restaurant_rating']
    total_count = row['user_total_count'] - row['user_restaurant_count']
    if total_count > 0:
        return total_rating / total_count
    else:
        return None  # 또는 np.nan

review_df['유저_다른_레스토랑_평균'] = review_df.apply(calculate_user_other_restaurants_avg, axis=1)
# 유저별 레스토랑별 최대 평점 계산
user_restaurant_max = review_df.groupby(['작성자 ID', '레스토랑 ID'])['reviewRating'].max().rename('유저_레스토랑_최고_평점')

# review_df에 merge
review_df = review_df.merge(user_restaurant_max, on=['작성자 ID', '레스토랑 ID'])
from datetime import datetime

# 유저별 레스토랑별 가장 최근 리뷰 날짜 계산
user_restaurant_latest = review_df.groupby(['작성자 ID', '레스토랑 ID'])['작성일시'].max().rename('유저_레스토랑_최근_리뷰일')

# review_df에 merge
review_df = review_df.merge(user_restaurant_latest, on=['작성자 ID', '레스토랑 ID'])

# '유저_레스토랑_최근_리뷰일'을 datetime 형식으로 변환
review_df['유저_레스토랑_최근_리뷰일'] = pd.to_datetime(review_df['유저_레스토랑_최근_리뷰일'])

# 현재 시간과의 차이 계산
current_time = datetime.now()
review_df['유저_레스토랑_최근_리뷰_경과시간(일)'] = (current_time - review_df['유저_레스토랑_최근_리뷰일']).dt.total_seconds() / (60 * 60 * 24)



review_df

Unnamed: 0,작성자 ID,레스토랑 ID,작성일시,reviewRating,reviewText,userRating,travelTime,rating,store_menu1,store_menu2,...,food_category_id1,food_category_id2,user_total_rating,user_total_count,user_restaurant_rating,user_restaurant_count,유저_다른_레스토랑_평균,유저_레스토랑_최고_평점,유저_레스토랑_최근_리뷰일,유저_레스토랑_최근_리뷰_경과시간(일)
0,2001,1,2024-10-14T15:30:00,4.0,평범한 요즘수제버거 맛 파이브가이브까진 절대아님,4.6,0.4,4.6,버거,,...,1,,4.0,1,4.0,1,,4.0,2024-10-14 15:30:00,22.894341
1,2002,1,2024-10-03T15:50:00,3.0,이제는 다른 버거랑 가격차이가 크게 없어서 굳이 먹을 이유가 없어짐. 맛은 없지는 ...,4.6,0.4,4.6,버거,,...,1,,4.0,2,3.0,1,1.0,3.0,2024-10-03 15:50:00,33.880452
2,2003,1,2024-08-25T15:50:00,5.0,여전히 이가격에 여기만한 버거 없어요 파이브가이즈 가지마시고 여기서 배터지게 드세요,4.6,0.4,4.6,버거,,...,1,,5.0,1,5.0,1,,5.0,2024-08-25 15:50:00,72.880452
3,2004,1,2024-08-01T15:50:00,1.0,요즘 초심을 잃은듯... 패티가 점점 얇아지내요...평점은 3점정도 입니다~,4.6,0.4,4.6,버거,,...,1,,1.0,1,1.0,1,,1.0,2024-08-01 15:50:00,96.880452
4,2002,2,2024-10-30T15:50:00,1.0,,2.5,1.5,2.5,피자,,...,3,,4.0,2,1.0,1,3.0,1.0,2024-10-30 15:50:00,6.880452
5,2006,3,2024-11-01T15:50:00,5.0,The food was great and the service was excellent.,4.0,1.9,4.0,오뎅탕,국물닭발,...,2,,5.0,1,5.0,1,,5.0,2024-11-01 15:50:00,4.880452
6,2007,3,2024-10-27T15:50:00,5.0,,4.0,1.9,4.0,오뎅탕,국물닭발,...,2,,5.0,1,5.0,1,,5.0,2024-10-27 15:50:00,9.880452


In [3]:
import pandas as pd
import numpy as np

import pandas as pd
import json

# user.json 파일 읽기
with open('./kakaouser.json', 'r', encoding='utf-8') as file:
    user_data = json.load(file)
    user_ids = [user['userId'] for user in user_data]

# user_ids에 있는 아이디가 review_df에 없다면 결측치 행을 추가
missing_user_ids = [user_id for user_id in user_ids if user_id not in review_df['작성자 ID'].unique()]

# 결측치 행을 추가하기 위한 DataFrame 생성
missing_rows = pd.DataFrame({
    '작성자 ID': missing_user_ids,
    **{col: np.nan for col in review_df.columns if col != '작성자 ID'}
})

# review_df에 결측치 행 추가 및 작성자 ID로 정렬
review_df = pd.concat([review_df, missing_rows], ignore_index=True)
review_df = review_df.sort_values(by='작성자 ID').reset_index(drop=True)


review_df


  review_df = pd.concat([review_df, missing_rows], ignore_index=True)


Unnamed: 0,작성자 ID,레스토랑 ID,작성일시,reviewRating,reviewText,userRating,travelTime,rating,store_menu1,store_menu2,...,food_category_id1,food_category_id2,user_total_rating,user_total_count,user_restaurant_rating,user_restaurant_count,유저_다른_레스토랑_평균,유저_레스토랑_최고_평점,유저_레스토랑_최근_리뷰일,유저_레스토랑_최근_리뷰_경과시간(일)
0,2001.0,1.0,2024-10-14T15:30:00,4.0,평범한 요즘수제버거 맛 파이브가이브까진 절대아님,4.6,0.4,4.6,버거,,...,1.0,,4.0,1.0,4.0,1.0,,4.0,2024-10-14 15:30:00,22.894341
1,2002.0,1.0,2024-10-03T15:50:00,3.0,이제는 다른 버거랑 가격차이가 크게 없어서 굳이 먹을 이유가 없어짐. 맛은 없지는 ...,4.6,0.4,4.6,버거,,...,1.0,,4.0,2.0,3.0,1.0,1.0,3.0,2024-10-03 15:50:00,33.880452
2,2002.0,2.0,2024-10-30T15:50:00,1.0,,2.5,1.5,2.5,피자,,...,3.0,,4.0,2.0,1.0,1.0,3.0,1.0,2024-10-30 15:50:00,6.880452
3,2003.0,1.0,2024-08-25T15:50:00,5.0,여전히 이가격에 여기만한 버거 없어요 파이브가이즈 가지마시고 여기서 배터지게 드세요,4.6,0.4,4.6,버거,,...,1.0,,5.0,1.0,5.0,1.0,,5.0,2024-08-25 15:50:00,72.880452
4,2004.0,1.0,2024-08-01T15:50:00,1.0,요즘 초심을 잃은듯... 패티가 점점 얇아지내요...평점은 3점정도 입니다~,4.6,0.4,4.6,버거,,...,1.0,,1.0,1.0,1.0,1.0,,1.0,2024-08-01 15:50:00,96.880452
5,2006.0,3.0,2024-11-01T15:50:00,5.0,The food was great and the service was excellent.,4.0,1.9,4.0,오뎅탕,국물닭발,...,2.0,,5.0,1.0,5.0,1.0,,5.0,2024-11-01 15:50:00,4.880452
6,2007.0,3.0,2024-10-27T15:50:00,5.0,,4.0,1.9,4.0,오뎅탕,국물닭발,...,2.0,,5.0,1.0,5.0,1.0,,5.0,2024-10-27 15:50:00,9.880452


In [4]:
import pandas as pd
import numpy as np
import json

# user.json 파일 읽기
with open('./kakaouser.json', 'r', encoding='utf-8') as file:
#with open('./user.json', 'r', encoding='utf-8') as file:
    user_data = json.load(file)
    user_ids = [user['userId'] for user in user_data]

# api.json 파일 읽기
with open('./kakaoapi.json', 'r', encoding='utf-8') as file:
#with open('./api.json', 'r', encoding='utf-8') as file:
    restaurant_data = json.load(file)
    # 'content' 키를 통해 레스토랑 리스트에 접근
    restaurant_ids = [restaurant['id'] for restaurant in restaurant_data['content']]

# 유저-아이템 모든 조합 생성
all_combinations = pd.MultiIndex.from_product([user_ids, restaurant_ids], names=['작성자 ID', '레스토랑 ID']).to_frame(index=False)

# 기존 review_df를 모든 조합에 맞춰 결합하고, 없는 조합은 결측치로 채우기
expanded_review_df = pd.merge(all_combinations, review_df, on=['작성자 ID', '레스토랑 ID'], how='left')

# 결과 출력
review_df = expanded_review_df.sort_values(by=['작성자 ID', '레스토랑 ID']).reset_index(drop=True)
review_df


Unnamed: 0,작성자 ID,레스토랑 ID,작성일시,reviewRating,reviewText,userRating,travelTime,rating,store_menu1,store_menu2,...,food_category_id1,food_category_id2,user_total_rating,user_total_count,user_restaurant_rating,user_restaurant_count,유저_다른_레스토랑_평균,유저_레스토랑_최고_평점,유저_레스토랑_최근_리뷰일,유저_레스토랑_최근_리뷰_경과시간(일)
0,2001,1,2024-10-14T15:30:00,4.0,평범한 요즘수제버거 맛 파이브가이브까진 절대아님,4.6,0.4,4.6,버거,,...,1.0,,4.0,1.0,4.0,1.0,,4.0,2024-10-14 15:30:00,22.894341
1,2001,2,,,,,,,,,...,,,,,,,,,NaT,
2,2001,3,,,,,,,,,...,,,,,,,,,NaT,
3,2002,1,2024-10-03T15:50:00,3.0,이제는 다른 버거랑 가격차이가 크게 없어서 굳이 먹을 이유가 없어짐. 맛은 없지는 ...,4.6,0.4,4.6,버거,,...,1.0,,4.0,2.0,3.0,1.0,1.0,3.0,2024-10-03 15:50:00,33.880452
4,2002,2,2024-10-30T15:50:00,1.0,,2.5,1.5,2.5,피자,,...,3.0,,4.0,2.0,1.0,1.0,3.0,1.0,2024-10-30 15:50:00,6.880452
5,2002,3,,,,,,,,,...,,,,,,,,,NaT,
6,2003,1,2024-08-25T15:50:00,5.0,여전히 이가격에 여기만한 버거 없어요 파이브가이즈 가지마시고 여기서 배터지게 드세요,4.6,0.4,4.6,버거,,...,1.0,,5.0,1.0,5.0,1.0,,5.0,2024-08-25 15:50:00,72.880452
7,2003,2,,,,,,,,,...,,,,,,,,,NaT,
8,2003,3,,,,,,,,,...,,,,,,,,,NaT,
9,2004,1,2024-08-01T15:50:00,1.0,요즘 초심을 잃은듯... 패티가 점점 얇아지내요...평점은 3점정도 입니다~,4.6,0.4,4.6,버거,,...,1.0,,1.0,1.0,1.0,1.0,,1.0,2024-08-01 15:50:00,96.880452


In [5]:
review_df.columns

Index(['작성자 ID', '레스토랑 ID', '작성일시', 'reviewRating', 'reviewText', 'userRating',
       'travelTime', 'rating', 'store_menu1', 'store_menu2', 'store_menu3',
       'food_category_id1', 'food_category_id2', 'user_total_rating',
       'user_total_count', 'user_restaurant_rating', 'user_restaurant_count',
       '유저_다른_레스토랑_평균', '유저_레스토랑_최고_평점', '유저_레스토랑_최근_리뷰일',
       '유저_레스토랑_최근_리뷰_경과시간(일)'],
      dtype='object')

In [6]:
review_df.dtypes

작성자 ID                             int64
레스토랑 ID                            int64
작성일시                              object
reviewRating                     float64
reviewText                        object
userRating                       float64
travelTime                       float64
rating                           float64
store_menu1                       object
store_menu2                       object
store_menu3                       object
food_category_id1                float64
food_category_id2                 object
user_total_rating                float64
user_total_count                 float64
user_restaurant_rating           float64
user_restaurant_count            float64
유저_다른_레스토랑_평균                    float64
유저_레스토랑_최고_평점                    float64
유저_레스토랑_최근_리뷰일            datetime64[ns]
유저_레스토랑_최근_리뷰_경과시간(일)            float64
dtype: object

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Assuming `review_df` is already loaded

# Fill NaNs in `reviewRating` with the mean rating
review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)

# Encode '작성자 ID' and '레스토랑 ID' as integer indices for embedding layers
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

review_df['user_idx'] = user_encoder.fit_transform(review_df['작성자 ID'])
review_df['item_idx'] = item_encoder.fit_transform(review_df['레스토랑 ID'])

num_users = review_df['user_idx'].nunique()
num_items = review_df['item_idx'].nunique()

# Process additional features
# Let's select some features to include
categorical_features = ['store_menu1', 'store_menu2', 'store_menu3', 'food_category_id1']
numerical_features = ['userRating', 'travelTime', 'rating', 'user_total_rating',
                      'user_total_count', 'user_restaurant_rating', 'user_restaurant_count',
                      '유저_다른_레스토랑_평균', '유저_레스토랑_최고_평점', '유저_레스토랑_최근_리뷰_경과시간(일)']

# Fill missing values for categorical features with 'Unknown' and numerical features with mean
for col in categorical_features:
    review_df[col] = review_df[col].fillna('Unknown')

for col in numerical_features:
    review_df[col] = review_df[col].fillna(review_df[col].mean())

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    review_df[col] = le.fit_transform(review_df[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = MinMaxScaler()
review_df[numerical_features] = scaler.fit_transform(review_df[numerical_features])

# Prepare the dataset
class RatingDataset(Dataset):
    def __init__(self, df):
        self.users = df['user_idx'].values
        self.items = df['item_idx'].values
        self.ratings = df['reviewRating'].values.astype(np.float32)
        self.cat_features = df[categorical_features].values.astype(np.int64)
        self.num_features = df[numerical_features].values.astype(np.float32)
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return (self.users[idx],
                self.items[idx],
                self.cat_features[idx],
                self.num_features[idx],
                self.ratings[idx])

# Split data
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)
train_dataset = RatingDataset(train_df)
test_dataset = RatingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# NCF Model with additional features
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, mlp_hidden_layers,
                 num_cat_features, num_cat_unique_values, num_num_features):
        super(NCF, self).__init__()
        # Embeddings for users and items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Embeddings for categorical features
        self.cat_embedding_layers = nn.ModuleList([
            nn.Embedding(num_unique, embedding_dim) for num_unique in num_cat_unique_values
        ])
        
        # Linear layer for numerical features
        self.num_linear = nn.Linear(num_num_features, embedding_dim)
        
        # MLP layers
        mlp_input_size = embedding_dim * (2 + len(num_cat_unique_values) + 1)  # user, item, cat features, num features
        mlp_layers = []
        for layer_size in mlp_hidden_layers:
            mlp_layers.append(nn.Linear(mlp_input_size, layer_size))
            mlp_layers.append(nn.ReLU())
            mlp_input_size = layer_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # Final prediction layer
        self.predict_layer = nn.Linear(mlp_hidden_layers[-1], 1)
        
    def forward(self, user_idx, item_idx, cat_features, num_features):
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        
        # Process categorical features
        cat_embs = []
        for i, emb_layer in enumerate(self.cat_embedding_layers):
            cat_emb = emb_layer(cat_features[:, i])
            cat_embs.append(cat_emb)
        
        # Process numerical features
        num_emb = self.num_linear(num_features)
        
        # Concatenate all embeddings
        x = torch.cat([user_emb, item_emb] + cat_embs + [num_emb], dim=-1)
        
        # Pass through MLP layers
        x = self.mlp(x)
        
        # Final prediction
        prediction = self.predict_layer(x)
        return prediction.squeeze()
    
# Prepare parameters for the model
embedding_dim = 32
mlp_hidden_layers = [128, 64, 32, 16]

# Get the number of unique values for each categorical feature
num_cat_unique_values = []
for col in categorical_features:
    num_unique = review_df[col].nunique()
    num_cat_unique_values.append(num_unique)

num_num_features = len(numerical_features)

# Instantiate the model
model = NCF(num_users, num_items, embedding_dim, mlp_hidden_layers,
            num_cat_features=len(categorical_features),
            num_cat_unique_values=num_cat_unique_values,
            num_num_features=num_num_features)

# Loss and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training function
def train_model(model, train_loader, criterion, optimizer, epochs=3000):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for user, item, cat_features, num_features, rating in train_loader:
            user = user.long()
            item = item.long()
            cat_features = cat_features.long()
            num_features = num_features.float()
            rating = rating.float()
            
            optimizer.zero_grad()
            prediction = model(user, item, cat_features, num_features)
            loss = criterion(prediction, rating)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# Evaluation function
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for user, item, cat_features, num_features, rating in test_loader:
            user = user.long()
            item = item.long()
            cat_features = cat_features.long()
            num_features = num_features.float()
            rating = rating.float()
            
            prediction = model(user, item, cat_features, num_features)
            loss = criterion(prediction, rating)
            total_loss += loss.item()
    avg_loss = total_loss / len(test_loader)
    print(f"Test Loss: {avg_loss:.4f}")

# Train and evaluate the model
train_model(model, train_loader, criterion, optimizer, epochs=300)
evaluate_model(model, test_loader, criterion)

# Generate the prediction matrix
def generate_prediction_matrix(model, num_users, num_items, cat_features, num_features):
    model.eval()
    prediction_matrix = np.zeros((num_users, num_items))
    with torch.no_grad():
        for user_idx in range(num_users):
            user_tensor = torch.tensor([user_idx] * num_items).long()
            item_tensor = torch.tensor(range(num_items)).long()
            cat_features_tensor = torch.tensor([cat_features] * num_items).long()
            num_features_tensor = torch.tensor([num_features] * num_items).float()
            prediction = model(user_tensor, item_tensor, cat_features_tensor, num_features_tensor)
            prediction_matrix[user_idx] = prediction.cpu().numpy()
    return prediction_matrix

# For simplicity, using mode for categorical and mean for numerical features in prediction
mean_cat_features = review_df[categorical_features].mode().iloc[0].values
mean_num_features = review_df[numerical_features].mean().values

predicted_ratings_matrix = generate_prediction_matrix(
    model,
    num_users,
    num_items,
    mean_cat_features,
    mean_num_features
)

predicted_ratings_df = pd.DataFrame(
    predicted_ratings_matrix, 
    index=user_encoder.inverse_transform(range(num_users)), 
    columns=item_encoder.inverse_transform(range(num_items))
)
predicted_ratings_df


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)


Epoch 1/300, Loss: 10.2319
Epoch 2/300, Loss: 9.9718
Epoch 3/300, Loss: 9.7668
Epoch 4/300, Loss: 9.5512
Epoch 5/300, Loss: 9.2885
Epoch 6/300, Loss: 8.9741
Epoch 7/300, Loss: 8.6208
Epoch 8/300, Loss: 8.2128
Epoch 9/300, Loss: 7.7411
Epoch 10/300, Loss: 7.2013
Epoch 11/300, Loss: 6.5913
Epoch 12/300, Loss: 5.9108
Epoch 13/300, Loss: 5.1646
Epoch 14/300, Loss: 4.3593
Epoch 15/300, Loss: 3.5117
Epoch 16/300, Loss: 2.6541
Epoch 17/300, Loss: 1.8269
Epoch 18/300, Loss: 1.0971
Epoch 19/300, Loss: 0.5542
Epoch 20/300, Loss: 0.2777
Epoch 21/300, Loss: 0.3439
Epoch 22/300, Loss: 0.7154
Epoch 23/300, Loss: 1.1559
Epoch 24/300, Loss: 1.4014
Epoch 25/300, Loss: 1.3682
Epoch 26/300, Loss: 1.1303
Epoch 27/300, Loss: 0.8127
Epoch 28/300, Loss: 0.5195
Epoch 29/300, Loss: 0.3121
Epoch 30/300, Loss: 0.2081
Epoch 31/300, Loss: 0.1933
Epoch 32/300, Loss: 0.2381
Epoch 33/300, Loss: 0.3103
Epoch 34/300, Loss: 0.3826
Epoch 35/300, Loss: 0.4366
Epoch 36/300, Loss: 0.4623
Epoch 37/300, Loss: 0.4570
Epoch 38/

  cat_features_tensor = torch.tensor([cat_features] * num_items).long()


Unnamed: 0,12,14,19,20,26
2001,3.296642,3.305322,3.618095,4.049636,3.303639
2002,3.29483,3.306916,3.28498,3.05143,3.302287
2003,3.293529,3.302354,3.295873,3.280121,3.302335
2004,3.269989,3.290514,3.296001,3.307626,3.283137
2005,3.321509,3.329964,3.319018,3.278927,3.330315
2006,3.304222,3.312755,3.304553,3.296193,3.313028
2007,3.287484,3.298049,3.304088,3.309138,3.29629


In [9]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

embedding_dim = 16  # 사용자와 아이템 임베딩 차원 수
mlp_hidden_layers = [64, 32, 16]  # MLP 레이어 크기 리스트
num_cat_unique_values = [review_df[col].nunique() for col in categorical_features]  # 각 범주형 열의 고유값 수
num_num_features = len(numerical_features)  # 숫자형 특성의 개수


# Fill NaNs in `reviewRating` with the mean rating
review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)

# Encode '작성자 ID' and '레스토랑 ID' as integer indices for embedding layers
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

review_df['user_idx'] = user_encoder.fit_transform(review_df['작성자 ID'])
review_df['item_idx'] = item_encoder.fit_transform(review_df['레스토랑 ID'])

num_users = review_df['user_idx'].nunique()
num_items = review_df['item_idx'].nunique()

# Process additional features
# Let's select some features to include
categorical_features = ['store_menu1', 'store_menu2', 'store_menu3','food_category_id1','reviewText']
numerical_features = ['userRating', 'travelTime', 'rating', 'user_total_rating',
                      'user_total_count', 'user_restaurant_rating', 'user_restaurant_count',
                      '유저_다른_레스토랑_평균', '유저_레스토랑_최고_평점', '유저_레스토랑_최근_리뷰_경과시간(일)']


# 범주형 열을 모두 문자열로 변환하여 타입을 통일
for col in categorical_features:
    review_df[col] = review_df[col].fillna('Unknown').astype(str)  # NaN을 'Unknown'으로 대체하고 문자열로 변환
    le = LabelEncoder()
    review_df[col] = le.fit_transform(review_df[col])
    label_encoders[col] = le


# Fill missing values for categorical features with 'Unknown' and numerical features with mean
for col in categorical_features:
    review_df[col] = review_df[col].fillna('Unknown')

for col in numerical_features:
    review_df[col] = review_df[col].fillna(review_df[col].mean())

# Encode categorical features
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    review_df[col] = le.fit_transform(review_df[col])
    label_encoders[col] = le

# Normalize numerical features
scaler = MinMaxScaler()
review_df[numerical_features] = scaler.fit_transform(review_df[numerical_features])

# Prepare the dataset
class RatingDataset(Dataset):
    def __init__(self, df):
        self.users = df['user_idx'].values
        self.items = df['item_idx'].values
        self.ratings = df['reviewRating'].values.astype(np.float32)
        self.cat_features = df[categorical_features].values.astype(np.int64)
        self.num_features = df[numerical_features].values.astype(np.float32)
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return (self.users[idx],
                self.items[idx],
                self.cat_features[idx],
                self.num_features[idx],
                self.ratings[idx])

# Split data
train_df, test_df = train_test_split(review_df, test_size=0.2, random_state=42)
train_dataset = RatingDataset(train_df)
test_dataset = RatingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 얼리 스토핑을 위한 클래스 정의
class EarlyStopping:
    def __init__(self, patience=5, verbose=False, delta=0):
        self.patience = patience  # 개선되지 않을 때 기다릴 에폭 수
        self.verbose = verbose    # 개선될 때마다 출력 여부
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        self.delta = delta        # 최소 개선 정도

    def __call__(self, val_loss, model):
        if self.best_loss is None:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss > self.best_loss - self.delta:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_loss = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        ''' 검증 손실이 개선되면 모델 저장 '''
        if self.verbose:
            print(f'Validation loss decreased ({self.best_loss:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), 'checkpoint.pt')

# NCF 모델 정의 (이전과 동일)
class NCF(nn.Module):
    def __init__(self, num_users, num_items, embedding_dim, mlp_hidden_layers,
                 num_cat_features, num_cat_unique_values, num_num_features):
        super(NCF, self).__init__()
        # Embeddings for users and items
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)
        
        # Embeddings for categorical features
        self.cat_embedding_layers = nn.ModuleList([
            nn.Embedding(num_unique, embedding_dim) for num_unique in num_cat_unique_values
        ])
        
        # Linear layer for numerical features
        self.num_linear = nn.Linear(num_num_features, embedding_dim)
        
        # MLP layers
        mlp_input_size = embedding_dim * (2 + len(num_cat_unique_values) + 1)  # user, item, cat features, num features
        mlp_layers = []
        for layer_size in mlp_hidden_layers:
            mlp_layers.append(nn.Linear(mlp_input_size, layer_size))
            mlp_layers.append(nn.ReLU())
            mlp_input_size = layer_size
        self.mlp = nn.Sequential(*mlp_layers)
        
        # Final prediction layer
        self.predict_layer = nn.Linear(mlp_hidden_layers[-1], 1)
        
    def forward(self, user_idx, item_idx, cat_features, num_features):
        user_emb = self.user_embedding(user_idx)
        item_emb = self.item_embedding(item_idx)
        
        # Process categorical features
        cat_embs = []
        for i, emb_layer in enumerate(self.cat_embedding_layers):
            cat_emb = emb_layer(cat_features[:, i])
            cat_embs.append(cat_emb)
        
        # Process numerical features
        num_emb = self.num_linear(num_features)
        
        # Concatenate all embeddings
        x = torch.cat([user_emb, item_emb] + cat_embs + [num_emb], dim=-1)
        
        # Pass through MLP layers
        x = self.mlp(x)
        
        # Final prediction
        prediction = self.predict_layer(x)
        return prediction.squeeze()

# 모델 초기화 및 손실 함수, 옵티마이저 정의
model = NCF(num_users, num_items, embedding_dim, mlp_hidden_layers,
            num_cat_features=len(categorical_features),
            num_cat_unique_values=num_cat_unique_values,
            num_num_features=num_num_features)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 데이터 분할: 훈련, 검증, 테스트 세트
train_df, temp_df = train_test_split(review_df, test_size=0.2, random_state=42)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

train_dataset = RatingDataset(train_df)
valid_dataset = RatingDataset(valid_df)
test_dataset = RatingDataset(test_df)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# 모델 훈련 함수 수정
def train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=50, patience=5):
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for user, item, cat_features, num_features, rating in train_loader:
            user = user.long()
            item = item.long()
            cat_features = cat_features.long()
            num_features = num_features.float()
            rating = rating.float()

            optimizer.zero_grad()
            prediction = model(user, item, cat_features, num_features)
            loss = criterion(prediction, rating)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)

        # 검증 손실 계산
        model.eval()
        valid_loss = 0
        with torch.no_grad():
            for user, item, cat_features, num_features, rating in valid_loader:
                user = user.long()
                item = item.long()
                cat_features = cat_features.long()
                num_features = num_features.float()
                rating = rating.float()

                prediction = model(user, item, cat_features, num_features)
                loss = criterion(prediction, rating)
                valid_loss += loss.item()
        avg_valid_loss = valid_loss / len(valid_loader)

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_valid_loss:.4f}")

        # 얼리 스토핑 체크
        early_stopping(avg_valid_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    # 최적의 모델 로드
    model.load_state_dict(torch.load('checkpoint.pt'))

# 모델 훈련
train_model(model, train_loader, valid_loader, criterion, optimizer, epochs=500, patience=5)

# 테스트 손실 평가
def evaluate_model(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for user, item, cat_features, num_features, rating in test_loader:
            user = user.long()
            item = item.long()
            cat_features = cat_features.long()
            num_features = num_features.float()
            rating = rating.float()

            prediction = model(user, item, cat_features, num_features)
            loss = criterion(prediction, rating)
            total_loss += loss.item()
    avg_loss = total_loss / len(test_loader)
    print(f"Test Loss: {avg_loss:.4f}")

# 모델 평가
evaluate_model(model, test_loader, criterion)

# 예측 행렬 생성 (이전과 동일)
def generate_prediction_matrix(model, num_users, num_items, cat_features, num_features):
    model.eval()
    prediction_matrix = np.zeros((num_users, num_items))
    with torch.no_grad():
        for user_idx in range(num_users):
            user_tensor = torch.tensor([user_idx] * num_items).long()
            item_tensor = torch.tensor(range(num_items)).long()
            cat_features_tensor = torch.tensor([cat_features] * num_items).long()
            num_features_tensor = torch.tensor([num_features] * num_items).float()
            prediction = model(user_tensor, item_tensor, cat_features_tensor, num_features_tensor)
            prediction_matrix[user_idx] = prediction.cpu().numpy()
    return prediction_matrix

# 평균 특성값 사용하여 예측 행렬 생성
mean_cat_features = review_df[categorical_features].mode().iloc[0].values
mean_num_features = review_df[numerical_features].mean().values

predicted_ratings_matrix = generate_prediction_matrix(
    model,
    num_users,
    num_items,
    mean_cat_features,
    mean_num_features
)

predicted_ratings_df = pd.DataFrame(
    predicted_ratings_matrix, 
    index=user_encoder.inverse_transform(range(num_users)), 
    columns=item_encoder.inverse_transform(range(num_items))
)

predicted_ratings_df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)


Epoch 1/500, Training Loss: 11.2418, Validation Loss: 11.9584
Validation loss decreased (11.958359 --> 11.958359).  Saving model ...
Epoch 2/500, Training Loss: 10.9961, Validation Loss: 11.7298
Validation loss decreased (11.729778 --> 11.729778).  Saving model ...
Epoch 3/500, Training Loss: 10.7643, Validation Loss: 11.4947
Validation loss decreased (11.494749 --> 11.494749).  Saving model ...
Epoch 4/500, Training Loss: 10.5282, Validation Loss: 11.2536
Validation loss decreased (11.253635 --> 11.253635).  Saving model ...
Epoch 5/500, Training Loss: 10.2730, Validation Loss: 11.0002
Validation loss decreased (11.000151 --> 11.000151).  Saving model ...
Epoch 6/500, Training Loss: 10.0106, Validation Loss: 10.7296
Validation loss decreased (10.729568 --> 10.729568).  Saving model ...
Epoch 7/500, Training Loss: 9.7276, Validation Loss: 10.4371
Validation loss decreased (10.437067 --> 10.437067).  Saving model ...
Epoch 8/500, Training Loss: 9.4246, Validation Loss: 10.1189
Validatio

  model.load_state_dict(torch.load('checkpoint.pt'))
  cat_features_tensor = torch.tensor([cat_features] * num_items).long()


Unnamed: 0,1,2,3
2001,3.487838,3.582812,3.675187
2002,3.116429,3.224333,3.30111
2003,3.662305,3.808831,3.90766
2004,3.899577,4.021204,4.122964
2006,3.673032,3.820996,3.942516
2007,3.882491,4.030188,4.171573


In [13]:
import numpy as np

# 정답과 예측값
true_values = np.array([3.0, 3.0, 4.0])
predicted_values = np.array([3.116429, 3.224333, 3.301110])

# 값 범위 (1.0 ~ 5.0)
value_range = 5.0 - 1.0

# NMAE 계산
mae = np.mean(np.abs(true_values - predicted_values))
nmae = mae / value_range

# 정확도 계산
accuracy = (1 - nmae) * 100

print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 91.34%


In [None]:
# import numpy as np
# import pandas as pd
# import torch
# import torch.nn as nn
# import torch.optim as optim
# from sklearn.model_selection import train_test_split
# from torch.utils.data import Dataset, DataLoader

# # Assuming `review_df` is already loaded and preprocessed
# # Fill NaNs in `reviewRating` with a neutral value, like the mean rating
# review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)

# # Encode '작성자 ID' and '레스토랑 ID' as integer indices for embedding layers
# user_ids = review_df['작성자 ID'].unique()
# restaurant_ids = review_df['레스토랑 ID'].unique()
# user_id_map = {id: idx for idx, id in enumerate(user_ids)}
# restaurant_id_map = {id: idx for idx, id in enumerate(restaurant_ids)}
# review_df['user_idx'] = review_df['작성자 ID'].map(user_id_map)
# review_df['restaurant_idx'] = review_df['레스토랑 ID'].map(restaurant_id_map)

# # Parameters
# num_users = len(user_ids)
# num_items = len(restaurant_ids)
# embedding_dim = 32  # Embedding size
# mlp_hidden_layers = [64, 32, 16, 8]  # Define MLP layers

# # Split data
# train, test = train_test_split(review_df, test_size=0.2, random_state=42)

# # Dataset class for loading user-item interactions
# class RatingDataset(Dataset):
#     def __init__(self, df):
#         self.users = df['user_idx'].values
#         self.items = df['restaurant_idx'].values
#         self.ratings = df['reviewRating'].values

#     def __len__(self):
#         return len(self.ratings)

#     def __getitem__(self, idx):
#         return self.users[idx], self.items[idx], self.ratings[idx]

# train_dataset = RatingDataset(train)
# test_dataset = RatingDataset(test)

# train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# # NCF Model combining GMF and MLP
# class NCF(nn.Module):
#     def __init__(self, num_users, num_items, embedding_dim, mlp_hidden_layers):
#         super(NCF, self).__init__()
#         # GMF embeddings
#         self.user_embedding_gmf = nn.Embedding(num_users, embedding_dim)
#         self.item_embedding_gmf = nn.Embedding(num_items, embedding_dim)
        
#         # MLP embeddings
#         self.user_embedding_mlp = nn.Embedding(num_users, embedding_dim)
#         self.item_embedding_mlp = nn.Embedding(num_items, embedding_dim)
        
#         # MLP layers
#         mlp_layers = []
#         input_size = 2 * embedding_dim
#         for layer_size in mlp_hidden_layers:
#             mlp_layers.append(nn.Linear(input_size, layer_size))
#             mlp_layers.append(nn.ReLU())
#             input_size = layer_size
#         self.mlp = nn.Sequential(*mlp_layers)
        
#         # Final prediction layer
#         self.predict_layer = nn.Linear(embedding_dim + mlp_hidden_layers[-1], 1)
        
#     def forward(self, user, item):
#         # GMF part
#         gmf_user_emb = self.user_embedding_gmf(user)
#         gmf_item_emb = self.item_embedding_gmf(item)
#         gmf_output = gmf_user_emb * gmf_item_emb
        
#         # MLP part
#         mlp_user_emb = self.user_embedding_mlp(user)
#         mlp_item_emb = self.item_embedding_mlp(item)
#         mlp_input = torch.cat([mlp_user_emb, mlp_item_emb], dim=-1)
#         mlp_output = self.mlp(mlp_input)
        
#         # Concatenate GMF and MLP outputs
#         final_input = torch.cat([gmf_output, mlp_output], dim=-1)
#         prediction = self.predict_layer(final_input)
        
#         return prediction.squeeze()

# # Instantiate the model, loss function, and optimizer
# model = NCF(num_users, num_items, embedding_dim, mlp_hidden_layers)
# criterion = nn.MSELoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001)

# # Training function
# def train_model(model, train_loader, criterion, optimizer, epochs=5):
#     model.train()
#     for epoch in range(epochs):
#         total_loss = 0
#         for user, item, rating in train_loader:
#             user = user.long()
#             item = item.long()
#             rating = rating.float()
            
#             optimizer.zero_grad()
#             prediction = model(user, item)
#             loss = criterion(prediction, rating)
#             loss.backward()
#             optimizer.step()
            
#             total_loss += loss.item()
        
#         print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")

# # Testing function
# def evaluate_model(model, test_loader, criterion):
#     model.eval()
#     total_loss = 0
#     with torch.no_grad():
#         for user, item, rating in test_loader:
#             user = user.long()
#             item = item.long()
#             rating = rating.float()
            
#             prediction = model(user, item)
#             loss = criterion(prediction, rating)
#             total_loss += loss.item()
    
#     print(f"Test Loss: {total_loss / len(test_loader)}")

# # Train and evaluate the model
# train_model(model, train_loader, criterion, optimizer, epochs=5000)
# evaluate_model(model, test_loader, criterion)

# # Generate the prediction matrix
# def generate_prediction_matrix(model, num_users, num_items):
#     model.eval()
#     prediction_matrix = np.zeros((num_users, num_items))
#     with torch.no_grad():
#         for user in range(num_users):
#             user_tensor = torch.tensor([user] * num_items).long()
#             item_tensor = torch.tensor(list(range(num_items))).long()
#             prediction = model(user_tensor, item_tensor).cpu().numpy()
#             prediction_matrix[user] = prediction
#     return prediction_matrix

# # Create the prediction matrix with user and item IDs
# predicted_ratings_matrix = generate_prediction_matrix(model, num_users, num_items)
# predicted_ratings_df = pd.DataFrame(predicted_ratings_matrix, index=user_ids, columns=restaurant_ids)

# print(predicted_ratings_df)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  review_df['reviewRating'].fillna(review_df['reviewRating'].mean(), inplace=True)


Epoch 1, Loss: 11.755559921264648
Epoch 2, Loss: 11.684123039245605
Epoch 3, Loss: 11.614761352539062
Epoch 4, Loss: 11.547879219055176
Epoch 5, Loss: 11.482355117797852
Epoch 6, Loss: 11.417464256286621
Epoch 7, Loss: 11.352804183959961
Epoch 8, Loss: 11.288195610046387
Epoch 9, Loss: 11.223343849182129
Epoch 10, Loss: 11.157630920410156
Epoch 11, Loss: 11.090470314025879
Epoch 12, Loss: 11.022031784057617
Epoch 13, Loss: 10.951517105102539
Epoch 14, Loss: 10.879622459411621
Epoch 15, Loss: 10.80811882019043
Epoch 16, Loss: 10.735618591308594
Epoch 17, Loss: 10.664815902709961
Epoch 18, Loss: 10.597224235534668
Epoch 19, Loss: 10.530779838562012
Epoch 20, Loss: 10.46427059173584
Epoch 21, Loss: 10.397757530212402
Epoch 22, Loss: 10.328733444213867
Epoch 23, Loss: 10.256929397583008
Epoch 24, Loss: 10.182092666625977
Epoch 25, Loss: 10.104011535644531
Epoch 26, Loss: 10.02278995513916
Epoch 27, Loss: 9.940546035766602
Epoch 28, Loss: 9.854192733764648
Epoch 29, Loss: 9.763288497924805
