# 다이닝 코드 데이터

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np

# Load the crawled DiningCode CSV into the working DataFrame.
CSV_PATH = 'diningcode_data_crawling_20260125_1542.csv'
data = pd.read_csv(CSV_PATH)

In [2]:
# 'Unnamed: 0' is the name pandas assigns to a header-less first CSV column
# (presumably a saved row index); it is discarded here.
# `columns=` already selects the column axis, so no separate axis argument is needed.
data = data.drop(columns='Unnamed: 0')
data.shape

(12627, 15)

## Preprocessing data


In [3]:
# Separate the "다코미식가" (DiningCode gourmand) badge from user_name.
# Vectorised string operations align by index label, so the result does not
# depend on the frame having a default 0..n-1 index, and missing names no
# longer raise.
GOURMAND_BADGE = '다코미식가'
raw_names = data['user_name']

# 1 when the name contains the badge text, 0 otherwise (missing names count as 0).
data['daco_gourmand'] = (
    raw_names.str.contains(GOURMAND_BADGE, regex=False, na=False).astype('int64')
)

# Remove the badge text and any CRLF sequences embedded in the name.
data['user_name'] = (
    raw_names.str.replace(GOURMAND_BADGE, '', regex=False)
    .str.replace('\r\n', '', regex=False)
)

In [4]:
# Remove the '점' ("points") unit marker from the rating strings, then store
# the ratings as floats.
rating_text = data['user_rating'].str.replace('점','')
data['user_rating'] = rating_text.astype(float)

In [5]:
def _first_seen_ordinal(column):
    """Encode a column's first three distinct values as 3, 2, 1.

    The scores follow the order in which the values first appear in
    ``column`` (the order returned by ``Series.unique``). Any further
    distinct value is absent from the mapping and therefore becomes NaN
    through ``Series.map``. A column with fewer than three distinct values
    raises ``IndexError``.
    """
    distinct = column.unique()
    score_of = {distinct[rank]: score for rank, score in enumerate((3, 2, 1))}
    return column.map(score_of)


# NOTE(review): which label receives 3/2/1 depends purely on row order, so the
# three columns are only comparable if each happens to list its labels in the
# same best-to-worst order. Consider replacing this with an explicit
# label -> score mapping once the actual labels are confirmed.
for source_col in ('taste', 'service', 'price'):
    data[f'{source_col}_enc'] = _first_seen_ordinal(data[source_col])

In [6]:
# One encoder per column: refitting a single LabelEncoder would overwrite the
# user classes, leaving no way to map user_id back to user_name with
# ``inverse_transform``.
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
data['user_id'] = user_encoder.fit_transform(data['user_name'])
data['item_id'] = item_encoder.fit_transform(data['item_name'])

# Backward compatibility: ``le`` previously ended up fitted on item_name.
le = item_encoder

In [7]:
# Number of distinct user names that appear on more than one row.
rows_per_name = data['user_name'].value_counts()
(rows_per_name > 1).sum()

np.int64(2208)

In [8]:
# Keep the first occurrence of every fully identical row and discard the rest.
data = data[~data.duplicated(keep='first')]

In [9]:
# Display (rows, columns) of the frame after duplicate rows were removed.
data.shape

(8723, 21)

## Project_Main (Building a Recommendation Model)
1. Baseline (DeepFM or MF model)
2. DeepCoNN + MF
3. LLM (KoBERT,Gemma) + MF model

### 1. Baseline (Matrix Factorization)

In [13]:
# Rating matrix with one row per item_id and one column per user_id.
# pivot_table's default aggregation averages the ratings when the same
# (item, user) pair occurs more than once; pairs without a rating are NaN.
user_item_mat = data.pivot_table(
    values='user_rating',
    index='item_id',
    columns='user_id',
)

# Sparsity = share of empty (NaN) cells in the matrix.
empty_cells = int(user_item_mat.isna().to_numpy().sum())
print('sparsity : ', empty_cells / user_item_mat.size)
print(f'user_item_mat shape : {user_item_mat.shape}')

sparsity :  0.9855957418672309
user_item_mat shape : (160, 3781)


In [14]:
# Keep only the (user, item, rating) triple under short column names.
mf_data = data[['user_name', 'item_name', 'user_rating']].rename(
    columns={'user_name': 'user', 'item_name': 'item', 'user_rating': 'rating'}
)

# Per-user split: sample 80% of each user's rows for training (fixed seed);
# every row that was not sampled forms the test set.
mf_train = mf_data.groupby('user').sample(frac=0.8, random_state=42)
mf_train_ind = mf_train.index
mf_test = mf_data.drop(index=mf_train_ind)

In [15]:
import sys
sys.path.append('../..')  # Go up two levels to reach the directory containing 'Study'

from Study.RecSys.matrixfactorization import matfac

# Hyper-parameters, passed positionally to MatrixFactorization in this order.
# NOTE(review): meanings are inferred from the names (latent size, learning
# rate, regularisation strength, epoch count) - confirm against matfac.
k, lr, reg_param, epochs = 10, 0.001, 0.02, 50

mf_model = matfac.MatrixFactorization(k, lr, reg_param, epochs)
mf_model.fit(mf_train)

# predict() returns a pair; mf_test is rebound to its second element.
mf_pred, mf_test = mf_model.predict(mf_test)

Epoch : 0 , Loss : 2227.741625 , Rooted Loss: 47.20
Epoch : 10 , Loss : 1890.768262 , Rooted Loss: 43.48
Epoch : 20 , Loss : 1774.133812 , Rooted Loss: 42.12
Epoch : 30 , Loss : 1717.933087 , Rooted Loss: 41.45
Epoch : 40 , Loss : 1686.476004 , Rooted Loss: 41.07


In [None]:
def _dcg(relevance):
    """Return the DCG of relevance scores listed in ranked order.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(position + 1)``
    discount. ``np.power`` is used instead of ``np.pow`` because the latter
    only exists in NumPy >= 2.0.
    """
    relevance = np.asarray(relevance, dtype=float)
    discounts = np.log2(np.arange(2, relevance.size + 2))
    return np.sum((np.power(2.0, relevance) - 1) / discounts)


y_pred = mf_pred
y_true = mf_test
k = 10

# Group the ground truth once so each user is a dictionary lookup instead of
# a full boolean scan of the frame on every iteration.
true_by_user = {user: rows for user, rows in y_true.groupby('user', sort=False)}

# Per-user NDCG@k, in order of each user's first appearance in y_pred.
ndcg_k = []
for user, user_pred in y_pred.groupby('user', sort=False):
    # Items ranked by predicted rating, best first, cut at k.
    ranked = user_pred.sort_values('rating', ascending=False)
    pred_sequence = ranked['item'].to_numpy()[:k]

    user_true = true_by_user.get(user)
    if user_true is None:
        # No ground truth for this user: the ideal DCG is 0, so NDCG is 0.
        ndcg_k.append(0)
        continue

    ideal_rel_score = user_true['rating'].sort_values(ascending=False).to_numpy()[:k]

    # Average repeated (user, item) ratings so the lookup index is unique,
    # and give items without a true rating a relevance of 0 (gain 0) instead
    # of letting NaN propagate into the DCG sum.
    true_rating_by_item = user_true.groupby('item')['rating'].mean()
    rel_score = true_rating_by_item.reindex(pred_sequence).fillna(0).to_numpy()

    dcg_k = _dcg(rel_score)
    idcg_k = _dcg(ideal_rel_score)
    ndcg_k.append(dcg_k / idcg_k if idcg_k > 0 else 0)


In [19]:
from Study.RecSys.matrixfactorization.preprocessing import precision_at_k,ndcg_at_k,recall_at_k
from sklearn.metrics import root_mean_squared_error
# Single cut-off shared by all ranking metrics below.
TOP_K = 10

# scikit-learn documents the argument order as (y_true, y_pred). RMSE is
# symmetric, so the value is the same as with the arguments swapped.
rmse_par = root_mean_squared_error(mf_test['rating'].values, mf_pred['rating'].values)

# NOTE(review): precision_at_k / recall_at_k / ndcg_at_k are project helpers
# whose signatures are not visible here; the (predictions, ground truth, k)
# call order is kept as it was. A recall@k of exactly 1.0 would be expected if
# ranking is done only over each user's own test items and users have at most
# k of them - confirm that this is the intended evaluation protocol.
prec_at_k_par = precision_at_k(mf_pred, mf_test, k=TOP_K)
rec_at_k_par = recall_at_k(mf_pred, mf_test, k=TOP_K)
ndcg_at_k_par = ndcg_at_k(mf_pred, mf_test, k=TOP_K)

print(rmse_par, prec_at_k_par, rec_at_k_par, ndcg_at_k_par)

4.499294277077793 0.12479253112033196 1.0 0.9930942067724283


In [19]:
def ndcg_at_k(y_true, y_score, k=-1):
    """Return NDCG@k for one ranked list, computed with vector operations.

    Args:
        y_true: 1-D sequence of true relevance values.
        y_score: 1-D sequence of predicted scores, aligned with ``y_true``.
        k: Cut-off rank. Any negative value (``-1`` by convention) or a value
            larger than the list length means "use the whole list".

    Returns:
        ``DCG@k / IDCG@k`` using the gain ``2**rel - 1``, or ``0.0`` when the
        ideal DCG is not positive (e.g. empty input or all-zero relevance).

    Raises:
        ValueError: If ``y_true`` and ``y_score`` do not have the same shape.
    """
    # Float relevance avoids silent int64 overflow in 2**rel for large values
    # and the "negative integer power" error for negative integer relevance.
    y_true = np.asarray(y_true, dtype=float)
    y_score = np.asarray(y_score)
    if y_true.shape != y_score.shape:
        raise ValueError(
            f"y_true and y_score must have the same shape, "
            f"got {y_true.shape} and {y_score.shape}"
        )

    n = len(y_true)
    # A negative k or a k beyond the list length falls back to the full list.
    if k < 0 or k > n:
        k = n

    discounts = np.log2(np.arange(2, k + 2))
    ranked_rel = y_true[np.argsort(y_score)[::-1][:k]]  # top-k by predicted score
    ideal_rel = np.sort(y_true)[::-1][:k]  # top-k by true relevance

    dcg = np.sum((2 ** ranked_rel - 1) / discounts)
    best_dcg = np.sum((2 ** ideal_rel - 1) / discounts)

    return dcg / best_dcg if best_dcg > 0 else 0.0

# NDCG over the full list (k=-1) for every entry of R_pred against the entry
# of R at the same position, followed by the mean.
# NOTE(review): R and R_pred are not defined in the visible part of the
# notebook - confirm they are created earlier in the session.
ndcg_k = [ndcg_at_k(R[i], R_pred[i], k=-1) for i, _ in enumerate(R_pred)]
print('ndcg@k mean:' , np.mean(ndcg_k))

ndcg@k mean: 0.3305156015183604


### 2. DeepCoNN