In [1]:
import pandas as pd
import numpy as np
import math
from geopy.distance import geodesic
from surprise import Dataset, Reader, KNNBasic
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

###### 관광지 데이터

In [2]:
data = pd.read_csv('./done/merge_test.csv')
df = pd.DataFrame(data)
# 위도 : latitude / 경도 : longitude

###### 사용자 데이터

In [3]:
u_data = pd.read_csv('./done/user_test.csv')
u_df = pd.DataFrame(u_data)

In [4]:
df

Unnamed: 0,name,rating,review,mean_price,latitude,longitude,theme_num
0,김녕다올 무인소품점,0.00,197.0,18964,33.557524,126.748232,25
1,일루다,0.00,78.0,18964,33.474055,126.527442,25
2,제주소품샵 올망,0.00,1223.0,3000,33.416074,126.302814,25
3,선물가게바나나 제주 소품샵 서귀포중문점,4.75,6379.0,8125,33.258022,126.417179,25
4,소랑아시 소품샵&선물가게,0.00,11370.0,18964,33.255589,126.414884,25
...,...,...,...,...,...,...,...
1151,달제주,0.00,99.0,28400,33.333112,126.804187,21
1152,제주올레길4코스,0.00,324.0,4927,33.327824,126.767908,21
1153,두머니물공원,0.00,46.0,5923,33.231082,126.507044,21
1154,제주숲길,0.00,96.0,33821,33.238634,126.516463,21


In [5]:
u_df

Unnamed: 0,u_id,latitude,longitude,price_weight,rating_weight,review_weight,distance_weight
0,1,33.491361,126.496260,0.51,0.09,0.12,0.28
1,2,33.285343,126.183949,0.22,0.21,0.41,0.16
2,3,33.501972,126.532631,0.38,0.16,0.38,0.08
3,4,33.435701,126.291566,0.14,0.14,0.30,0.42
4,5,35.576263,128.284164,0.16,0.21,0.42,0.21
...,...,...,...,...,...,...,...
11153,11154,33.316225,126.718446,0.12,0.23,0.47,0.18
11154,11155,33.451278,126.326753,0.49,0.12,0.28,0.11
11155,11156,33.460609,126.914936,0.31,0.14,0.41,0.14
11156,11157,33.463626,126.423728,0.22,0.37,0.07,0.34


##### 사용자 1명 특정

In [6]:
user = u_df.iloc[[0]]

In [7]:
user

Unnamed: 0,u_id,latitude,longitude,price_weight,rating_weight,review_weight,distance_weight
0,1,33.491361,126.49626,0.51,0.09,0.12,0.28


###### 특성 점수화

- 자연 로그 변환

In [8]:
df['review_log'] = df.apply(lambda x: np.log(x['review']) if x['review'] > 0 else 0, axis=1)

In [9]:
df['rating_log'] = df.apply(lambda x: np.log(x['rating']) if x['rating'] > 0 else 0, axis=1)

In [10]:
df['price_log'] = df.apply(lambda x: np.log(x['mean_price']), axis=1)

- 사용자의 위치를 기반으로 직선 거리 계산

In [11]:
from geopy.distance import distance

# 사용자의 위치 정보
user_location = (user['latitude'].iloc[0], user['longitude'].iloc[0])

distances = []

for spot_index, spot_row in df.iterrows():
    # 관광지의 위치 정보
    spot_location = (spot_row['latitude'], spot_row['longitude'])
    
    # 두 지점 간의 거리 계산 (예시: km 단위)
    dist = distance(user_location, spot_location).km
    distances.append(dist)

# 거리 데이터를 데이터프레임에 추가
df['distance'] = distances

In [12]:
df

Unnamed: 0,name,rating,review,mean_price,latitude,longitude,theme_num,review_log,rating_log,price_log,distance
0,김녕다올 무인소품점,0.00,197.0,18964,33.557524,126.748232,25,5.283204,0.000000,9.850298,24.530644
1,일루다,0.00,78.0,18964,33.474055,126.527442,25,4.356709,0.000000,9.850298,3.476090
2,제주소품샵 올망,0.00,1223.0,3000,33.416074,126.302814,25,7.109062,0.000000,8.006368,19.829011
3,선물가게바나나 제주 소품샵 서귀포중문점,4.75,6379.0,8125,33.258022,126.417179,25,8.760767,1.558145,9.002701,26.905815
4,소랑아시 소품샵&선물가게,0.00,11370.0,18964,33.255589,126.414884,25,9.338734,0.000000,9.850298,27.224199
...,...,...,...,...,...,...,...,...,...,...,...
1151,달제주,0.00,99.0,28400,33.333112,126.804187,21,4.595120,0.000000,10.254144,33.592196
1152,제주올레길4코스,0.00,324.0,4927,33.327824,126.767908,21,5.780744,0.000000,8.502486,31.104409
1153,두머니물공원,0.00,46.0,5923,33.231082,126.507044,21,3.828641,0.000000,8.686598,28.885224
1154,제주숲길,0.00,96.0,33821,33.238634,126.516463,21,4.564348,0.000000,10.428837,28.093205


##### min-max 정규화 score

- distance_score, review_score, rating_score, price_score

In [13]:
def minmax_score(col):
    return (df[col] - df[col].min()) / (df[col].max() - df[col].min())

In [14]:
df['distance_score'] = minmax_score('distance')

In [15]:
df['review_score'] = minmax_score('review_log')

In [16]:
df['rating_score'] = minmax_score('rating_log')

In [17]:
df['price_score'] = minmax_score('price_log')

In [18]:
df

Unnamed: 0,name,rating,review,mean_price,latitude,longitude,theme_num,review_log,rating_log,price_log,distance,distance_score,review_score,rating_score,price_score
0,김녕다올 무인소품점,0.00,197.0,18964,33.557524,126.748232,25,5.283204,0.000000,9.850298,24.530644,0.077554,0.477878,0.00000,0.507677
1,일루다,0.00,78.0,18964,33.474055,126.527442,25,4.356709,0.000000,9.850298,3.476090,0.010749,0.394075,0.00000,0.507677
2,제주소품샵 올망,0.00,1223.0,3000,33.416074,126.302814,25,7.109062,0.000000,8.006368,19.829011,0.062636,0.643031,0.00000,0.227030
3,선물가게바나나 제주 소품샵 서귀포중문점,4.75,6379.0,8125,33.258022,126.417179,25,8.760767,1.558145,9.002701,26.905815,0.085090,0.792432,0.96813,0.378672
4,소랑아시 소품샵&선물가게,0.00,11370.0,18964,33.255589,126.414884,25,9.338734,0.000000,9.850298,27.224199,0.086100,0.844710,0.00000,0.507677
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151,달제주,0.00,99.0,28400,33.333112,126.804187,21,4.595120,0.000000,10.254144,33.592196,0.106305,0.415639,0.00000,0.569142
1152,제주올레길4코스,0.00,324.0,4927,33.327824,126.767908,21,5.780744,0.000000,8.502486,31.104409,0.098411,0.522882,0.00000,0.302539
1153,두머니물공원,0.00,46.0,5923,33.231082,126.507044,21,3.828641,0.000000,8.686598,28.885224,0.091370,0.346310,0.00000,0.330561
1154,제주숲길,0.00,96.0,33821,33.238634,126.516463,21,4.564348,0.000000,10.428837,28.093205,0.088857,0.412856,0.00000,0.595730


###### 가중 점수 계산 = 가중치 * 특성

In [19]:
# 가중 점수 계산
df['score'] = (
    user['rating_weight'][0] * df['rating_score'] +
    user['review_weight'][0] * df['review_score'] +
    user['price_weight'][0] * df['price_score'] +
    user['distance_weight'][0] * df['distance_score']
)

In [20]:
df['score']

0       0.337975
1       0.309214
2       0.210487
3       0.399171
4       0.384388
          ...   
1151    0.369905
1152    0.244596
1153    0.235727
1154    0.378245
1155    0.278990
Name: score, Length: 1156, dtype: float64

##### 사용자와 관광지 데이터 병합

In [21]:
selected_columns = ['name','review_score','rating_score','price_score','distance_score']

In [22]:
spot_df = df[selected_columns]

In [23]:
spot_df

Unnamed: 0,name,review_score,rating_score,price_score,distance_score
0,김녕다올 무인소품점,0.477878,0.00000,0.507677,0.077554
1,일루다,0.394075,0.00000,0.507677,0.010749
2,제주소품샵 올망,0.643031,0.00000,0.227030,0.062636
3,선물가게바나나 제주 소품샵 서귀포중문점,0.792432,0.96813,0.378672,0.085090
4,소랑아시 소품샵&선물가게,0.844710,0.00000,0.507677,0.086100
...,...,...,...,...,...
1151,달제주,0.415639,0.00000,0.569142,0.106305
1152,제주올레길4코스,0.522882,0.00000,0.302539,0.098411
1153,두머니물공원,0.346310,0.00000,0.330561,0.091370
1154,제주숲길,0.412856,0.00000,0.595730,0.088857


In [24]:
# 두 데이터프레임을 조합하기 위한 키 컬럼 생성
user.loc[:, 'key'] = 0
df.loc[:, 'key'] = 0

# 두 데이터프레임을 병합 (cross join)
merged_data = user.merge(df, on='key', how='outer')

# key 컬럼 삭제
merged_data.drop(columns='key', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user.loc[:, 'key'] = 0


In [25]:
merged_data

Unnamed: 0,u_id,latitude_x,longitude_x,price_weight,rating_weight,review_weight,distance_weight,name,rating,review,...,theme_num,review_log,rating_log,price_log,distance,distance_score,review_score,rating_score,price_score,score
0,1,33.491361,126.49626,0.51,0.09,0.12,0.28,김녕다올 무인소품점,0.00,197.0,...,25,5.283204,0.000000,9.850298,24.530644,0.077554,0.477878,0.00000,0.507677,0.337975
1,1,33.491361,126.49626,0.51,0.09,0.12,0.28,일루다,0.00,78.0,...,25,4.356709,0.000000,9.850298,3.476090,0.010749,0.394075,0.00000,0.507677,0.309214
2,1,33.491361,126.49626,0.51,0.09,0.12,0.28,제주소품샵 올망,0.00,1223.0,...,25,7.109062,0.000000,8.006368,19.829011,0.062636,0.643031,0.00000,0.227030,0.210487
3,1,33.491361,126.49626,0.51,0.09,0.12,0.28,선물가게바나나 제주 소품샵 서귀포중문점,4.75,6379.0,...,25,8.760767,1.558145,9.002701,26.905815,0.085090,0.792432,0.96813,0.378672,0.399171
4,1,33.491361,126.49626,0.51,0.09,0.12,0.28,소랑아시 소품샵&선물가게,0.00,11370.0,...,25,9.338734,0.000000,9.850298,27.224199,0.086100,0.844710,0.00000,0.507677,0.384388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1151,1,33.491361,126.49626,0.51,0.09,0.12,0.28,달제주,0.00,99.0,...,21,4.595120,0.000000,10.254144,33.592196,0.106305,0.415639,0.00000,0.569142,0.369905
1152,1,33.491361,126.49626,0.51,0.09,0.12,0.28,제주올레길4코스,0.00,324.0,...,21,5.780744,0.000000,8.502486,31.104409,0.098411,0.522882,0.00000,0.302539,0.244596
1153,1,33.491361,126.49626,0.51,0.09,0.12,0.28,두머니물공원,0.00,46.0,...,21,3.828641,0.000000,8.686598,28.885224,0.091370,0.346310,0.00000,0.330561,0.235727
1154,1,33.491361,126.49626,0.51,0.09,0.12,0.28,제주숲길,0.00,96.0,...,21,4.564348,0.000000,10.428837,28.093205,0.088857,0.412856,0.00000,0.595730,0.378245


##### 데이터 간의 유사도 계산(코사인)

In [69]:
# 사용자 데이터와 관광지 데이터 간의 유사도를 계산합니다 (예시: 코사인 유사도).
user_features = merged_data[['rating_weight', 'review_weight', 'price_weight', 'distance_weight']]
spot_features = merged_data[['rating_score', 'review_score', 'price_score', 'distance_score']]
similarities = cosine_similarity(user_features, spot_features)

##### 추천리스트 생성

In [70]:
# 추천 생성 (예시: 사용자당 상위 N개 관광지 추천)
N = 10  # 추천할 관광지 개수

recommendations = []
for i in range(len(user)):
    user_similarities = similarities[i]
    top_indices = user_similarities.argsort()[-N:][::-1]  # 상위 N개 유사한 관광지 인덱스
    top_recommendations = spot_df.iloc[top_indices]
    recommendations.append(top_recommendations)

# 결과 출력
for i, user_recommendations in enumerate(recommendations):
    print(f"Recommendations for User {i + 1}:")
    print(user_recommendations)

Recommendations for User 1:
         name  review_score  rating_score  price_score  distance_score
270     성산일출몰      0.208274           0.0     0.507677        0.398769
152       만물상      0.162069           0.0     0.476683        0.126618
203      오조상점      0.162069           0.0     0.507677        0.124267
287       데이지      0.099372           0.0     0.476683        0.106799
132    아노마토피아      0.145577           0.0     0.507677        0.105045
164      더니어엘      0.062697           0.0     0.507677        0.103812
160  마루쌀롱 청수리      0.099372           0.0     0.507677        0.096395
15   하루애제주소품샵      0.145577           0.0     0.507677        0.098922
286       뎁스터      0.125394           0.0     0.476683        0.087055
18      탐라소호샵      0.000000           0.0     0.355989        0.089072


- 정확도 계산

In [93]:

# 사용자 데이터와 관광지 데이터를 데이터프레임으로 변환
user_item_ratings = pd.DataFrame(merged_data[['rating_score', 'review_score', 'price_score', 'distance_score']])
user_item_ratings['total_score'] = merged_data['score']  # 총 평점 추가

# user_recommendations 데이터프레임 생성 (예시: 랜덤하게 생성)
import random
num_users, num_items = user_item_ratings.shape
user_recommendations = pd.DataFrame({
    'rating_score': [random.uniform(0, 5) for _ in range(num_items)],
    'review_score': [random.uniform(0, 5) for _ in range(num_items)],
    'price_score': [random.uniform(0, 5) for _ in range(num_items)],
    'distance_score': [random.uniform(0, 5) for _ in range(num_items)],
    'total_score': [random.uniform(0, 5) for _ in range(num_items)]
})

# 유사도 계산 (예시: 코사인 유사도)
similarities = cosine_similarity(user_item_ratings, user_recommendations)

# 추천 생성 (예시: 사용자당 상위 N개 관광지 추천)
N = 10  # 추천할 관광지 개수

recommendations = []
for i in range(num_users):
    user_similarities = similarities[i]
    top_indices = user_similarities.argsort()[-N:][::-1]  # 상위 N개 유사한 관광지 인덱스
    top_recommendations = user_item_ratings.iloc[top_indices]
    recommendations.append(top_recommendations)

# 정확도 계산 함수
def accuracy(user_ratings, recommendations, top_n=10):
    num_users, num_items = user_ratings.shape
    correct_recommendations = 0

    for i in range(num_users):
        user_ratings_i = user_ratings.iloc[i]
        recommended_items = recommendations[i][:top_n]
        for item_id in recommended_items.index:
            if user_ratings_i[item_id] > 0:
                correct_recommendations += 1

    total_recommendations = num_users * top_n
    accuracy = correct_recommendations / total_recommendations
    return accuracy

# 정확도 계산 함수 호출
top_n = 10  # 상위 N개 아이템을 추천으로 고려
acc = accuracy(user_item_ratings, recommendations, top_n)
print(f"Top-{top_n} 정확도: {acc * 100:.2f}%")

Top-10 정확도: 43.83%
