# 구글 드라이브 연결

In [None]:
# Mount Google Drive so the dataset stored there can be loaded
from google.colab import drive

drive.mount('/content/data')

Mounted at /content/data


# [유럽 호텔 리뷰 데이터](https://www.kaggle.com/datasets/jiashenliu/515k-hotel-reviews-data-in-europe)
> 이 데이터세트에는 유럽 전역의 1,493개 고급 호텔에 대한 515,000개의 고객 리뷰와 평점이 포함되어 있습니다. 또한, 추가 분석을 위해 호텔의 지리적 위치도 제공됩니다.

## 데이터 필드(컬럼) 설명

각 필드에 대한 설명은 다음과 같습니다.

- Hotel_Address
  - 호텔 주소.
- Review_Date
  - 리뷰어가 해당 리뷰를 게시한 날짜입니다.
- Average_Score
  - 지난 1년 동안의 최신 코멘트를 기반으로 계산된 호텔의 평균 점수입니다.
- Hotel_Name
  - 호텔 이름
- Reviewer_Nationality
  - 리뷰어의 국적
- Negative_Review
  - 리뷰어가 호텔에 남긴 부정적인 리뷰입니다.
  - 리뷰어가 부정적인 리뷰를 남기지 않은 경우, 'No Negative'로 표시됩니다.
- Review_Total_Negative_Word_Counts
  - 부정적인 리뷰에 사용된 총 단어 수입니다.
- Positive_Review
  - 리뷰어가 호텔에 남긴 긍정적인 리뷰입니다.
  - 리뷰어가 긍정적인 리뷰를 남기지 않은 경우, 'No Positive'로 표시됩니다.
- Review_Total_Positive_Word_Counts
  - 긍정적인 리뷰에 사용된 총 단어 수입니다.
- Reviewer_Score
  - 리뷰어가 호텔에 부여한 점수(리뷰어의 경험에 따른 점수)
- Total_Number_of_Reviews_Reviewer_Has_Given
  - 리뷰어가 과거에 작성한 리뷰 수입니다.
- Total_Number_of_Reviews
  - 호텔이 보유한 유효한 리뷰의 총 수입니다.
- Tags
  - 리뷰어가 호텔에 남긴 태그입니다.
- days_since_review
  - 리뷰 날짜와 스크래핑 날짜 사이의 기간.
- Additional_Number_of_Scoring
  - 리뷰를 남기지 않고 서비스에 대한 점수만 남긴 고객도 있습니다. 이 수치는 리뷰 없이 유효한 점수가 몇 개나 있는지를 나타냅니다.
- lat
  - 호텔의 위도
- lng
  - 호텔의 경도

# 데이터 로드

In [None]:
# Directory under the mounted Drive from which the review CSV is read
DATA_PATH = "/content/data/MyDrive/ai_lecture/6. Recommendation System/data/515K Hotel Reviews Data in Europe/"

In [None]:
import pandas as pd

# Read the full review table from the CSV located under DATA_PATH
hotel_reviews = pd.read_csv(f"{DATA_PATH}Hotel_Reviews.csv")

## 데이터 확인

In [None]:
# Total number of rows in the dataset
len(hotel_reviews)

515738

In [None]:
# List the column names of the dataset
hotel_reviews.columns

Index(['Hotel_Address', 'Additional_Number_of_Scoring', 'Review_Date',
       'Average_Score', 'Hotel_Name', 'Reviewer_Nationality',
       'Negative_Review', 'Review_Total_Negative_Word_Counts',
       'Total_Number_of_Reviews', 'Positive_Review',
       'Review_Total_Positive_Word_Counts',
       'Total_Number_of_Reviews_Reviewer_Has_Given', 'Reviewer_Score', 'Tags',
       'days_since_review', 'lat', 'lng'],
      dtype='object')

In [None]:
# Preview the first rows of the dataset
hotel_reviews.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


# 조회 수 기반 추천

## 단순 인기도 기반 추천 단점
- 예를 들면, 오늘 나온 뉴스는 사람들에게 필요한 아이템이지만 아직 본 사람이 없어 평점이나 조회 수 값이 없으므로 추천에 포함되기 어렵습니다.
- 따라서 이런 경우에는 시간(등록일자)이 적용된 새로운 점수가 필요합니다.

`조회 수 기반 점수를 사용하게 되면 해당 문제를 해결할 수 있음`

## 1.신뢰도를 높이기 위해 조회 수가 충분히 많은 호텔들 조회하기
> 추천 신뢰도를 높이기 위해서 리뷰 수가 중위값보다 높은 호텔들 중에서 추천

In [None]:
# Number of reviews per hotel
hotel_review_counts = hotel_reviews['Hotel_Name'].value_counts()

# Median of the per-hotel review counts
median_by_reviews = hotel_review_counts.median()

# Names of the hotels that have more reviews than the median
hotels_above_median = hotel_review_counts[hotel_review_counts > median_by_reviews].index.tolist()

# Keep only the reviews of those hotels. .copy() makes the result an independent
# DataFrame, so assigning columns on it later does not trigger
# SettingWithCopyWarning.
hotel_enough_reviews = hotel_reviews[hotel_reviews['Hotel_Name'].isin(hotels_above_median)].copy()

# Inspect the filtered data
hotel_enough_reviews.head()

Unnamed: 0,Hotel_Address,Additional_Number_of_Scoring,Review_Date,Average_Score,Hotel_Name,Reviewer_Nationality,Negative_Review,Review_Total_Negative_Word_Counts,Total_Number_of_Reviews,Positive_Review,Review_Total_Positive_Word_Counts,Total_Number_of_Reviews_Reviewer_Has_Given,Reviewer_Score,Tags,days_since_review,lat,lng
0,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Russia,I am so angry that i made this post available...,397,1403,Only the park outside of the hotel was beauti...,11,7,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
1,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,8/3/2017,7.7,Hotel Arena,Ireland,No Negative,0,1403,No real complaints the hotel was great great ...,105,7,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double...",0 days,52.360576,4.915968
2,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,Australia,Rooms are nice but for elderly a bit difficul...,42,1403,Location was good and staff were ok It is cut...,21,9,7.1,"[' Leisure trip ', ' Family with young childre...",3 days,52.360576,4.915968
3,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/31/2017,7.7,Hotel Arena,United Kingdom,My room was dirty and I was afraid to walk ba...,210,1403,Great location in nice surroundings the bar a...,26,1,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex...",3 days,52.360576,4.915968
4,s Gravesandestraat 55 Oost 1092 AA Amsterdam ...,194,7/24/2017,7.7,Hotel Arena,New Zealand,You When I booked with your company on line y...,140,1403,Amazing location and building Romantic setting,8,3,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St...",10 days,52.360576,4.915968


## 2.조회 수 기반 점수 생성하기
- 목적(Hacker News Algorithm)
  - 오래됐지만 좋아요가 많은 게시물이 무조건 상단에 고정되지 않게 하여 정보의 신선도 유지
  - 반대로 시간은 조금 지났지만 빠르게 인기 있는 콘텐츠는 빠르게 상단 노출

### [2-1.Hacker News Algorithm 함수 만들기](https://medium.com/hacking-and-gonzo/how-hacker-news-ranking-algorithm-works-1d9b0cf2c08d)
- $pageviews$: 조회수
- $age$: 현재 시각 - 기사 업로드 날짜
- $gravity$: 중력상수(오래된 기사일수록 score 값이 작아지도록 조정하는 상수)
$$
Score = { pageviews -1 \over (age + 2)^{gravity} }
$$

In [None]:
def hacker_news_score(pageviews, age, gravity=1.8):
  """Return the Hacker News style ranking score.

  The score is ``(pageviews - 1) / (age + 2) ** gravity``.

  Args:
    pageviews: View count of the item.
    age: Age of the item; the unit is whatever the caller supplies.
    gravity: Exponent applied to ``age + 2``. For non-negative ages a larger
      value lowers the score of older items more strongly.

  Returns:
    The computed score.
  """
  numerator = pageviews - 1
  denominator = (age + 2) ** gravity
  return numerator / denominator

### 2-2.pageviews(호텔별 조회 수) 구하기
- Hotel_Name
  - 호텔 이름

In [None]:
# Number of review rows per hotel; this count is what the notebook uses as "pageviews"
pageviews = hotel_enough_reviews['Hotel_Name'].value_counts()

pageviews

Unnamed: 0_level_0,count
Hotel_Name,Unnamed: 1_level_1
Britannia International Hotel Canary Wharf,4789
Strand Palace Hotel,4256
Park Plaza Westminster Bridge London,4169
Copthorne Tara Hotel London Kensington,3578
DoubleTree by Hilton Hotel London Tower of London,3212
...,...
Auteuil Tour Eiffel,196
Sofitel Legend The Grand Amsterdam,195
Boscolo Milano Autograph Collection,195
Hotel Alimara,195


### 2-3.age(얼마나 오래되었는지) 구하기

- days_since_review
  - 리뷰 날짜와 스크래핑 날짜 사이의 기간.

In [None]:
# Show the days_since_review values next to the hotel name
hotel_enough_reviews[['Hotel_Name', 'days_since_review']].head()

Unnamed: 0,Hotel_Name,days_since_review
0,Hotel Arena,0 days
1,Hotel Arena,0 days
2,Hotel Arena,3 days
3,Hotel Arena,3 days
4,Hotel Arena,10 days


```python

# days_since_review는 얼마나 오래전에 리뷰를 작성했는지에 대한 데이터임
(
  hotel_enough_reviews['days_since_review']
  # days_since_review의 값을 숫자로 변환
  .map(lambda x: int(x.replace('days','').replace('day','').replace(' ','')))
)

```

In [None]:
# Convert strings such as '0 days' / '3 days' to the integer they start with.
# - astype(str) lets the cell be re-run after the column is already numeric
#   (calling .replace on an int would fail).
# - assign() returns a new DataFrame instead of writing into a slice of
#   hotel_reviews, which is what raised SettingWithCopyWarning.
hotel_enough_reviews = hotel_enough_reviews.assign(
  days_since_review=(
    hotel_enough_reviews['days_since_review']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
  )
)

hotel_enough_reviews[['Hotel_Name', 'days_since_review']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hotel_enough_reviews['days_since_review'] = hotel_enough_reviews['days_since_review']\


Unnamed: 0,Hotel_Name,days_since_review
0,Hotel Arena,0
1,Hotel Arena,0
2,Hotel Arena,3
3,Hotel Arena,3
4,Hotel Arena,10


```python

(
  # 호텔별
  hotel_enough_reviews.groupby(['Hotel_Name'])
  # 평균 리뷰 날짜
  .agg({'days_since_review':'mean'})
  # 작성된 리뷰 날짜를 기준으로 정렬하기
  .sort_values(by='days_since_review', ascending=False).reset_index()
)

```

In [None]:
# Mean days_since_review per hotel, sorted from the largest mean to the smallest,
# with Hotel_Name turned back into a regular column.
enough_reviews_by_hotel = (
  hotel_enough_reviews
  .groupby('Hotel_Name')['days_since_review']
  .mean()
  .sort_values(ascending=False)
  .reset_index()
)

enough_reviews_by_hotel.head()

Unnamed: 0,Hotel_Name,days_since_review
0,The Principal London,581.201404
1,Hotel du Louvre in the Unbound Collection by H...,475.706294
2,Hyatt Regency Paris Etoile,469.025172
3,The Chess Hotel,460.604255
4,Pullman Paris Montparnasse,458.87218


> days_since_review의 값은 위에서 작성한 코드에 의해서 호텔별 평균 경과 일수(리뷰 작성 후 지난 일수의 평균) 데이터가 되었다.   
> 따라서 컬럼명을 age로 변경할 수 있다.

In [None]:
# The per-hotel mean of days_since_review is used as the age, so rename the column
enough_reviews_by_hotel = enough_reviews_by_hotel.rename(columns={'days_since_review': 'age'})

# Inspect the table after the rename
enough_reviews_by_hotel.head()

Unnamed: 0,Hotel_Name,age
0,The Principal London,581.201404
1,Hotel du Louvre in the Unbound Collection by H...,475.706294
2,Hyatt Regency Paris Etoile,469.025172
3,The Chess Hotel,460.604255
4,Pullman Paris Montparnasse,458.87218


### 2-4.pageviews & age 합치기

In [None]:
# Look up each hotel's review count in the pageviews Series by hotel name
enough_reviews_by_hotel['pageviews'] = enough_reviews_by_hotel['Hotel_Name'].map(pageviews)

print(enough_reviews_by_hotel.shape)
# Per hotel: review count (pageviews) and mean days since review (age)
enough_reviews_by_hotel.head()

(743, 3)


Unnamed: 0,Hotel_Name,age,pageviews
0,The Principal London,581.201404,1425
1,Hotel du Louvre in the Unbound Collection by H...,475.706294,286
2,Hyatt Regency Paris Etoile,469.025172,874
3,The Chess Hotel,460.604255,235
4,Pullman Paris Montparnasse,458.87218,266


### 2-5.Hacker News Score 구하기
> hacker_news_score 함수를 이용하여 호텔별 Score 구하기

In [None]:
# Score every hotel with one call on whole columns instead of a Python-level
# lambda per row; hacker_news_score only uses arithmetic operators, so it
# accepts Series arguments directly.
enough_reviews_by_hotel['hacker_news_score'] = hacker_news_score(
  enough_reviews_by_hotel['pageviews'],
  enough_reviews_by_hotel['age'],
)

print(enough_reviews_by_hotel.shape)
enough_reviews_by_hotel.head()

(743, 4)


Unnamed: 0,Hotel_Name,age,pageviews,hacker_news_score
0,The Principal London,581.201404,1425,0.014964
1,Hotel du Louvre in the Unbound Collection by H...,475.706294,286,0.004289
2,Hyatt Regency Paris Etoile,469.025172,874,0.013475
3,The Chess Hotel,460.604255,235,0.003731
4,Pullman Paris Montparnasse,458.87218,266,0.004254


## 3.조회 수 기반 5개 호텔 추천하기

In [None]:
no_ranking = 5  # number of hotels to recommend

# Highest hacker_news_score first, then keep the first no_ranking rows
enough_reviews_by_hotel.sort_values(by='hacker_news_score', ascending=False).head(no_ranking)

Unnamed: 0,Hotel_Name,age,pageviews,hacker_news_score
742,The Ned,37.377432,257,0.344187
729,citizenM Tower of London,187.16934,1683,0.13412
738,Park Plaza London Waterloo,104.0,552,0.124624
740,Good Hotel London,103.868762,541,0.122409
362,Britannia International Hotel Canary Wharf,361.22844,4789,0.117986


## 4.인기도와 조회 수를 적용한 호텔 추천하기
> 최신 조회 수도 중요하지만, 사용자들의 선호도 역시 추천에서 중요한 지표다.

### 4-1.호텔별 선호도 추가하기

In [None]:
# Mean Reviewer_Score per hotel, with Hotel_Name as a regular column
reviewer_score_mean = (
  hotel_enough_reviews
  .groupby('Hotel_Name')['Reviewer_Score']
  .mean()
  .reset_index()
)

reviewer_score_mean

Unnamed: 0,Hotel_Name,Reviewer_Score
0,25hours Hotel beim MuseumsQuartier,8.983309
1,88 Studios,8.489107
2,AC Hotel Barcelona Forum a Marriott Lifestyle ...,8.001384
3,Acad mie H tel Saint Germain,9.064706
4,Acca Palace,8.590909
...,...,...
738,citizenM Amsterdam,8.770289
739,citizenM London Bankside,9.112261
740,citizenM London Shoreditch,9.092005
741,citizenM Tower of London,9.135591


In [None]:
# Attach each hotel's mean Reviewer_Score by joining on the hotel name
enough_reviews_by_hotel = pd.merge(enough_reviews_by_hotel, reviewer_score_mean, on='Hotel_Name')

print(enough_reviews_by_hotel.shape)
enough_reviews_by_hotel.head()

(743, 5)


Unnamed: 0,Hotel_Name,age,pageviews,hacker_news_score,Reviewer_Score
0,The Principal London,581.201404,1425,0.014964,8.029333
1,Hotel du Louvre in the Unbound Collection by H...,475.706294,286,0.004289,8.273427
2,Hyatt Regency Paris Etoile,469.025172,874,0.013475,6.824485
3,The Chess Hotel,460.604255,235,0.003731,8.929787
4,Pullman Paris Montparnasse,458.87218,266,0.004254,7.902632


### 4-2.인기도와 조회 수를 적용하기 위해서 스케일(데이터 크기)를 조정하기

> 조회 수 점수(hacker_news_score)와 인기도 점수(Reviewer_Score)의 크기가 다르기 때문에   
> 단순히 덧셈을 하면 왜곡이 될 수 있음

In [None]:
# Summary statistics of the two unscaled scores, to compare their ranges
enough_reviews_by_hotel[['hacker_news_score', 'Reviewer_Score']].describe()

Unnamed: 0,hacker_news_score,Reviewer_Score
count,743.0,743.0
mean,0.016772,8.429262
std,0.020162,0.590781
min,0.003468,6.009465
25%,0.007352,8.078032
50%,0.011144,8.454392
75%,0.019371,8.842017
max,0.344187,9.718937


> minmax scaling을 이용하여 각 점수들의 크기를 통일 시킴   
- minmax scaling이란
  - 최대값(max)을 1로 변환 & 최소값(min)을 0으로 변환
  - 따라서 모든 값들을 0 ~ 1 사이로 크기를 변환하는 방법

$$
X_{scaled} = {X - X_{min} \over X_{max} - X_{min}}
$$

In [None]:
def minmax_scaling(data, min, max):
  """Rescale ``data`` linearly so that ``min`` maps to 0 and ``max`` maps to 1.

  Args:
    data: Value (or array-like of values) to rescale.
    min: Value that should map to 0.
    max: Value that should map to 1.

  Returns:
    ``(data - min) / (max - min)``. When ``max`` equals ``min`` (scalar range of
    zero) the divisor is replaced by 1, so a value equal to ``min`` yields 0
    instead of a division by zero.

  Note:
    The parameter names shadow the ``min``/``max`` builtins; they are kept
    as-is so existing callers keep working.
  """
  scale = max - min
  # Only a scalar range can be compared with 0 directly; array-like ranges
  # (anything with a length) are passed through untouched.
  if not hasattr(scale, '__len__') and scale == 0:
    scale = 1
  return (data - min) / scale

> 조회 수 점수(hacker_news_score) 스케일링하기

In [None]:
# Smallest and largest raw score across all hotels
hacker_min = enough_reviews_by_hotel['hacker_news_score'].min()
hacker_max = enough_reviews_by_hotel['hacker_news_score'].max()

# Rescale the whole column in a single call
enough_reviews_by_hotel['scaled_hacker_news_score'] = minmax_scaling(
  enough_reviews_by_hotel['hacker_news_score'], hacker_min, hacker_max
)

# Check the result: the scaled column should run from 0 to 1
enough_reviews_by_hotel[['hacker_news_score', 'scaled_hacker_news_score']].describe()

Unnamed: 0,hacker_news_score,scaled_hacker_news_score
count,743.0,743.0
mean,0.016772,0.039048
std,0.020162,0.059176
min,0.003468,0.0
25%,0.007352,0.011401
50%,0.011144,0.022529
75%,0.019371,0.046675
max,0.344187,1.0


> 인기도 점수(Reviewer_Score) 스케일링하기

In [None]:
# Smallest and largest mean reviewer score across all hotels
reviewer_min = enough_reviews_by_hotel['Reviewer_Score'].min()
reviewer_max = enough_reviews_by_hotel['Reviewer_Score'].max()

# Rescale the whole column in a single call
enough_reviews_by_hotel['scaled_Reviewer_Score'] = minmax_scaling(
  enough_reviews_by_hotel['Reviewer_Score'], reviewer_min, reviewer_max
)

# Check the result: the scaled column should run from 0 to 1
enough_reviews_by_hotel[['Reviewer_Score', 'scaled_Reviewer_Score']].describe()

Unnamed: 0,Reviewer_Score,scaled_Reviewer_Score
count,743.0,743.0
mean,8.429262,0.652329
std,0.590781,0.159263
min,6.009465,0.0
25%,8.078032,0.557645
50%,8.454392,0.659104
75%,8.842017,0.7636
max,9.718937,1.0


> hacker_news_score와 Reviewer_Score의 크기가 같아짐.

In [None]:
# Summary statistics of the two scaled scores side by side
enough_reviews_by_hotel[['scaled_hacker_news_score', 'scaled_Reviewer_Score']].describe()

Unnamed: 0,scaled_hacker_news_score,scaled_Reviewer_Score
count,743.0,743.0
mean,0.039048,0.652329
std,0.059176,0.159263
min,0.0,0.0
25%,0.011401,0.557645
50%,0.022529,0.659104
75%,0.046675,0.7636
max,1.0,1.0


### 4-3.인기도와 조회 수를 적용하여 새로운 Score 구하기

> 각 항목별 중요도(weight)를 적용하여 Score 생성   
> 아래 코드는 인기도보다는 조회 수를 조금 더 중요하게 적용한 코드임

In [None]:
# hacker_weight is set higher than reviewer_weight, so the scaled
# hacker_news_score contributes more to the final score than the scaled
# mean reviewer score.
hacker_weight = 1.0
reviewer_weight = 0.7

# Weighted sum of the two scaled scores, computed on whole columns
enough_reviews_by_hotel['score'] = (
  enough_reviews_by_hotel['scaled_hacker_news_score'] * hacker_weight
  + enough_reviews_by_hotel['scaled_Reviewer_Score'] * reviewer_weight
)

# Inspect the result
enough_reviews_by_hotel[['Hotel_Name', 'score', 'scaled_hacker_news_score', 'scaled_Reviewer_Score']].head()

Unnamed: 0,Hotel_Name,score,scaled_hacker_news_score,scaled_Reviewer_Score
0,The Principal London,0.414901,0.033739,0.544516
1,Hotel du Louvre in the Unbound Collection by H...,0.429633,0.002409,0.610319
2,Hyatt Regency Paris Etoile,0.18317,0.029371,0.219713
3,The Chess Hotel,0.551855,0.000772,0.787261
4,Pullman Paris Montparnasse,0.359559,0.002307,0.51036


### 4-4.호텔 추천하기
- 단순 평균 평점으로 추천하게 된다면, Reviewer_Score의 값이 가장 높은 호텔이 첫번째로 추천되어야 한다.
- 하지만 hacker_news_score에 의해서 최근에 평가를 많이 받은 호텔들이 상위권에 추천된 것을 볼 수 있다.

In [None]:
no_ranking = 5  # number of hotels to recommend

# Highest combined score first; show only the columns relevant to the ranking
(
  enough_reviews_by_hotel
  .sort_values(by='score', ascending=False)
  .loc[:, ['Hotel_Name', 'score', 'scaled_hacker_news_score', 'scaled_Reviewer_Score']]
  .head(no_ranking)
)

Unnamed: 0,Hotel_Name,score,scaled_hacker_news_score,scaled_Reviewer_Score
742,The Ned,1.597815,1.0,0.854021
729,citizenM Tower of London,0.973379,0.38346,0.842742
696,Intercontinental London The O2,0.924727,0.275252,0.927822
738,Park Plaza London Waterloo,0.856984,0.355588,0.71628
736,Montcalm Royal London House City of London,0.840627,0.239556,0.858672
