# 회귀 기반 추천 시스템

**데이터 구조**

![](https://d.pr/i/YEs8M6+)


여행스타일 8가지 (7단계)
(매우선호 - 중간선호 - 약간선호 - 중립 - 약간선호 - 중간선호 - 매우선호)
- `TRAVEL_STYL_1` 자연 vs 도시
- `TRAVEL_STYL_2` 숙박 vs 당일
- `TRAVEL_STYL_3` 새로운 지역 vs 익숙한 지역
- `TRAVEL_STYL_4` 편하지만 비싼 숙소 vs 불편하지만 저렴한 숙소
- `TRAVEL_STYL_5` 휴양/휴식 vs 체험활동
- `TRAVEL_STYL_6` 잘 알려지지 않은 방문지 vs 알려진 방문지
- `TRAVEL_STYL_7` 계획에 따른 여행 vs 상황에 따른 여행
- `TRAVEL_STYL_8` 사진촬영 중요하지 않음 vs 사진촬영 중요

여행동기 3가지 (10단계)
- `TRAVEL_MOTIVE_1` 여행의 주요 목적
- `TRAVEL_MOTIVE_2` 여행의 부수적 목적1
- `TRAVEL_MOTIVE_3` 여행의 부수적 목적2

1. 일상적인 환경에서의 탈출
2. 육체적 정신적 휴식
3. 여행 동반자와의 친밀감 증진
4. 자아찾기
5. … (확인 안됨)

In [26]:
# !pip install catboost

In [27]:
import pandas as pd

In [28]:
# Load the travel survey table, then show its dimensions and first rows.
csv_path = './data/travel.csv'
travel_df = pd.read_csv(csv_path)
print(travel_df.shape)
travel_df.head()

(34572, 15)


Unnamed: 0,GENDER,AGE_GRP,TRAVEL_STYL_1,TRAVEL_STYL_2,TRAVEL_STYL_3,TRAVEL_STYL_4,TRAVEL_STYL_5,TRAVEL_STYL_6,TRAVEL_STYL_7,TRAVEL_STYL_8,TRAVEL_MOTIVE_1,TRAVEL_COMPANIONS_NUM,VISIT_AREA_NM,MVMN_NM,DGSTFN
0,남,30.0,1.0,4.0,2.0,2.0,6.0,2.0,2.0,7.0,3.0,3.0,미스틱3도,자가용,5.0
1,남,20.0,4.0,1.0,5.0,1.0,1.0,4.0,1.0,6.0,3.0,1.0,에스제이렌트카,대중교통 등,4.0
2,여,50.0,4.0,1.0,2.0,4.0,3.0,3.0,2.0,3.0,1.0,3.0,법환식당,대중교통 등,5.0
3,남,30.0,1.0,1.0,1.0,5.0,6.0,3.0,5.0,7.0,7.0,3.0,에코랜드호텔,자가용,5.0
4,여,20.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,1.0,2.0,윤스타피자앤파스타,자가용,4.0


In [29]:
# These columns hold whole-number codes but are read in as floats (30.0, 4.0, ...).
# Cast them to pandas' nullable integer dtype so they display and behave as integers.
int_cols = [
    'AGE_GRP',
    'TRAVEL_STYL_1',
    'TRAVEL_STYL_2',
    'TRAVEL_STYL_3',
    'TRAVEL_STYL_4',
    'TRAVEL_STYL_5',
    'TRAVEL_STYL_6',
    'TRAVEL_STYL_7',
    'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',
    'TRAVEL_COMPANIONS_NUM',
]
travel_df[int_cols] = travel_df[int_cols].astype('Int64')

travel_df.head()

Unnamed: 0,GENDER,AGE_GRP,TRAVEL_STYL_1,TRAVEL_STYL_2,TRAVEL_STYL_3,TRAVEL_STYL_4,TRAVEL_STYL_5,TRAVEL_STYL_6,TRAVEL_STYL_7,TRAVEL_STYL_8,TRAVEL_MOTIVE_1,TRAVEL_COMPANIONS_NUM,VISIT_AREA_NM,MVMN_NM,DGSTFN
0,남,30,1,4,2,2,6,2,2,7,3,3,미스틱3도,자가용,5.0
1,남,20,4,1,5,1,1,4,1,6,3,1,에스제이렌트카,대중교통 등,4.0
2,여,50,4,1,2,4,3,3,2,3,1,3,법환식당,대중교통 등,5.0
3,남,30,1,1,1,5,6,3,5,7,7,3,에코랜드호텔,자가용,5.0
4,여,20,5,3,3,3,3,3,3,5,1,2,윤스타피자앤파스타,자가용,4.0


In [30]:
from sklearn.model_selection import train_test_split

# DGSTFN is the regression target; every other column is an input feature.
target_col = "DGSTFN"
y = travel_df[target_col]
X = travel_df.drop(columns=[target_col])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [31]:
from catboost import Pool

# Every input column is passed to CatBoost as a categorical feature.
cat_features = [
    'GENDER',
    'AGE_GRP',
    'TRAVEL_STYL_1',
    'TRAVEL_STYL_2',
    'TRAVEL_STYL_3',
    'TRAVEL_STYL_4',
    'TRAVEL_STYL_5',
    'TRAVEL_STYL_6',
    'TRAVEL_STYL_7',
    'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1',
    'TRAVEL_COMPANIONS_NUM',
    'VISIT_AREA_NM',
    'MVMN_NM',
]

# Wrap the train and test splits in CatBoost pools sharing the same categorical spec.
X_train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
X_test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [32]:
from catboost import CatBoostRegressor

# Hyperparameters gathered in one mapping so they are easy to inspect and tweak.
cb_params = {
    'n_estimators': 500,      # number of boosting iterations (internal estimators)
    'depth': 5,               # maximum depth of each individual tree
    'learning_rate': 0.03,
    'loss_function': 'RMSE',  # loss function (default)
    'eval_metric': 'RMSE',    # evaluation metric (default)
}
cb_reg = CatBoostRegressor(**cb_params)

# NOTE(review): the held-out test pool is also used as the eval set, so the
# reported "test" RMSE may not be an independent estimate if it influences
# which iteration is kept — confirm this is intended.
# verbose=100 logs progress every 100 iterations.
cb_reg.fit(X_train_pool, eval_set=X_test_pool, verbose=100)

0:	learn: 0.8340689	test: 0.8384104	best: 0.8384104 (0)	total: 124ms	remaining: 1m 2s
100:	learn: 0.7966735	test: 0.7964172	best: 0.7964172 (100)	total: 3.52s	remaining: 13.9s
200:	learn: 0.7849713	test: 0.7829998	best: 0.7829998 (200)	total: 7.02s	remaining: 10.4s
300:	learn: 0.7802725	test: 0.7786650	best: 0.7786650 (300)	total: 10.3s	remaining: 6.78s
400:	learn: 0.7770440	test: 0.7761124	best: 0.7761099 (399)	total: 13.7s	remaining: 3.38s
499:	learn: 0.7744915	test: 0.7742091	best: 0.7742091 (499)	total: 17.2s	remaining: 0us

bestTest = 0.7742091484
bestIteration = 499



<catboost.core.CatBoostRegressor at 0x22dbc100c50>

In [33]:
# Pair each training column with the importance score the fitted model assigned to it.
col_importance = pd.DataFrame({'column': X_train.columns})
col_importance['importance'] = cb_reg.feature_importances_
col_importance

Unnamed: 0,column,importance
0,GENDER,0.266119
1,AGE_GRP,6.705265
2,TRAVEL_STYL_1,6.867693
3,TRAVEL_STYL_2,7.780519
4,TRAVEL_STYL_3,4.550369
5,TRAVEL_STYL_4,10.521132
6,TRAVEL_STYL_5,8.477006
7,TRAVEL_STYL_6,7.500839
8,TRAVEL_STYL_7,7.308692
9,TRAVEL_STYL_8,9.757582


##### 추천 시스템 구축 

1. 방문지 목록을 생성
2. 사용자 특성 입력
3. 가상 만족도 예측 
4. 만족도가 높은 순으로 추천 

In [34]:
# Candidate places to recommend: every distinct visited-place name in the data.
visit_area = travel_df['VISIT_AREA_NM'].unique()

# A bare `visit_area.shape` mid-cell is discarded (only the last expression of a
# cell is displayed), so print it explicitly to actually see the candidate count.
print(visit_area.shape)
visit_area[:10]

array(['미스틱3도', '에스제이렌트카', '법환식당', '에코랜드호텔', '윤스타피자앤파스타', '서정리역 1호선',
       '동문식당', '해녀식당', '알맞은시간', '애월온기'], dtype=object)

In [35]:
# User profile, given in the model's feature order:
# 'GENDER', 'AGE_GRP', 'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3',
# 'TRAVEL_STYL_4', 'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7',
# 'TRAVEL_STYL_8', 'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
# 'VISIT_AREA_NM', 'MVMN_NM'
# The 'VISIT_AREA_NM' slot ('방문지') is only a placeholder; it is replaced by
# each candidate place below.
user_input = ['여', 60, 4, 4, 4, 4, 4, 4, 4, 4, 1, 2, '방문지', '자가용']

# Use the feature order stored in the fitted model so the columns always line up.
feature_names = cb_reg.feature_names_
if len(user_input) != len(feature_names):
    raise ValueError(
        f"user_input has {len(user_input)} values but the model expects "
        f"{len(feature_names)} features"
    )

# One row per candidate place: the user's answers are broadcast to every row and
# only the place column varies. `user_input` itself is left unmodified.
candidates = pd.DataFrame({
    name: (visit_area if name == 'VISIT_AREA_NM' else value)
    for name, value in zip(feature_names, user_input)
})

# Score all candidates in a single batch call instead of one predict() per place.
# .tolist() keeps `pred_results` a plain list of floats, as before.
pred_results = cb_reg.predict(candidates).tolist()

pred_results[:10]

[4.408251197001098,
 4.133894742071065,
 4.393303950922238,
 4.3010761891789695,
 4.130003183901464,
 4.106543332888946,
 4.052051431474843,
 4.130003183901464,
 4.247649201780655,
 4.158688958045763]

In [36]:
# Line up each candidate place with its predicted satisfaction score.
ranking_columns = {
    'VISIT_AREA_NM': visit_area,
    'DGSTFN_PRED': pred_results,
}
result_df = pd.DataFrame(ranking_columns)

# Show the ten places with the highest predicted score, best first.
result_df.sort_values(by='DGSTFN_PRED', ascending=False).iloc[:10]

Unnamed: 0,VISIT_AREA_NM,DGSTFN_PRED
1410,제주추사관,4.59576
6679,캠파제주,4.588611
6959,BMW드라이빙센터,4.574016
5482,노을해안1014,4.572318
216,제주신라호텔,4.569614
481,하라케케,4.565999
3113,협재해수욕장 야영장,4.565328
1956,호텔화인 제주,4.56385
1675,호텔윈스토리,4.555983
5583,원앙폭포,4.551821
