In [242]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, MaxAbsScaler


# Korean font setup for matplotlib
import matplotlib.pyplot as plt
import platform
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
# Draw minus signs with the plain ASCII hyphen; presumably the Korean fonts
# selected below lack the Unicode minus glyph (TODO confirm).
plt.rcParams['axes.unicode_minus'] = False

# Choose the plot font according to the operating system reported by `platform`.
system_name = platform.system()
if system_name == 'Windows':
    # On Windows the family name is read from the font file itself.
    path = "c:/Windows/Fonts/malgun.ttf"
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif system_name in ('Darwin', 'Linux'):
    rc('font', family={'Darwin': 'AppleGothic', 'Linux': 'NanumBarunGothic'}[system_name])
else:
    # Any other platform: leave matplotlib's default font and just report it.
    print('Unknown system... sorry~')

In [243]:
# Load the two review tables from the shared data directory.
path = "../datas/"
nv_data = pd.read_csv(path + 'v5_category_re.csv')
kk_data = pd.read_csv(path + 'kakao_review_cat_revised.csv', index_col=0)

# Keep only rows that have user_info and whose 'star' cell is not the literal
# text 'star' (presumably header lines repeated inside the CSV -- TODO confirm).
# .copy() detaches the filtered result so the column assignments below write to
# an independent frame instead of raising SettingWithCopyWarning.
nv_data = nv_data.dropna(subset=['user_info'])
nv_data = nv_data[nv_data['star'] != 'star'].copy()

# user_info: keep only the text after the last newline.
nv_data['user_info'] = nv_data['user_info'].str.split('\n').str[-1]

# visit_info: keep the whole number in front of '번째'. Taking just the last
# character before it (as before) truncated multi-digit counts, e.g. '12번째' -> '2'.
nv_data['visit_info'] = nv_data['visit_info'].str.extract(r'(\d+)\s*번째', expand=False)

len(nv_data)

120983

In [244]:
# Convert the two score columns to float64 in one pass.
# visit_info is not converted because it is dropped on the next line.
nv_data = nv_data.astype({'star': 'float64', 'user_info': 'float64'})

# Remove the columns that are not used further. `columns=` replaces the
# positional axis argument (`drop(labels, 1)`), which pandas 2.0 no longer accepts.
nv_data = nv_data.drop(columns=['addr', 'base_addr', 'user_name', 'visit_info'])

# Rename to the column names used by the rest of the notebook.
nv_data = nv_data.rename(columns={
    'title': 'camp',
    'highlight_review': 'review',
    'star': 'point',
    'user_info': 'avg_point',
})

In [245]:
# Keep the five working columns in a fixed order and make sure both score
# columns are float64.
nv_data = nv_data.loc[:, ['camp', 'review', 'point', 'category', 'avg_point']].astype(
    {'point': 'float64', 'avg_point': 'float64'}
)

In [246]:
# Stack both review tables row-wise. The axis is passed by keyword because
# pandas 2.0 accepts only `objs` positionally in pd.concat.
reviews_df = pd.concat([nv_data, kk_data], axis=0)
reviews_df.head(2)

Unnamed: 0,camp,review,point,category,avg_point
0,진달래 관광농원 캠핑장,데크가 너무 작아요,3.0,메인시설,4.3
1,진달래 관광농원 캠핑장,따뜻한물 잘 나와요,4.0,메인시설,5.0


In [247]:
# Raw weight per review: point * (point / avg_point).
# The index is renumbered so the rows line up with the scaler output below,
# which gets a fresh 0..n-1 index when wrapped in a DataFrame.
reviews_df = reviews_df.reset_index(drop=True)
reviews_df['weights'] = reviews_df['point'].mul(
    reviews_df['point'].div(reviews_df['avg_point'])
)

# Scale the raw weights with RobustScaler; the result is a one-column frame
# whose column label is 0.
rb = RobustScaler()
rb_df = pd.DataFrame(rb.fit_transform(reviews_df[['weights']]))

In [248]:
# Summary statistics of the RobustScaler output, to inspect its spread.
rb_df.describe()

Unnamed: 0,0
count,129840.0
mean,-0.368762
std,1.216199
min,-4.263409
25%,-0.852682
50%,0.0
75%,0.147318
max,17.053634


In [249]:
# Name the scaled column 'weights2' and shrink it by a factor of 0.01.
rb_df = rb_df.rename(columns={0: 'weights2'})
rb_df['weights2'] = rb_df['weights2'] * 0.01

# Attach the scaled weights column-wise. The axis is passed by keyword because
# pandas 2.0 accepts only `objs` positionally in pd.concat.
re_df = pd.concat([reviews_df, rb_df], axis=1)

# final_point = point * (1 + weights2); a later cell min-max scales it and
# multiplies by 5 to land in the 0-5 range.
re_df['final_point'] = re_df['point'] * (1 + re_df['weights2'])
re_df.head(2)

Unnamed: 0,camp,review,point,category,avg_point,weights,weights2,final_point
0,진달래 관광농원 캠핑장,데크가 너무 작아요,3.0,메인시설,4.3,2.093023,-0.024787,2.925638
1,진달래 관광농원 캠핑장,따뜻한물 잘 나와요,4.0,메인시설,5.0,3.2,-0.015348,3.938607


In [250]:
# Min-max scale final_point; the result is wrapped in a one-column frame
# (column label 0) with a fresh 0..n-1 index.
mm = MinMaxScaler()
mm_df = mm.fit_transform(re_df[['final_point']])
mm_df = pd.DataFrame(mm_df)

# Stretch to 0-5 and round to one decimal. The values are written back by
# position (.to_numpy()) so the result does not depend on re_df and mm_df
# sharing the same index labels.
re_df['final_point'] = (mm_df[0] * 5).round(1).to_numpy()

# The intermediate weight columns are no longer needed. `columns=` replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
re_df = re_df.drop(columns=['weights', 'weights2'])

In [251]:
# Preview the first two rows after rescaling final_point.
re_df.head(2)

Unnamed: 0,camp,review,point,category,avg_point,final_point
0,진달래 관광농원 캠핑장,데크가 너무 작아요,3.0,메인시설,4.3,2.5
1,진달래 관광농원 캠핑장,따뜻한물 잘 나와요,4.0,메인시설,5.0,3.4


In [252]:
# Mean of every numeric score per (camp, category). numeric_only=True skips the
# text column `review`; without it pandas 2.0 raises instead of silently
# dropping non-numeric columns.
re_df2 = re_df.groupby(['camp', 'category']).mean(numeric_only=True).reset_index()

# Number of reviews per (camp, category).
re_df3 = re_df.groupby(['camp', 'category']).size().reset_index(name='count')

# Join means and counts on the explicit group keys.
re_df4 = pd.merge(re_df2, re_df3, on=['camp', 'category'])
re_df4.head(2)

Unnamed: 0,camp,category,point,avg_point,final_point,count
0,(주)데일리랜드,가격,5.0,4.7,4.3,1
1,(주)데일리랜드,만족도,4.375,4.535,3.74,20


In [222]:
#re_df4.to_csv('review_final_point.csv', encoding='utf-8-sig', index=False)