#  3. 특정 업체(화양156)의 메뉴 긍정 리뷰 분석

## 목차 

 
    1. 화양156 데이터 전처리
    2. 화양156의 긍정 리뷰 데이터 추출
    3. 긍정 리뷰 데이터의 LSA이용한 유사도 측정 
    4. 코사인 유사도 이용하여 특정 메뉴에 대한 유사도 높은 리뷰 추출
    5. 결과

In [1]:
import pandas as pd
import networkx as nx 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

import re
from wordcloud import WordCloud 
from wordcloud import STOPWORDS

from konlpy.tag import *
from konlpy.tag import Okt
import konlpy
import nltk
import pandas as pd
from math import log

import numpy as np
from numpy import array
from numpy import dot
from numpy.linalg import norm
from scipy.linalg import svd

In [2]:
# 윈도우의 한글 폰트 설정
plt.rc('font', family = 'Malgun Gothic')

# 시각화 그래프가 노트북 안에 보이게 하기
%matplotlib inline
plt.rcParams['axes.unicode_minus'] = False

# warning 메세지 
warnings.simplefilter("ignore")

## 1. 화양156 데이터 전처리

In [3]:
# data확인
df = pd.read_csv('./data/data_review.csv',encoding='utf-8')
df

Unnamed: 0,업체명,카테고리,메뉴,맛,양,배달,리뷰,date
0,전주석쇠불고기-본점,한식,파절이매콤통삼겹（2~3인）（공기밥2＋김치찌개＋밑반찬＋쌈）/1,5.0,5.0,5.0,자주시켜먹는 단골집인데 항상변치않고 맛있습니다! !!,2017년 12월 6일 수요일
1,전주석쇠불고기-본점,한식,통삼겹살 2人（고기＋공기밥2＋김치찌개＋쌈＋밑반찬）/1(추가 선택(고기 추가)),5.0,5.0,5.0,"배달 시간도 오래걸리지 않고, 양이 적을거 같아서 고기 추가를 했는데..안해도 됐었...",2017년 9월 30일 토요일
2,전주석쇠불고기-본점,한식,통삼겹살 2人（고기＋공기밥2＋김치찌개＋쌈＋밑반찬）/1,,,,굿굿,2017년 9월 23일 토요일
3,전주석쇠불고기-본점,한식,통삼겹（小／500g）（냉열무국수 or 냉열무우동 or 비빔열무국수＋쌈）/1(메뉴 선...,5.0,5.0,5.0,배달도빠르고맛나요ㅁ,2018년 3월 13일 화요일
4,전주석쇠불고기-본점,한식,통삼겹（3~4인）（김치찌개＋공기밥3＋밑반찬＋쌈）/1,5.0,5.0,5.0,찌개와 통삼겹 맛있는 5찬. 많이 먹는편이라 3인분짜리 주문했는데... 배터지는줄 ...,2018년 2월 27일 화요일
...,...,...,...,...,...,...,...,...
28912,동강,중식,,,,,좋아요,2013년 12월 31일 화요일
28913,동강,중식,,,,,맛있ㅇㅓ욧,2013년 12월 29일 일요일
28914,동강,중식,,,,,맛이게다,2013년 12월 25일 수요일
28915,동강,중식,,,,,여기 맛있어요 이근처 탕수육은 냄새?ㅜㅜ돼지냄새?나는 곳이 너무 많은데 여긴안그래요...,2013년 12월 10일 화요일


## 2. 화양156의 긍정 리뷰 데이터 추출

In [4]:
# 화양156 리뷰 데이터
df_HW = df[df['업체명'] == '화양156']

# 화양156 점수있는 데이터
df_HW = df_HW[df_HW['맛'].notnull() & df_HW['양'].notnull() & df_HW['배달'].notnull()]

In [5]:
df_HW['총점'] = df_HW['맛']+ df_HW['양']+df_HW['배달']
df_HW_Good = df_HW[ ( df_HW['총점'] >= 12 )& (df_HW['맛'] >= 4 )]
df_HW_Good['리뷰'] =df_HW_Good['리뷰'].str.replace('[^a-zA-Z0-9가-힣\s]', '', regex=True)
df_HW_Good

Unnamed: 0,업체명,카테고리,메뉴,맛,양,배달,리뷰,date,총점
23215,화양156,분식,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(치즈),맛 선택(순한맛),...",4.0,5.0,5.0,맛있었어요 배달도 빠르게 잘왔구요 떡이 생각보다 딱딱한 느낌이 좀있었어요,2018년 4월 28일 토요일,14.0
23216,화양156,분식,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(차돌),맛 선택(매운맛),...",5.0,5.0,5.0,정말 푸짐하고 맛있는데다 샐러드서비스에 친절하시기까지 앞으로 떡볶이는 여기서만 시킬...,2018년 4월 3일 화요일,15.0
23217,화양156,분식,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(라면),맛 선택(순한맛),...",4.0,3.0,5.0,크림 떡볶이를 시켰는데 첫 맛은 좋았으나 중간부터 너무 느끼해서 결국 못 먹고 버렸...,2018년 4월 15일 일요일,12.0
23218,화양156,분식,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(떡 추가),맛 선택(매운맛...",5.0,5.0,5.0,맛있어요 생각보다 양이 많아서 놀랐습니다 다음에도 또 시킬게오,2018년 5월 5일 토요일,15.0
23219,화양156,분식,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이))",4.0,4.0,4.0,리뷰보고 주문했는데실망아니라서 좋았어요 일단 아들이 매운맛을 잘 못 먹어서순한맛으로...,2018년 5월 22일 화요일,12.0
...,...,...,...,...,...,...,...,...,...
23760,화양156,분식,,4.0,4.0,4.0,크림떡볶이 파스타 추가해서 그런지 소스 많이 요청했는데도 소스가 없어서 너무 뻑뻑해...,2018년 5월 26일 토요일,12.0
23761,화양156,분식,,5.0,5.0,5.0,처음시켜먹어봤는데 진짜 앞으로 자주시켜 먹을거같아요 맛양 둘다 만족했어요,2018년 3월 31일 토요일,15.0
23765,화양156,분식,,5.0,5.0,5.0,주문누락때문에 오래걸렸지만 전화드리자마자 10분만에왔네요 일부러늦으신것도 아닌데 서...,2017년 12월 10일 일요일,15.0
23766,화양156,분식,,5.0,5.0,5.0,4번째먹는데 언제나 맛잇어용,2017년 11월 12일 일요일,15.0


In [6]:
df_HW_Good['menu_review'] = df_HW_Good['메뉴'] +' '+df_HW_Good['리뷰']

In [7]:
df_HW_Good['menu_review'] = df_HW_Good['menu_review'].astype('str')

In [8]:
list(df_HW_Good['menu_review'])[0]

'화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(치즈),맛 선택(순한맛),떡볶이 선택(화양떡볶이)) 맛있었어요 배달도 빠르게 잘왔구요 떡이 생각보다 딱딱한 느낌이 좀있었어요'

In [9]:
df_HW_Good.메뉴.unique()

array(['화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(치즈),맛 선택(순한맛),떡볶이 선택(화양떡볶이))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(차돌),맛 선택(매운맛),떡볶이 선택(차돌떡볶이),주먹밥 변경(주먹밥을 볶음밥으로 변경))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(라면),맛 선택(순한맛),떡볶이 선택(크림떡볶이))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(떡 추가),맛 선택(매운맛),떡볶이 선택(차돌떡볶이))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(차돌떡볶이))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(매운맛),떡볶이 선택(화양떡볶이),주먹밥 변경(주먹밥을 볶음밥으로 변경))',
       '화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(매운맛),떡볶이 선택(화양떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(파스타),맛 선택(순한맛),떡볶이 선택(간장떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(치즈,차돌),맛 선택(순한맛),떡볶이 선택(차돌떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(치즈),맛 선택(매운맛),떡볶이 선택(화양떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(우동),맛 선택(순한맛),떡볶이 선택(화양떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(어묵 추가),맛 선택(매운맛),떡볶이 선택(화양떡볶이))',
       '화양세트2（떡볶이＋볶음밥＋샐러드＋음료）/1(토핑 추가 선택(비엔나,우동),맛 선택(순한맛),떡볶이 선택(차돌떡볶이

In [10]:
ko_text = ''
for i in df_HW_Good['menu_review']:
    ko_text += str(i)+' '

In [11]:
okt = Okt()
words = okt.nouns(ko_text)

In [12]:
ko_menu = ''
for i in df_HW_Good['메뉴']:
    ko_menu += str(i)+' '
okt = Okt()
menu = okt.nouns(ko_menu)
len(menu)

7035

## 3. 긍정 리뷰 데이터의 LSA이용한 유사도 측정

In [None]:
'''
words = konlpy.tag.Twitter().pos(ko_text)

parser = nltk.RegexpParser('NP: {<N.*>*<Suffix>?}')
chunks = parser.parse(words)

noun_chunks = []
for subtree in chunks.subtrees():
    if subtree.label()=='NP':
        noun_chunks.append(' '.join((e[0] for e in list(subtree))))
for i in noun_chunks:
    texts = i+' '
'''

In [13]:
okt = Okt()
words = okt.nouns(ko_text)

In [14]:
df_HW_Good_reviews = pd.DataFrame(columns= (set(words)))
df_HW_Good_reviews
for i in df_HW_Good['리뷰']:
    row = []
    one_review = i
    for noun in df_HW_Good_reviews.columns:
        row.append(one_review.count(noun))
    df_HW_Good_reviews.loc[len(df_HW_Good_reviews)] = row
df_HW_Good_reviews =  df_HW_Good_reviews.astype('float')

In [15]:
df_HW_Good_reviews.index= df_HW_Good['메뉴'].astype('str')

In [16]:
df_HW_Good_reviews

Unnamed: 0_level_0,세분,란,작년,그게,비쥬,불지,존맛,음,임신,한번,...,남,이적,듬,토핑,잘먹엇슴다,왓어,향,먹음,습,거
메뉴,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(치즈),맛 선택(순한맛),떡볶이 선택(화양떡볶이))",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(차돌),맛 선택(매운맛),떡볶이 선택(차돌떡볶이),주먹밥 변경(주먹밥을 볶음밥으로 변경))",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(라면),맛 선택(순한맛),떡볶이 선택(크림떡볶이))",0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(떡 추가),맛 선택(매운맛),떡볶이 선택(차돌떡볶이))",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이))",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
len(words)

11258

In [18]:
df_HW_Terms_reivews = df_HW_Good_reviews.T
N = len(df_HW_Good_reviews)
# df_t = []
idf = []
for tf in df_HW_Terms_reivews.values:
    tf = tf.tolist().count(0)
    idf.append(round(log( N / (1+(N-tf)) ),4))
idf[:5]

[5.5195, 4.2667, 5.5195, 5.5195, 5.5195]

In [19]:
import numpy as np
idf= np.array(idf)
df_idf = pd.DataFrame(data=idf.transpose(), index=df_HW_Terms_reivews.index, columns=['idf'])

In [20]:
df_idf

Unnamed: 0,idf
세분,5.5195
란,4.2667
작년,5.5195
그게,5.5195
비쥬,5.5195
...,...
왓어,5.5195
향,3.5735
먹음,5.5195
습,1.2290


In [21]:
df_HW_Terms_reivews

메뉴,"화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(치즈),맛 선택(순한맛),떡볶이 선택(화양떡볶이))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(차돌),맛 선택(매운맛),떡볶이 선택(차돌떡볶이),주먹밥 변경(주먹밥을 볶음밥으로 변경))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(라면),맛 선택(순한맛),떡볶이 선택(크림떡볶이))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(토핑 추가 선택(떡 추가),맛 선택(매운맛),떡볶이 선택(차돌떡볶이))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이)).1","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이)).2","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이)).3","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(순한맛),떡볶이 선택(차돌떡볶이))","화양세트3（떡볶이＋튀김＋주먹밥＋음료）/1(맛 선택(매운맛),떡볶이 선택(화양떡볶이),주먹밥 변경(주먹밥을 볶음밥으로 변경))",...,간장 떡볶이/1,간장 떡볶이/1.1,"YOLO 떡볶이/1(토핑 추가 선택(차돌),맛 선택(순한맛),떡볶이 선택(차돌떡볶이)),야채튀김/1","YOLO 떡볶이/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이)),셀프 주먹밥/1,새우볶음밥/2","YOLO 떡볶이/1(맛 선택(순한맛),떡볶이 선택(화양떡볶이)),김말이 /1,야채튀김/1,통새우튀김/1,순대튀김 /1,오징어튀김 /1",nan,nan.1,nan.2,nan.3,nan.4
세분,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
란,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
작년,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
그게,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
비쥬,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
왓어,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
향,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
먹음,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
습,0.0,0.0,2.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3.0


In [22]:
# TF-IDF matrix 생성
tf_idf = pd.DataFrame(index = df_HW_Terms_reivews.index, columns=range(N))
tf_idf
for i in range(len(df_HW_Terms_reivews.index)):
    tf_idf.iloc[i] = round(df_HW_Terms_reivews.iloc[i]*idf[i],4)

In [23]:
tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,489,490,491,492,493,494,495,496,497,498
세분,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
란,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
작년,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
그게,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
비쥬,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
왓어,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
향,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
먹음,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
습,0.0,0.0,2.458,1.229,0.0,1.229,0.0,1.229,0.0,0.0,...,0.0,0.0,0.0,1.229,0.0,0.0,0.0,1.229,0.0,3.687


In [24]:
# SVD 계산하기 위해 df를 numpy로, data type을 float으로 설정했습니다.
TF_IDF = tf_idf.to_numpy( dtype='float')
TF_IDF

array([[0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 0.    ],
       [0.    , 0.    , 2.458 , ..., 1.229 , 0.    , 3.687 ],
       [0.    , 0.    , 0.    , ..., 0.    , 0.    , 1.8951]])

In [25]:
# Singular-Value Decomposition

# SVD
U, sigma, VT = svd(TF_IDF)

print('U : ' ,U,'\n')
print('시그마 : ',sigma,'\n')
print('V^T : ',VT,'\n')

U :  [[-5.63872284e-03 -8.38193498e-03 -1.13216119e-03 ... -1.72862947e-03
   7.45903409e-02  2.63568915e-02]
 [-2.25743646e-02  8.55392620e-04  1.02072317e-02 ...  3.04909199e-03
  -1.76392011e-02 -2.02068478e-02]
 [-1.16157143e-03 -1.91698066e-03  2.05023302e-03 ... -9.26740395e-04
  -1.57305486e-02  1.77971756e-02]
 ...
 [-2.58570132e-03 -3.86751668e-03 -8.95509912e-04 ...  8.06131694e-01
  -3.10471218e-02  1.96802244e-02]
 [-1.04043682e-01 -9.59974609e-02 -1.22868190e-02 ... -5.20417043e-18
   2.77555756e-17  2.38524478e-18]
 [-1.15463872e-01 -5.50612724e-03 -3.85350258e-03 ...  3.46944695e-18
   1.04083409e-17  4.16333634e-17]] 

시그마 :  [1.05109240e+02 5.29059140e+01 4.74875774e+01 4.34617949e+01
 4.20292125e+01 4.05498179e+01 3.93877803e+01 3.85466052e+01
 3.76434697e+01 3.71778214e+01 3.66718181e+01 3.51078011e+01
 3.47915862e+01 3.44248219e+01 3.41136702e+01 3.35429359e+01
 3.30686438e+01 3.24481999e+01 3.20933897e+01 3.15597029e+01
 3.13924359e+01 3.11736469e+01 3.06559241e+01

In [26]:
Sigma = np.diag(sigma)
Sigma

U_df = pd.DataFrame(U)
U_df

Sigma_df = pd.DataFrame(Sigma)
Sigma_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,489,490,491,492,493,494,495,496,497,498
0,105.10924,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
1,0.00000,52.905914,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
2,0.00000,0.000000,47.487577,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
3,0.00000,0.000000,0.000000,43.461795,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
4,0.00000,0.000000,0.000000,0.000000,42.029212,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.127105e-15,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00
495,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,8.127105e-15,0.000000e+00,0.000000e+00,0.000000e+00
496,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,8.127105e-15,0.000000e+00,0.000000e+00
497,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000e+00,0.000000e+00,0.000000e+00,8.127105e-15,0.000000e+00


In [27]:
# pick two left singular vectors
U_df.iloc[:, :2]

Unnamed: 0,0,1
0,-0.005639,-0.008382
1,-0.022574,0.000855
2,-0.001162,-0.001917
3,-0.001387,-0.001626
4,-0.002003,-0.000641
...,...,...
956,-0.005632,0.001531
957,-0.025211,-0.017177
958,-0.002586,-0.003868
959,-0.104044,-0.095997


In [28]:
# The first two singular values
Sigma_df.iloc[:2,:2]

Unnamed: 0,0,1
0,105.10924,0.0
1,0.0,52.905914


In [29]:
LSA= U_df.iloc[:, :2].dot(Sigma_df.iloc[:2,:2])
LSA

Unnamed: 0,0,1
0,-0.592682,-0.443454
1,-2.372774,0.045255
2,-0.122092,-0.101420
3,-0.145783,-0.086051
4,-0.210534,-0.033890
...,...,...
956,-0.592003,0.081011
957,-2.649937,-0.908755
958,-0.271781,-0.204615
959,-10.935952,-5.078833


In [30]:
LSA = LSA.set_axis(df_HW_Terms_reivews.index, axis='index')
LSA

Unnamed: 0,0,1
세분,-0.592682,-0.443454
란,-2.372774,0.045255
작년,-0.122092,-0.101420
그게,-0.145783,-0.086051
비쥬,-0.210534,-0.033890
...,...,...
왓어,-0.592003,0.081011
향,-2.649937,-0.908755
먹음,-0.271781,-0.204615
습,-10.935952,-5.078833


In [31]:
#코사인유사도를 사용해 모든 similarity 계산  
for n in range(len(LSA)-1):
    for n2 in range(n+1, len(LSA)):
        a,b = LSA.iloc[n], LSA.iloc[n2]
        CS = round(dot(a, b)/(norm(a)*norm(b)),4)

In [32]:
df_HW_Terms_reivews_CS = pd.DataFrame(columns=df_HW_Terms_reivews.index, index=df_HW_Terms_reivews.index)
for n in range(len(LSA)-1):
    for n2 in range(n+1, len(LSA)):
        a,b = LSA.iloc[n], LSA.iloc[n2]
        df_HW_Terms_reivews_CS.iloc[n,n2] = round(dot(a, b)/(norm(a)*norm(b)),4)

In [33]:
df_HW_Terms_reivews_CS

Unnamed: 0,세분,란,작년,그게,비쥬,불지,존맛,음,임신,한번,...,남,이적,듬,토핑,잘먹엇슴다,왓어,향,먹음,습,거
세분,,0.7891,0.9987,0.9941,0.8857,0.9976,0.7476,0.5758,0.0092,0.0994,...,0.5203,0.9449,,0.4098,0.8442,0.7121,0.9517,1.0,0.9785,0.8148
란,,,0.7569,0.8513,0.9841,0.8296,0.9979,0.9566,0.6215,0.6896,...,0.9351,0.9467,,0.8837,0.9954,0.9932,0.9396,0.7873,0.8988,0.9991
작년,,,,0.9872,0.861,0.9928,0.7129,0.5335,-0.0416,0.0487,...,0.4762,0.927,,0.363,0.8159,0.6755,0.9349,0.9989,0.9668,0.7843
그게,,,,,0.931,0.9992,0.8155,0.6614,0.118,0.2072,...,0.6102,0.9749,,0.5067,0.8976,0.7843,0.9795,0.9937,0.9952,0.8731
비쥬,,,,,,0.9156,0.9705,0.8895,0.4723,0.55,...,0.8573,0.9889,,0.7864,0.9966,0.9566,0.9855,0.8843,0.9624,0.9908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
왓어,,,,,,,,,,,...,,,,,,,0.8932,0.71,0.8415,0.9872
향,,,,,,,,,,,...,,,,,,,,0.9508,0.9946,0.9534
먹음,,,,,,,,,,,...,,,,,,,,,0.9779,0.8131
습,,,,,,,,,,,...,,,,,,,,,,0.9168


In [34]:
df_HW_Terms_reivews_CS = df_HW_Terms_reivews_CS.fillna(df_HW_Terms_reivews_CS.T)

In [35]:
df_HW_Terms_reivews_CS_a = df_HW_Terms_reivews_CS.copy()


## 4. 코사인 유사도 이용하여 특정 메뉴에 대한 유사도 높은 리뷰 추출

In [36]:
 df_HW_Terms_reivews_CS_a.columns

Index(['세분', '란', '작년', '그게', '비쥬', '불지', '존맛', '음', '임신', '한번',
       ...
       '남', '이적', '듬', '토핑', '잘먹엇슴다', '왓어', '향', '먹음', '습', '거'],
      dtype='object', length=961)

In [37]:
for i in df_HW_Terms_reivews_CS_a.columns:
    if df_HW_Terms_reivews_CS_a[i].sum()==0:
        df_HW_Terms_reivews_CS_a= df_HW_Terms_reivews_CS_a.drop(columns=i, index=i)
df_HW_Terms_reivews_CS_a

Unnamed: 0,세분,란,작년,그게,비쥬,불지,존맛,음,임신,한번,...,종종,남,이적,토핑,잘먹엇슴다,왓어,향,먹음,습,거
세분,,0.7891,0.9987,0.9941,0.8857,0.9976,0.7476,0.5758,0.0092,0.0994,...,0.9989,0.5203,0.9449,0.4098,0.8442,0.7121,0.9517,1.0,0.9785,0.8148
란,0.7891,,0.7569,0.8513,0.9841,0.8296,0.9979,0.9566,0.6215,0.6896,...,0.8174,0.9351,0.9467,0.8837,0.9954,0.9932,0.9396,0.7873,0.8988,0.9991
작년,0.9987,0.7569,,0.9872,0.861,0.9928,0.7129,0.5335,-0.0416,0.0487,...,0.9952,0.4762,0.927,0.363,0.8159,0.6755,0.9349,0.9989,0.9668,0.7843
그게,0.9941,0.8513,0.9872,,0.931,0.9992,0.8155,0.6614,0.118,0.2072,...,0.9981,0.6102,0.9749,0.5067,0.8976,0.7843,0.9795,0.9937,0.9952,0.8731
비쥬,0.8857,0.9841,0.861,0.931,,0.9156,0.9705,0.8895,0.4723,0.55,...,0.9068,0.8573,0.9889,0.7864,0.9966,0.9566,0.9855,0.8843,0.9624,0.9908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
왓어,0.7121,0.9932,0.6755,0.7843,0.9566,0.7588,0.9986,0.984,0.7086,0.7694,...,0.7446,0.9701,0.9027,0.9323,0.9775,,0.8932,0.71,0.8415,0.9872
향,0.9517,0.9396,0.9349,0.9795,0.9855,0.9706,0.9154,0.7989,0.3157,0.4,...,0.9652,0.7573,0.9998,0.67,0.968,0.8932,,0.9508,0.9946,0.9534
먹음,1.0,0.7873,0.9989,0.9937,0.8843,0.9974,0.7457,0.5733,0.0062,0.0964,...,0.9987,0.5177,0.9439,0.4071,0.8426,0.71,0.9508,,0.9779,0.8131
습,0.9785,0.8988,0.9668,0.9952,0.9624,0.9904,0.8685,0.7319,0.2151,0.3023,...,0.9872,0.6851,0.9921,0.589,0.9366,0.8415,0.9946,0.9779,,0.9168


In [38]:
term_list =[]
for i in df_HW_Terms_reivews_CS_a.columns:
    if '떡볶' in i:
        term_list.append(i)
term_list 

['떡볶이', '떡볶기']

In [39]:
for i in term_list :
    print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
    print(list(df_HW_Terms_reivews_CS_a.columns[df_HW_Terms_reivews_CS_a[i].values>0.999]),'\n')
    

"떡볶이" 메뉴의 리뷰 키워드(명사)
['월요일', '단골', '사이다', '짜장', '기억', '의향', '풀때', '리코', '도', '드레싱', '대도', '것', '캊', '서비스', '아이디어', '용기', '식지', '프랜차이즈', '흠', '수도', '추', '크림', '둘', '차이', '편이', '듭니', '요금', '좠', '살짝', '바로', '일품', '괜찬구', '로', '불맛', '정도', '꽤', '세명', '문자', '떡', '내심', '취향', '런가', '가격', '거기', '돈가스', '요청', '치킨', '선', '쪼', '달라', '단', '낱개', '덕', '요구사항', '구성', '까르보나라', '소시지', '진짜진짜', '쪼금', '콤달콤', '적', '서비슨데', '불리', '휴무', '달', '양호', '불고기', '이채', '건강', '변치', '튀김', '재', '조리', '양도', '하나', '짱', '통', '가성', '조만간', '멘붕', '그저께', '만족도', '굳이', '친구', '다시', '비', '굿굿', '다음', '걸', '확인', '이적', '향'] 

"떡볶기" 메뉴의 리뷰 키워드(명사)
['떡뽁이', '비도', '밸런스', '예상', '주심', '일률', '정시', '아들', '은근', '야기', '실망', '반전', '차돌박이', '단무지', '구매', '중됵', '냄새', '의사', '딸', '컬', '미니', '떡뽂', '먾다', '뎁', '드링킹'] 



In [40]:
for i in term_list :
    print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
    print(list(df_HW_Terms_reivews_CS_a[ df_HW_Terms_reivews_CS_a[i] ==df_HW_Terms_reivews_CS_a[i].max()].index),'\n')

"떡볶이" 메뉴의 리뷰 키워드(명사)
['단골', '기억', '도', '드레싱', '서비스', '흠', '둘', '듭니', '살짝', '괜찬구', '로', '정도', '내심', '취향', '가격', '돈가스', '쪼', '단', '까르보나라', '진짜진짜', '쪼금', '적', '변치', '재', '양도', '하나', '조만간', '그저께', '만족도', '굳이'] 

"떡볶기" 메뉴의 리뷰 키워드(명사)
['밸런스', '은근', '냄새', '뎁'] 



In [44]:
term_list =[]
for i in df_HW_Terms_reivews_CS_a.columns:
    if '김말이' in i:
        term_list.append(i)
print(term_list,'\n')

# for i in term_list :
#     print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
#     print(list(df_HW_Terms_reivews_CS_a.columns[df_HW_Terms_reivews_CS_a[i].values>0.999]),'\n')
    

for i in term_list :
    print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
    print(list(df_HW_Terms_reivews_CS_a[ df_HW_Terms_reivews_CS_a[i] ==df_HW_Terms_reivews_CS_a[i].max()].index),'\n')

['김말이'] 

"김말이" 메뉴의 리뷰 키워드(명사)
['나머지', '끈', '기도', '머리카락', '즉떡', '거의', '세번', '쓰기', '다섯', '이집', '무엇', '플라스틱', '조금', '양심', '결', '맛보기', '와중', '찌개', '크림소스', '사리', '강추', '전형', '남자친구', '빼'] 



In [46]:
term_list =[]
for i in df_HW_Terms_reivews_CS_a.columns:
    if '주먹밥' in i:
        term_list.append(i)
print(term_list,'\n')

# for i in term_list :
#     print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
#     print(list(df_HW_Terms_reivews_CS_a.columns[df_HW_Terms_reivews_CS_a[i].values>0.999]),'\n')
    

for i in term_list :
    print('"'+i+'"','메뉴의 리뷰 키워드(명사)')
    print(list(df_HW_Terms_reivews_CS_a[ df_HW_Terms_reivews_CS_a[i] ==df_HW_Terms_reivews_CS_a[i].max()].index),'\n')

['주먹밥'] 

"주먹밥" 메뉴의 리뷰 키워드(명사)
['면', '콜라', '끼', '혼밥', '군데', '해도', '일부러', '고생', '존', '먼저', '정리', '음료', '영', '다이어트', '보시', '이제', '짱맛'] 



## 5. 결과

화양156의 데이터에서 총점(맛+양+배달) 12점 이상, 맛 4점 이상의 데이터를 긍정 리뷰 데이터로 판단했습니다.

해당 리뷰의 데이터를 전처리 후 tf-idf 생성하고 LSA를 시행했습니다.

이를 통해 각 리뷰 내용(명사 구)에 대한 코사인 유사도를 구할 수 있었고,
입력 메뉴에 대한 리뷰를 확인할 수 있었습니다.