# 데이터 로딩

In [1]:
import pandas as pd
import glob, os
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 700)

# Directory holding the per-topic review files (*.data). Built with
# os.path.join so the same cell works on Windows and POSIX systems.
path = os.path.join('.', 'dataset', 'topics')
all_files = glob.glob(os.path.join(path, '*.data'))  # every *.data file in that directory
filename_list = []
opinion_text = []

for file in all_files:
    # Files are tab-separated and not valid UTF-8, hence latin1.
    # NOTE: read_table takes the first line of each file as the column header,
    # and to_string() below renders that header and the row index into the text.
    df = pd.read_table(file, encoding='latin1')

    # Drop the directory part with the platform's own separator rules, then
    # cut everything from the first '.' to get the bare topic name.
    filename = os.path.basename(file).split('.')[0]

    filename_list.append(filename)
    opinion_text.append(df.to_string())  # whole file flattened into one string

# One row per review file: topic name and its full text
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ..."
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ..."
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One..."


# 어근 변환 함수 선언

In [2]:
import string 
# Display the ASCII punctuation characters defined by the string module
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
from nltk.stem import WordNetLemmatizer
import nltk
import string   # 구두점 정보

# Translation table used to delete punctuation from a sentence.
# str.maketrans with a third argument maps the code point of every listed
# character to None, and str.translate drops characters mapped to None.
# (ord() gives the code point of a character; chr() is its inverse.)
remove_punct_dict = str.maketrans('', '', string.punctuation)

lemmar = WordNetLemmatizer()


def LemNormalize(text):
    """Normalize a sentence into a list of lemmatized tokens.

    Steps: lower-case the text, remove punctuation, split into word tokens
    with nltk.word_tokenize, then lemmatize each token with WordNetLemmatizer.
    Stop words are NOT removed here.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    lemmas = []
    for word in nltk.word_tokenize(cleaned):
        lemmas.append(lemmar.lemmatize(word))
    return lemmas

## translate() 실습
- 문자열 내에 특정 문자를 다른 문자로 일괄 변경하는 함수
- 매개변수로 변경하고자 하는 정보를 딕셔너리로 전달
  - {원본 문자 유니코드:변경 문자 유니코드}

In [4]:
# str.maketrans(src, dst) builds the mapping dict used by str.translate:
# each character of src is mapped to the character at the same position in dst.
# The table is deliberately NOT called `map`, which would shadow the builtin
# for every later cell of the notebook.
trans_table = str.maketrans('[],', '   ')
print(trans_table)  # keys and values are Unicode code points
text = '[cat, dog, squirrel, rabbit]'
text = text.translate(trans_table)
print(text)

{91: 32, 93: 32, 44: 32}
 cat  dog  squirrel  rabbit 


# TF-IDF 기반 Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tokenizer: custom callable used to split each document into tokens
# ngram_range=(1, 2): extract unigrams and bigrams
# min_df / max_df: given as floats they are proportions of documents; terms
#   whose document frequency is below min_df or above max_df are dropped
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)
# Learn the vocabulary and turn every review document into a TF-IDF row
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

In [10]:
# (number of documents, number of TF-IDF features)
feature_vect.shape

(51, 4611)

In [14]:
import numpy as np

# Dense DataFrame view of the sparse TF-IDF matrix (one row per document)
df = pd.DataFrame(feature_vect.toarray())
# Demo column of random integers in [1, 10); sized from the frame itself so
# the cell keeps working when the number of documents is not 51
df['a'] = np.random.randint(1, 10, len(df))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4602,4603,4604,4605,4606,4607,4608,4609,4610,a
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026967,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,0.0,0.0,0.0,0.008648,0.0,0.0,0.0,0.0,0.017297,0.0,...,0.011735,0.0,0.010701,0.0,0.0,0.0,0.0,0.012075,0.0,8
2,0.0,0.0,0.0,0.0,0.0,0.0,0.021119,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
3,0.0,0.0,0.0,0.012681,0.0,0.0,0.0,0.0,0.012681,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,0.058261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


# 군집 분석
- 군집 수를 5개로 지정

In [7]:
from sklearn.cluster import KMeans

# Fit K-Means with 5 clusters on the TF-IDF matrix (fit() returns the estimator)
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feature_vect)
# Per-document cluster index and the centroid coordinates of each cluster
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

In [8]:
# Attach each document's K-Means cluster assignment as a new column
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",3
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,3
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",2
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",2


In [9]:
# Documents assigned to cluster 0, ordered by file name
document_df[document_df['cluster_label'] == 0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
50,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
13,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
14,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",0
10,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",0
38,satellite_garmin_nuvi_255W_gps,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",0
19,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,0
22,speed_windows7,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0
29,updates_garmin_nuvi_255W_gps,"Another thing to consider was that I paid $50 less for the 750 and it came with the FM transmitter cable and a USB cord to connect it to your computer for updates and downloads .\n0 update and reroute much _more_ quickly than my other GPS .\n1 UPDATE ON THIS , It finally turned out that to see the elevation contours at lowe...",0
25,video_ipod_nano_8gb,"I bought the 8, gig Ipod Nano that has the built, in video camera .\n0 Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\n1 ...",0
27,voice_garmin_nuvi_255W_gps,The voice prompts and maps are wonderful especially when driving after dark .\n0 I also thought the the voice prompts of the 750 where more pleasant sounding than the 255w's .\n1 ...,0


In [15]:
# Documents assigned to cluster 1, ordered by file name
document_df[document_df['cluster_label'] == 1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
46,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,1
35,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",1
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
48,room_holiday_inn_london,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",1
41,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",1
40,rooms_swissotel_chicago,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1
47,staff_bestwestern_hotel_sfo,Staff are friendl...,1
28,staff_swissotel_chicago,"The staff at Swissotel were not particularly nice .\n0 Each time I waited at the counter for staff for several minutes and then was waved to the desk upon my turn with no hello or anything, or apology for waiting in line .\n1 ...",1


# 군집 분석
- 군집 수를 3개로 지정

In [16]:
from sklearn.cluster import KMeans

# Refit K-Means with 3 clusters on the same TF-IDF matrix (fit() returns the estimator)
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)
# Per-document cluster index and the centroid coordinates of each cluster
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

In [17]:
# Overwrite the cluster column with the assignments from the 3-cluster model
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",1
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,0
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,1
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",2
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",2


- 클러스터 0은 호텔에 대한 리뷰로 군집

In [18]:
# Documents assigned to cluster 0, ordered by file name
document_df[document_df['cluster_label'] == 0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
46,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",0
11,food_holiday_inn_london,The room was packed to capacity with queues at the food buffets .\n0 The over zealous staff cleared our unfinished drinks while we were collecting cooked food and movement around the room with plates was difficult in the crowded circumstances .\n1 ...,0
8,food_swissotel_chicago,The food for our event was delicious .\n0 ...,0
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,0
35,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",0
30,location_holiday_inn_london,"Great location for tube and we crammed in a fair amount of sightseeing in a short time .\n0 All in all, a normal chain hotel on a nice lo...",0
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,0
5,price_holiday_inn_london,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",0
48,room_holiday_inn_london,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",0
41,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",0


- 클러스터1은 자동차에 대한 리뷰로 군집

In [19]:
# Documents assigned to cluster 1, ordered by file name
document_df[document_df['cluster_label'] == 1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
16,comfort_honda_accord_2008,"Drivers seat not comfortable, the car itself compared to other models of similar class .\n0 ...",1
15,comfort_toyota_camry_2007,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",1
32,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 ...,1
34,interior_honda_accord_2008,I love the new body style and the interior is a simple pleasure except for the center dash .\n0 ...,1
33,interior_toyota_camry_2007,"First of all, the interior has way too many cheap plastic parts like the cheap plastic center piece that houses the clock .\n0 3 blown struts at 30,000 miles, interior trim coming loose and rattling squeaking, stains on paint, and bug splats taking paint off, premature uneven brake wear, on 3rd windsh...",1
7,mileage_honda_accord_2008,"It's quiet, get good gas mileage and looks clean inside and out .\n0 The mileage is great, and I've had to get used to stopping less for gas .\n1 Thought gas ...",1
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",1
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,1
24,seats_honda_accord_2008,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",1
26,transmission_toyota_camry_2007,"After slowing down, transmission has to be kicked to speed up .\n0 ...",1


- 클러스터2는 전자기기에 대한 리뷰로 구성

In [20]:
# Documents assigned to cluster 2, ordered by file name
document_df[document_df['cluster_label'] == 2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
50,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",2
44,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",2
45,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,2
43,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",2
42,buttons_amazon_kindle,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",2
13,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,2
14,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",2
12,eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",2
10,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",2
9,fonts_amazon_kindle,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",2


# 군집별 핵심 단어 추출하기
- cluster_centers_: 각 군집 중심점(centroid)의 좌표로, (군집 수, 피처 수) 형태의 배열. 각 값은 해당 군집에 속한 문서들의 피처(단어)별 TF-IDF 평균값
- 0~1 사이의 값으로, 값이 클수록 해당 단어가 그 군집의 중심을 대표하는 핵심 단어임을 의미

In [22]:
cluster_centers = km_cluster.cluster_centers_
# Shape (3, 4611): one row per cluster (3) and one column per feature/word (4611)
print(cluster_centers.shape)
print(cluster_centers) # each row is one cluster's centroid: its coordinate along each of the 4611 word features

(3, 4611)
[[0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


## 군집별 핵심 단어 추출 함수 정의

- 매개변수 목록
  - model: KMeans 모델 객체
  - cluster_data: 리뷰 문서 데이터 및 군집 결과 값을 가지고 있는 데이터프레임 객체
  - feature_names: 피처 카운트 벡터화된 각 피처(단어)의 이름 목록(tfidf_vect가 갖고있음)
  - cluster_num: 군집 개수
  - top_n: 보고싶은 핵심단어 상위 n개

In [23]:
def get_cluster_details(model, cluster_data, feature_names, cluster_num, top_n=10):
    """Collect the highest-weighted features of each cluster centroid.

    Parameters:
        model: fitted clustering model exposing ``cluster_centers_``.
        cluster_data: accepted for interface compatibility; not read here.
        feature_names: sequence mapping a feature index to its name.
        cluster_num: number of clusters to report (clusters 0..cluster_num-1).
        top_n: how many top features to keep per cluster.

    Returns:
        dict of the form
        {0: {'cluster': 0, 'top_features': [...]}, 1: {...}, ...}
    """
    # Per cluster, feature indices ordered from largest to smallest centroid value
    ranked = model.cluster_centers_.argsort()[:, ::-1]

    details = {}
    for label in range(cluster_num):
        best = ranked[label, :top_n]  # indices of this cluster's top_n features
        details[label] = {
            'cluster': label,
            'top_features': [feature_names[i] for i in best],
        }
    return details
    

In [24]:
# Names of the features (unigrams and bigrams) kept by the fitted vectorizer,
# indexed the same way as the columns of the TF-IDF matrix
feature_names = tfidf_vect.get_feature_names_out()
print(feature_names.shape)
print(feature_names)

(4611,)
['0 5' '0 great' '0 room' ... 'zoom' '\x96' '£6']


In [26]:
# Take the cluster count from the fitted model instead of hard-coding it, so
# this cell stays correct when the model is refitted with another n_clusters.
cluster_details = get_cluster_details(km_cluster, document_df, feature_names, km_cluster.n_clusters, 10)

# Print the top-10 features of every cluster
for cluster_num, cluster_detail in cluster_details.items():
    print(f'cluster num: {cluster_num}')
    print(f"top features: {cluster_detail['top_features']}")
    print()

cluster num: 0
top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']

cluster num: 1
top features: ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']

cluster num: 2
top features: ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']



# 데이터 로딩

In [1]:
import pandas as pd
import glob, os
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 700)

# Directory holding the per-topic review files (*.data). Built with
# os.path.join so the same cell works on Windows and POSIX systems.
path = os.path.join('.', 'dataset', 'topics')
all_files = glob.glob(os.path.join(path, '*.data'))  # every *.data file in that directory
filename_list = []
opinion_text = []

for file in all_files:
    # Files are tab-separated and not valid UTF-8, hence latin1.
    # NOTE: read_table takes the first line of each file as the column header,
    # and to_string() below renders that header and the row index into the text.
    df = pd.read_table(file, encoding='latin1')

    # Drop the directory part with the platform's own separator rules, then
    # cut everything from the first '.' to get the bare topic name.
    filename = os.path.basename(file).split('.')[0]

    filename_list.append(filename)
    opinion_text.append(df.to_string())  # whole file flattened into one string

# One row per review file: topic name and its full text
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ..."
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ..."
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One..."


# 어근 변환 함수 선언

In [2]:
import string 
# Display the ASCII punctuation characters defined by the string module
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
from nltk.stem import WordNetLemmatizer
import nltk
import string   # 구두점 정보

# Translation table used to delete punctuation from a sentence.
# str.maketrans with a third argument maps the code point of every listed
# character to None, and str.translate drops characters mapped to None.
# (ord() gives the code point of a character; chr() is its inverse.)
remove_punct_dict = str.maketrans('', '', string.punctuation)

lemmar = WordNetLemmatizer()


def LemNormalize(text):
    """Normalize a sentence into a list of lemmatized tokens.

    Steps: lower-case the text, remove punctuation, split into word tokens
    with nltk.word_tokenize, then lemmatize each token with WordNetLemmatizer.
    Stop words are NOT removed here.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    lemmas = []
    for word in nltk.word_tokenize(cleaned):
        lemmas.append(lemmar.lemmatize(word))
    return lemmas

## translate() 실습
- 문자열 내에 특정 문자를 다른 문자로 일괄 변경하는 함수
- 매개변수로 변경하고자 하는 정보를 딕셔너리로 전달
  - {원본 문자 유니코드:변경 문자 유니코드}

In [4]:
# str.maketrans(src, dst) builds the mapping dict used by str.translate:
# each character of src is mapped to the character at the same position in dst.
# The table is deliberately NOT called `map`, which would shadow the builtin
# for every later cell of the notebook.
trans_table = str.maketrans('[],', '   ')
print(trans_table)  # keys and values are Unicode code points
text = '[cat, dog, squirrel, rabbit]'
text = text.translate(trans_table)
print(text)

{91: 32, 93: 32, 44: 32}
 cat  dog  squirrel  rabbit 


# TF-IDF 기반 Vectorization

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tokenizer: custom callable used to split each document into tokens
# ngram_range=(1, 2): extract unigrams and bigrams
# min_df / max_df: given as floats they are proportions of documents; terms
#   whose document frequency is below min_df or above max_df are dropped
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)
# Learn the vocabulary and turn every review document into a TF-IDF row
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

In [10]:
# (number of documents, number of TF-IDF features)
feature_vect.shape

(51, 4611)

In [14]:
import numpy as np

# Dense DataFrame view of the sparse TF-IDF matrix (one row per document)
df = pd.DataFrame(feature_vect.toarray())
# Demo column of random integers in [1, 10); sized from the frame itself so
# the cell keeps working when the number of documents is not 51
df['a'] = np.random.randint(1, 10, len(df))
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4602,4603,4604,4605,4606,4607,4608,4609,4610,a
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026967,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,0.0,0.0,0.0,0.008648,0.0,0.0,0.0,0.0,0.017297,0.0,...,0.011735,0.0,0.010701,0.0,0.0,0.0,0.0,0.012075,0.0,8
2,0.0,0.0,0.0,0.0,0.0,0.0,0.021119,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
3,0.0,0.0,0.0,0.012681,0.0,0.0,0.0,0.0,0.012681,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
4,0.058261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


# 군집 분석
- 군집 수를 5개로 지정

In [7]:
from sklearn.cluster import KMeans

# Fit K-Means with 5 clusters on the TF-IDF matrix (fit() returns the estimator)
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(feature_vect)
# Per-document cluster index and the centroid coordinates of each cluster
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

In [8]:
# Attach each document's K-Means cluster assignment as a new column
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",3
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,3
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",2
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",2


In [9]:
# Documents assigned to cluster 0, ordered by file name
document_df[document_df['cluster_label'] == 0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
50,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
13,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
14,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",0
10,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",0
38,satellite_garmin_nuvi_255W_gps,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",0
19,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,0
22,speed_windows7,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0
29,updates_garmin_nuvi_255W_gps,"Another thing to consider was that I paid $50 less for the 750 and it came with the FM transmitter cable and a USB cord to connect it to your computer for updates and downloads .\n0 update and reroute much _more_ quickly than my other GPS .\n1 UPDATE ON THIS , It finally turned out that to see the elevation contours at lowe...",0
25,video_ipod_nano_8gb,"I bought the 8, gig Ipod Nano that has the built, in video camera .\n0 Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\n1 ...",0
27,voice_garmin_nuvi_255W_gps,The voice prompts and maps are wonderful especially when driving after dark .\n0 I also thought the the voice prompts of the 750 where more pleasant sounding than the 255w's .\n1 ...,0


In [15]:
# Documents assigned to cluster 1, ordered by file name
document_df[document_df['cluster_label'] == 1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
46,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,1
35,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",1
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
48,room_holiday_inn_london,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",1
41,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",1
40,rooms_swissotel_chicago,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1
47,staff_bestwestern_hotel_sfo,Staff are friendl...,1
28,staff_swissotel_chicago,"The staff at Swissotel were not particularly nice .\n0 Each time I waited at the counter for staff for several minutes and then was waved to the desk upon my turn with no hello or anything, or apology for waiting in line .\n1 ...",1


# 군집 분석
- 군집 수를 3개로 지정

In [16]:
from sklearn.cluster import KMeans

# Build a 3-cluster K-means model and fit it on feature_vect in one step;
# fit() returns the fitted estimator itself, so it can be bound directly.
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(feature_vect)

# Keep the fitted results at hand: the cluster id assigned to each sample and
# the centroid coordinates of every cluster.
cluster_label, cluster_centers = km_cluster.labels_, km_cluster.cluster_centers_

In [17]:
# Store the cluster id of every document as a new 'cluster_label' column,
# then preview the first rows to check the assignment.
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",1
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,0
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,1
3,price_amazon_kindle,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",2
4,performance_netbook_1005ha,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",2


- 클러스터 0은 호텔에 대한 리뷰로 군집

In [18]:
# Rows labelled as cluster 0, ordered by file name for easier reading.
document_df.loc[document_df['cluster_label'].eq(0)].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
46,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",0
11,food_holiday_inn_london,The room was packed to capacity with queues at the food buffets .\n0 The over zealous staff cleared our unfinished drinks while we were collecting cooked food and movement around the room with plates was difficult in the crowded circumstances .\n1 ...,0
8,food_swissotel_chicago,The food for our event was delicious .\n0 ...,0
49,free_bestwestern_hotel_sfo,The wine reception is a great idea as it is nice to meet other travellers and great having access to the free Internet access in our room .\n0 They also have a computer available with free internet which is a nice bonus but I didn't find that out till the day before we left but was still able to get on there to check our flight to Vegas the next day .\n1 ...,0
35,location_bestwestern_hotel_sfo,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",0
30,location_holiday_inn_london,"Great location for tube and we crammed in a fair amount of sightseeing in a short time .\n0 All in all, a normal chain hotel on a nice lo...",0
1,parking_bestwestern_hotel_sfo,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,0
5,price_holiday_inn_london,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",0
48,room_holiday_inn_london,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",0
41,rooms_bestwestern_hotel_sfo,"Great Location , Nice Rooms , H...",0


- 클러스터1은 자동차에 대한 리뷰로 군집

In [19]:
# Rows labelled as cluster 1, ordered by file name for easier reading.
document_df.loc[document_df['cluster_label'].eq(1)].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
16,comfort_honda_accord_2008,"Drivers seat not comfortable, the car itself compared to other models of similar class .\n0 ...",1
15,comfort_toyota_camry_2007,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",1
32,gas_mileage_toyota_camry_2007,Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 ...,1
34,interior_honda_accord_2008,I love the new body style and the interior is a simple pleasure except for the center dash .\n0 ...,1
33,interior_toyota_camry_2007,"First of all, the interior has way too many cheap plastic parts like the cheap plastic center piece that houses the clock .\n0 3 blown struts at 30,000 miles, interior trim coming loose and rattling squeaking, stains on paint, and bug splats taking paint off, premature uneven brake wear, on 3rd windsh...",1
7,mileage_honda_accord_2008,"It's quiet, get good gas mileage and looks clean inside and out .\n0 The mileage is great, and I've had to get used to stopping less for gas .\n1 Thought gas ...",1
0,performance_honda_accord_2008,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",1
2,quality_toyota_camry_2007,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,1
24,seats_honda_accord_2008,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",1
26,transmission_toyota_camry_2007,"After slowing down, transmission has to be kicked to speed up .\n0 ...",1


- 클러스터2는 전자기기에 대한 리뷰로 구성

In [20]:
# Rows labelled as cluster 2, ordered by file name for easier reading.
document_df.loc[document_df['cluster_label'].eq(2)].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
50,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",2
44,battery-life_amazon_kindle,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",2
45,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,2
43,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",2
42,buttons_amazon_kindle,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",2
13,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,2
14,display_garmin_nuvi_255W_gps,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",2
12,eyesight-issues_amazon_kindle,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",2
10,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",2
9,fonts_amazon_kindle,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",2


# 군집별 핵심 단어 추출하기
- cluster_centers_: 각 군집 중심점(centroid)의 좌표를 담은 배열로, 행은 군집, 열은 피처(단어)에 대응함. 각 값은 해당 군집에 속한 문서들의 해당 단어 피처 값(여기서는 TF-IDF 가중치)의 평균
- 0~1 사이의 값으로, 값이 클수록 그 단어가 해당 군집에서 비중이 큰 핵심 단어임을 의미

In [22]:
# Centroid coordinates of the fitted model: one row per cluster, one column
# per feature (word). The output below shows shape (3, 4611), i.e. 3 clusters
# described over 4611 features.
cluster_centers = km_cluster.cluster_centers_

# Show the shape first, then the matrix itself. Each entry is the centroid's
# coordinate along one feature, so a larger value means that word weighs more
# in the cluster (it is not a distance from the cluster).
print(cluster_centers.shape, cluster_centers, sep='\n')

(3, 4611)
[[0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


## 군집별 핵심 단어 추출 함수 정의

- 매개변수 목록
  - model: KMeans 모델 객체
  - cluster_data: 리뷰 문서 데이터 및 군집 결과 값을 가지고 있는 데이터프레임 객체
  - feature_names: TF-IDF 벡터화된 각 피처(단어)의 이름 목록(tfidf_vect.get_feature_names_out()으로 얻음)
  - cluster_num: 군집 개수
  - top_n: 보고싶은 핵심단어 상위 n개

In [23]:
def get_cluster_details(model, cluster_data, feature_names, cluster_num, top_n=10):
    """Collect the top features and the member files of each cluster.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` as a 2-D
            array with one row per cluster and one column per feature.
        cluster_data: DataFrame of the clustered documents. When it carries
            both a ``filename`` and a ``cluster_label`` column they are used
            to list the files of each cluster; otherwise ``file_names`` is
            left empty.
        feature_names: Indexable collection mapping a feature index to its
            name.
        cluster_num: Number of clusters to report; clusters
            ``0 .. cluster_num - 1`` are included.
        top_n: Number of highest-valued features to keep per cluster.

    Returns:
        A dict keyed by cluster id, each value having the form
        ``{'cluster': id, 'top_features': [...], 'file_names': [...]}``.
    """
    # Feature indices of every centroid, largest centroid coordinate first.
    ordered_idx = model.cluster_centers_.argsort()[:, ::-1]

    # Listing files needs both columns; anything else (None, a frame without
    # them) simply yields empty file lists so existing callers keep working.
    columns = getattr(cluster_data, 'columns', ())
    has_file_info = 'filename' in columns and 'cluster_label' in columns

    cluster_details = {}
    # A dedicated loop name keeps the ``cluster_num`` argument from being
    # overwritten while iterating.
    for cluster_id in range(cluster_num):
        top_feature_index = ordered_idx[cluster_id, :top_n]

        if has_file_info:
            in_cluster = cluster_data['cluster_label'] == cluster_id
            file_names = cluster_data.loc[in_cluster, 'filename'].tolist()
        else:
            file_names = []

        cluster_details[cluster_id] = {
            'cluster': cluster_id,
            # Translate the selected feature indices back into feature names.
            'top_features': [feature_names[idx] for idx in top_feature_index],
            'file_names': file_names,
        }

    return cluster_details
    

In [24]:
# Names of the features (words / n-grams) known to the fitted vectorizer,
# in the same order as the feature columns.
feature_names = tfidf_vect.get_feature_names_out()

# Show how many names there are, then the names themselves.
print(feature_names.shape, feature_names, sep='\n')

(4611,)
['0 5' '0 great' '0 room' ... 'zoom' '\x96' '£6']


In [26]:
# Take the cluster count from the fitted model instead of repeating the
# literal 3, so this cell stays correct if n_clusters is changed above.
cluster_details = get_cluster_details(
    km_cluster, document_df, feature_names, km_cluster.n_clusters, top_n=10
)

# Print every cluster id followed by its top features, one blank line apart.
for cluster_num, cluster_detail in cluster_details.items():
    print(f'cluster num: {cluster_num}')
    print(f"top features: {cluster_detail['top_features']}")
    print()

cluster num: 0
top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']

cluster num: 1
top features: ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']

cluster num: 2
top features: ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']

