## 07 문서 군집화 소개와 실습

## 문서 군집화 개념
- 비슷한 텍스트 구성의 문서를 군집화하는 것
- 동일한 군집에 속하는 문서를 같은 카테고리 소속으로 분류 가능

## Opinion Review 데이터 세트를 이용한 문서 군집화 수행


In [3]:
import pandas as pd
import glob, os
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 700)

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
path = r'/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1.0/topics'
# path로 지정한 디렉터리 밑에 있는 모든 .data파일들의 파일명을 리스트로 취합
all_files = glob.glob(os.path.join(path, '*data'))
filename_list = []
opinion_text = []

# 개별 파일들의 파일명은 filename_list 리스트로 취합
# 개별 파일들의 파일 내용은 DataFrame 로딩 후 다시 string 으로 변화하여 opinion_text 리스트로 취합
for file_ in all_files:
  # 개별 파일을 읽어서 DataFrame으로 생성
  df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')
  # 절대 경로로 주어진 파일명을 가공, Linux에서 수행 시에는 아래 \\를 /로 변경
  # 맨 마지막 .data 확장자도 제거
  filename_ = file_.split('\\')[-1]
  filename_ = filename_.split('.')[0]
  # 파일명 list와 파일 내용 list에 파일명과 파일 내용을 추가
  filename_list.append(filename_)
  opinion_text.append(df.to_string())

# 파일명 리스트와 파일 내용 리스트를 DataFrame으로 생성
document_df = pd.DataFrame({'filename':filename_list, 'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi..."
1,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ..."
2,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now..."
3,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...
4,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ..."


In [9]:
# TfidfVectorizer의 fit_transform()의 인자로 document_df DataFrame의 opinion_text 칼럼을 입력해 개별 문서 텍스트에 대해
# TF-IDF 변환된 피처 벡터화된 행렬을 구하기
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

def LemNormalize(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token) for token in text.split()]

tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english', ngram_range=(1,2), min_df=0.05, max_df=0.85)

# opinion_text 칼럼값으로 feature vectorization 수행
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [11]:
# KMeans를 수행한 후에 군집의 Label 값과 중심별로 할당된 데이터 세트의 좌표 값을 구함.
from sklearn.cluster import KMeans

# 5개 집합으로 군집화 수행. 예재를 위해 동일한 클러스터링 결과 도출용 random_state=0
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [12]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",4
1,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",1
2,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",2
3,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,2
4,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The room was not overly big, but clean and very comfortable beds, a great shower and very clean bathrooms .\n0 The second room was smaller, with a very inconvenient bathroom layout, but at least it was quieter and we were able to sleep .\n1 ...",1


In [13]:
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
7,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",0
8,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",0
12,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",0
25,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,0
27,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"In fact, the entire navigation structure has been completely revised , I'm still getting used to it but it's a huge step forward .\n0 ...",0
28,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",0
50,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0


In [15]:
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
1,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",1
38,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",1
37,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Mediocre room and service for a very extravagant price .\n0 ...,1
35,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",1
34,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1
31,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Great Location , Nice Rooms , H...",1
26,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",1
24,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1
40,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Both of us having worked in tourism for over 14 years were very disappointed at the level of service provided by this gentleman .\n0 The service was good, very friendly staff and we loved the free wine reception each night .\n1 ...",1
22,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Good Value good location , ideal choice .\n0 Great Location , Nice Rooms , Helpless Concierge\n1 ...",1


In [14]:
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
2,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",2
3,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,2
6,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"After I plugged it in to my USB hub on my computer to charge the battery the charging cord design is very clever !\n0 After you have paged tru a 500, page book one, page, at, a, time to get from Chapter 2 to Chapter 15, see how excited you are about a low battery and all the time it took to get there !\n1 ...",2
15,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",2
17,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", I think the new keyboard rivals the great hp mini keyboards .\n0 Since the battery life difference is minimum, the only reason to upgrade would be to get the better keyboard .\n1 The keyboard is now as good as t...",2
23,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The Eee Super Hybrid Engine utility lets users overclock or underclock their Eee PC's to boost performance or provide better battery life depending on their immediate requirements .\n0 In Super Performance mode CPU, Z shows the bus speed to increase up to 169 .\n1 One...",2
29,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",2
32,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Keep in mind that once you get in a room full of light or step outdoors screen reflections could become annoying .\n0 I've used mine outsi...,2
33,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,It is easy to read and when touching the screen it works great !\n0 and zoom out buttons on the 255w to the same side of the screen which makes it a bit easier .\n1 ...,2
36,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"As always, the video screen is sharp and bright .\n0 2, inch screen and a glossy, polished aluminum finish that one CNET editor described as looking like a Christmas tree ornament .\n1 ...",2


In [16]:
# cluster_label=3인 데이터 세트
document_df[document_df['cluster_label']==3].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
47,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Staff are friendl...,3
49,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The staff at Swissotel were not particularly nice .\n0 Each time I waited at the counter for staff for several minutes and then was waved to the desk upon my turn with no hello or anything, or apology for waiting in line .\n1 ...",3


In [17]:
document_df[document_df['cluster_label']==4].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",4
13,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,4
14,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"3 quot widescreen display was a bonus .\n0 This made for smoother graphics on the 255w of the vehicle moving along displayed roads, where the 750's display was more of a jerky movement .\n1 ...",4
16,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 ...,4
20,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It's quiet, get good gas mileage and looks clean inside and out .\n0 The mileage is great, and I've had to get used to stopping less for gas .\n1 Thought gas ...",4
30,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",4
39,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,4
46,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"After slowing down, transmission has to be kicked to speed up .\n0 ...",4
48,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Another thing to consider was that I paid $50 less for the 750 and it came with the FM transmitter cable and a USB cord to connect it to your computer for updates and downloads .\n0 update and reroute much _more_ quickly than my other GPS .\n1 UPDATE ON THIS , It finally turned out that to see the elevation contours at lowe...",4


In [20]:
# 중심 개수를 5개에서 3개로 낮춰서 3개 그룹으로 군집화한 뒤 결과를 확인
from sklearn.cluster import KMeans

# 3개의 집합으로 군집화
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_

# 소속 클러스터를 cluster_label 칼럼으로 할당하고 cluster_label 값으로 정렬
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
7,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",0
12,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",0
8,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",0
13,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
27,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"In fact, the entire navigation structure has been completely revised , I'm still getting used to it but it's a huge step forward .\n0 ...",0
30,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",0
28,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",0
25,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,0
50,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0


In [22]:
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
0,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
7,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"I thought it would be fitting to christen my Kindle with the Stephen King novella UR, so went to the Amazon site on my computer and clicked on the button to buy it .\n0 As soon as I'd clicked the button to confirm my order it appeared on my Kindle almost immediately !\n1 ...",0
8,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Being able to change the font sizes is awesome !\n0 For whatever reason, Amazon decided to make the Font on the Home Screen ...",0
12,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It feels as easy to read as the K1 but doesn't seem any crisper to my eyes .\n0 the white is really GREY, and to avoid considerable eye, strain I had to refresh pages every other page .\n1 The dream has always been a portable electronic device that could hold a ton of reading material, automate subscriptions and fa...",0
13,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
25,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,I previously owned a Toyota 4Runner which had incredible build quality and reliability .\n0 I bought the Camry because of Toyota reliability and qua...,0
27,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"In fact, the entire navigation structure has been completely revised , I'm still getting used to it but it's a huge step forward .\n0 ...",0
28,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"If a case was included, as with the Kindle 1, that would have been reflected in a higher price .\n0 lower overall price, with nice leather cover .\n1 ...",0
30,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"It's fast to acquire satellites .\n0 If you've ever had a Brand X GPS take you on some strange route that adds 20 minutes to your trip, has you turn the wrong way down a one way road, tell you to turn AFTER you've passed the street, frequently loses the satellite signal, or has old maps missing streets, you know how important this stuff is .\n1 ...",0
50,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Windows 7 is quite simply faster, more stable, boots faster, goes to sleep faster, comes back from sleep faster, manages your files better and on top of that it's beautiful to look at and easy to use .\n0 , faster about 20% to 30% faster at running applications than my Vista , seriously\n1 ...",0


In [23]:
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
1,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Ride seems comfortable and gas mileage fairly good averaging 26 city and 30 open road .\n0 Seats are fine, in fact of all the smaller sedans this is the most comfortable I found for the price as I am 6', 2 and 250# .\n1 Great gas mileage and comfortable on long trips ...",1
43,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"not customer, oriented hotelvery low service levelboor reception\n0 The room was quiet, clean, the bed and pillows were comfortable, and the serv...",1
40,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Both of us having worked in tourism for over 14 years were very disappointed at the level of service provided by this gentleman .\n0 The service was good, very friendly staff and we loved the free wine reception each night .\n1 ...",1
38,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Front seats are very uncomfortable .\n0 No memory seats, no trip computer, can only display outside temp with trip odometer .\n1 ...",1
37,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Mediocre room and service for a very extravagant price .\n0 ...,1
35,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"We arrived at 23,30 hours and they could not recommend a restaurant so we decided to go to Tesco, with very limited choices but when you are hingry you do not careNext day they rang the bell at 8,00 hours to clean the room, not being very nice being waken up so earlyEvery day they gave u...",1
34,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"The Swissotel is one of our favorite hotels in Chicago and the corner rooms have the most fantastic views in the city .\n0 The rooms look like they were just remodled and upgraded, there was an HD TV and a nice iHome docking station to put my iPod so I could set the alarm to wake up with my music instead of the radio .\n1 ...",1
31,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Great Location , Nice Rooms , H...",1
26,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"All in all, a normal chain hotel on a nice location , I will be back if I do not find anthing closer to Picadilly for a better price .\n0 ...",1
24,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Parking was expensive but I think this is common for San Fran .\n0 there is a fee for parking but well worth it seeing no where to park if you do have a car .\n1 ...,1


In [24]:
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
2,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh Li, ion Battery , and a 1 .\n0 Not to mention that as of now...",2
45,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"I bought the 8, gig Ipod Nano that has the built, in video camera .\n0 Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\n1 ...",2
44,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,The voice prompts and maps are wonderful especially when driving after dark .\n0 I also thought the the voice prompts of the 750 where more pleasant sounding than the 255w's .\n1 ...,2
42,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,headphone jack i got a clear case for it and it i got a clear case for it and it like prvents me from being able to put the jack all the way in so the sound can b messsed up or i can get it in there and its playing well them go to move or something and it slides out .\n0 Picture and sound quality are excellent for this typ of devic .\n1 ...,2
41,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"A few other things I'd like to point out is that you must push the micro, sized right angle end of the ac adapter until it snaps in place or the battery may not charge .\n0 The full size right shift k...",2
39,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,2
36,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"As always, the video screen is sharp and bright .\n0 2, inch screen and a glossy, polished aluminum finish that one CNET editor described as looking like a Christmas tree ornament .\n1 ...",2
33,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,It is easy to read and when touching the screen it works great !\n0 and zoom out buttons on the 255w to the same side of the screen which makes it a bit easier .\n1 ...,2
32,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,Keep in mind that once you get in a room full of light or step outdoors screen reflections could become annoying .\n0 I've used mine outsi...,2
29,/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1,"Very happy with my 08 Accord, performance is quite adequate it has nice looks and is a great long, distance cruiser .\n0 6, 4, 3 eco engine has poor performance and gas mileage of 22 highway .\n1 Overall performance is good but comfort level is poor .\n2 ...",2


## 군집별 핵심 단어 추출
- KMeans 객체는 각 군집을 구성하는 단어 피처가 군집의 중심을 기준으로 얼마나 가깝게 위치하는지 cluster_centers_라는 속성으로 제공
- cluster_centers_: 배열 값으로 제공되며, 행은 개별 군집을 열은 개별 피처를 의미


In [21]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape :', cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (3, 5648)
[[0.00145083 0.00181989 0.         ... 0.         0.         0.        ]
 [0.         0.00030573 0.00069024 ... 0.00137043 0.00109049 0.00109049]
 [0.00157085 0.00074717 0.         ... 0.         0.         0.        ]]


In [25]:
# cluster_centers_ 속성은 넘파이의 ndarray로 ndarray의 argsort()[:,::-1]를 이용해 cluster_centers 배열 내 값이
# 큰 순으로 정렬된 위치 인덱스 값을 반환
# get_cluster_details() 함수의 주요 로직의 처리 진행
# 군집별 top n 핵심 단어, 그 단어의 중심 위치 상댓값, 대상 파일명을 반환
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    cluster_details = {}

    # cluster.centers array의 값이 큰 순으로 정렬된 인덱스 값을 반환
    # 군집 중심점(centroid)별 할당된 word 피처들의 거리값이 큰 순으로 값을 구하기 위함
    centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:, ::-1]

    # 개별 군집별로 반복하면서 핵심 단어, 그 단어의 중심 위치 상댓값, 대상 파일명 입력
    for cluster_num in range(clusters_num):
        # 개별 군집별 정보를 담을 데이터 초기화
        cluster_details[cluster_num] = {}
        cluster_details[cluster_num]['cluster']=cluster_num

        # cluster_centers_.argsort()[:,::-1]로 구한 인덱스를 이용해 top n 피처 단어를 구함
        top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
        top_features = [feature_names[ind] for ind in top_feature_indexes]

        # top_feature_indexes를 이용해 해당 피처 단어의 중심 위치 상댓값 구함
        top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()

        # cluster_details 딕셔너리 객체에 개별 군집별 핵심 단어와 중심 위치 상댓값, 해당 파일명 입력
        cluster_details[cluster_num]['top_features'] = top_features
        cluster_details[cluster_num]['top_features_value'] = top_feature_values
        filenames = cluster_data[cluster_data['cluster_label'] == cluster_num]['filename']
        filenames = filenames.values.tolist()

        cluster_details[cluster_num]['filenames'] = filenames

    return cluster_details


In [26]:
def print_cluster_details(cluster_details):
    for cluster_num, cluster_detail in cluster_details.items():
        print('####### Cluster {0}'.format(cluster_num))
        print('Top features:', cluster_detail['top_features'])
        print('Reviews 파일명 :',cluster_detail['filenames'][:7])

In [27]:
feature_names = tfidf_vect.get_feature_names_out()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df, feature_names=feature_names, clusters_num=3, top_n_features=10)
print_cluster_details(cluster_details)

####### Cluster 0
Top features: ['kindle', 'direction', 'font', 'button', 'page', 'satellite', 'accurate', 'book', 'eye', 'faster']
Reviews 파일명 : ['/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1']
####### Cluster 1
Top features: ['room', 'hotel', 'service', 'staff', 'interior', 'food', 'location', 'seat', 'mileage', 'comfortable']
Reviews 파일명 : ['/content/drive/MyDrive/ESAA/data/opinosis_opinion_frasl_review/OpinosisDataset1', '/content/drive/MyDrive/ESAA/data/opin

- Cluster #0: 'screen', 'battery', 'battery life' 등과 같은 화면과 배터리 수명 등이 핵심단어 군집화, 모바일형 엔터테이먼트용 전자제품의 경우 화면 크기와 배터리 수명이 주요 관심사임을 확인 가능
- Cluster #1: 'interior', 'seat', 'mileage', 'comfortable' 등과 같은 실내 인테리어, 좌석, 연료 효율 등이 핵심단어로 군집화
- Cluster #2: 'room', 'hotel', 'service', 'staff' 등과 같은 방과 서비스 등이 핵심 단어로 군집화