# 라이브러리 및 데이터 로드

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# dataset path used below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import glob, os

In [2]:
# Directory on the mounted drive that holds the review files to be clustered.
path = '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/'

In [8]:
# Collect every *.data file that sits directly under `path`.
# NOTE(review): glob.glob does not promise any ordering, so the row indices of
# the DataFrame built from this list can differ between machines -- wrap the
# call in sorted() if a stable order is needed.
all_files = glob.glob(os.path.join(path, "*.data"))
print(all_files)

['/content/drive/MyDrive/data/OpinosisDataset1.0/topics/satellite_garmin_nuvi_255W_gps.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/room_holiday_inn_london.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/rooms_swissotel_chicago.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/rooms_bestwestern_hotel_sfo.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/quality_toyota_camry_2007.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/price_holiday_inn_london.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/performance_netbook_1005ha.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/price_amazon_kindle.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/performance_honda_accord_2008.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/parking_bestwestern_hotel_sfo.txt.data', '/content/drive/MyDrive/data/OpinosisDataset1.0/topics/navigation_amazon_k

In [19]:
# Parallel accumulators: entry i of both lists describes all_files[i].
filename_list = []
opinion_text = []

for file_ in all_files:
    # Each file is read as a one-column table; latin1 is used because the
    # files are not valid UTF-8 for pandas' default decoder (presumably --
    # confirm against the dataset).
    df = pd.read_table(file_, index_col=None, header=0, encoding='latin1')

    # Reduce the full path to a bare topic name: drop the directory part with
    # os.path.basename (separator-agnostic, unlike splitting on '/'), then
    # keep everything before the first '.' to strip the ".txt.data" suffix.
    filename_ = os.path.basename(file_)
    filename = filename_.split('.')[0]

    # Topic name for this file.
    filename_list.append(filename)
    # Whole file content rendered as a single string.
    opinion_text.append(df.to_string())

# Combine the names and the texts into one DataFrame (one row per file).
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,satellite_garmin_nuvi_255W_gps,...
1,room_holiday_inn_london,...
2,rooms_swissotel_chicago,...
3,rooms_bestwestern_hotel_sfo,...
4,quality_toyota_camry_2007,...


### Lemmatization(표제어 추출) 함수 

In [20]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

In [21]:
# Translation table for str.translate(): maps the code point of every
# character in string.punctuation to None, i.e. deletes it.
remove_punct_dict = str.maketrans('', '', string.punctuation)
# Single lemmatizer instance shared by every call below.
lemmar = WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding ``lemmar.lemmatize(token)`` for each token, in order."""
    return list(map(lemmar.lemmatize, tokens))

def LemNormalize(text):
    """Tokenizer callback: lower-case, strip punctuation, tokenize, lemmatize.

    The text is lower-cased, run through ``remove_punct_dict`` with
    ``str.translate``, split by ``nltk.word_tokenize`` and the resulting
    tokens are handed to ``LemTokens``. Returns the list of lemmatized tokens.
    """
    cleaned = text.lower().translate(remove_punct_dict)
    words = nltk.word_tokenize(cleaned)
    return LemTokens(words)

### TF-IDF 피처 벡터화, TfidfVectorizer에서 피처 벡터화 수행 시 Lemmatization을 적용하여 토큰화

In [27]:
import nltk
# Download the NLTK data packages used by this notebook: 'punkt' for
# nltk.word_tokenize and 'wordnet' for WordNetLemmatizer.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab'
# for word_tokenize -- confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose tokenization is delegated to LemNormalize, with
# English stop words, unigram + bigram features, and document-frequency
# cut-offs given as proportions (min_df=0.05, max_df=0.85).
tfidf_vect = TfidfVectorizer(
    tokenizer=LemNormalize,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.85,
)

# Learn the vocabulary from the review texts and build the document-term matrix.
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

  'stop_words.' % sorted(inconsistent))


### K-Means 5개 군집화

In [30]:
from sklearn.cluster import KMeans

# Fit K-Means with 5 clusters on the TF-IDF matrix.
# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto'; without the pin, newer versions run fewer
# initialisations and may produce different labels from the ones recorded here.
km_cluster = KMeans(n_clusters=5, max_iter=10000, n_init=10, random_state=0)
km_cluster.fit(feature_vect)

# Per-document cluster assignment and the fitted cluster centers.
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

### 군집화된 그룹별 데이터 확인

In [31]:
# Store each document's cluster assignment as a new column and preview the result.
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,satellite_garmin_nuvi_255W_gps,...,3
1,room_holiday_inn_london,...,1
2,rooms_swissotel_chicago,...,1
3,rooms_bestwestern_hotel_sfo,...,1
4,quality_toyota_camry_2007,...,0


In [32]:
# Documents whose cluster label is 0, ordered by file name
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
28,comfort_honda_accord_2008,...,0
26,comfort_toyota_camry_2007,...,0
17,gas_mileage_toyota_camry_2007,...,0
16,interior_honda_accord_2008,...,0
15,interior_toyota_camry_2007,...,0
11,mileage_honda_accord_2008,...,0
8,performance_honda_accord_2008,...,0
4,quality_toyota_camry_2007,...,0
50,seats_honda_accord_2008,...,0
37,transmission_toyota_camry_2007,...,0


In [33]:
# Documents whose cluster label is 1, ordered by file name
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
31,bathroom_bestwestern_hotel_sfo,...,1
1,room_holiday_inn_london,...,1
3,rooms_bestwestern_hotel_sfo,...,1
2,rooms_swissotel_chicago,...,1


### K-Means 3개 군집화

In [35]:
from sklearn.cluster import KMeans

# Re-fit K-Means with 3 clusters on the same TF-IDF matrix.
# n_init is pinned to 10 (the historical default) because scikit-learn 1.4
# changed the default to 'auto'; without the pin, newer versions run fewer
# initialisations and may produce different labels from the ones recorded here.
km_cluster = KMeans(n_clusters=3, max_iter=10000, n_init=10, random_state=0)
km_cluster.fit(feature_vect)
# Per-document cluster assignment from the fitted model.
cluster_label = km_cluster.labels_

In [36]:
# Overwrite the label column with the latest assignment and list the
# documents grouped by label.
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
9,parking_bestwestern_hotel_sfo,...,0
20,food_holiday_inn_london,...,0
21,free_bestwestern_hotel_sfo,...,0
49,service_bestwestern_hotel_sfo,...,0
45,service_holiday_inn_london,...,0
13,location_holiday_inn_london,...,0
12,location_bestwestern_hotel_sfo,...,0
44,service_swissotel_hotel_chicago,...,0
19,food_swissotel_chicago,...,0
38,staff_swissotel_chicago,...,0


### 군집(Cluster)별 핵심 단어 추출하기

In [37]:
# Shape of the TF-IDF matrix: (number of documents, number of features).
feature_vect.shape

(51, 4611)

In [38]:
# Fitted cluster centers: one row per cluster, one column per TF-IDF feature.
cluster_centers = km_cluster.cluster_centers_
print(cluster_centers.shape)
print(cluster_centers)

(3, 4611)
[[0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.01005322 0.         0.         ... 0.00706287 0.         0.        ]]


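cluster_centers_의 각 행은 군집의 중심, 각 열은 피처(단어)를 나타낸다. TF-IDF 벡터가 정규화된 상태여서 값은 0~1 사이이며, 값이 1에 가까울수록 해당 단어가 그 군집의 중심에 가깝다(즉, 그 군집을 대표하는 핵심 단어이다).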

### 군집별 top n 핵심단어, 그 단어의 중심 위치 상대값, 대상 파일명들을 반환하는 함수 생성

In [54]:
# Summarise each cluster: its top-n features, their center values, and member file names.
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=10):
    """Build a per-cluster summary dictionary from a fitted clustering model.

    Args:
        cluster_model: Object exposing ``cluster_centers_`` as a 2-D array
            indexed as [cluster, feature].
        cluster_data: DataFrame with a ``'cluster_label'`` column and a
            ``'filename'`` column.
        feature_names: Sequence mapping a feature column index to its name.
        clusters_num: Number of clusters to report (labels 0..clusters_num-1).
        top_n_features: How many of the highest-valued features to keep.

    Returns:
        dict keyed by cluster number; each value is a dict with the keys
        ``'cluster'``, ``'top_features'``, ``'top_features_value'`` and
        ``'filenames'``.
    """
    centers = cluster_model.cluster_centers_

    # argsort sorts ascending; reversing along the feature axis gives, per
    # cluster, the column indices ordered from largest center value to smallest.
    ranked_columns = centers.argsort()[:, ::-1]

    details = {}
    for label in range(clusters_num):
        best_columns = ranked_columns[label, :top_n_features]
        in_cluster = cluster_data['cluster_label'] == label

        details[label] = {
            'cluster': label,
            # Feature names for the highest-valued columns of this center.
            'top_features': [feature_names[col] for col in best_columns],
            # The center values at those same columns, as plain Python floats.
            'top_features_value': centers[label, best_columns].tolist(),
            # File names of the documents assigned to this cluster.
            'filenames': cluster_data[in_cluster]['filename'].values.tolist(),
        }

    return details

### 클러스터별 top feature들의 단어와 파일명 출력

In [58]:
def print_cluster_details(cluster_details):
    """Print a short report for every cluster in *cluster_details*.

    For each entry the cluster number, its ``'top_features'`` list and at
    most the first seven entries of its ``'filenames'`` list are written to
    stdout, followed by blank lines as a separator.
    """
    for label in cluster_details:
        info = cluster_details[label]
        print(f'### Cluster {label}')
        print('Top features:', info['top_features'])
        print('Reviews 파일명 :', info['filenames'][:7])
        print('\n')

In [59]:
# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() and fall back to the old
# method only on releases that do not provide it yet.
if hasattr(tfidf_vect, 'get_feature_names_out'):
    # Converted to a list so feature_names has the same type as before.
    feature_names = list(tfidf_vect.get_feature_names_out())
else:
    feature_names = tfidf_vect.get_feature_names()

# Summarise the 3-cluster model (top 10 features per cluster) and print it.
cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df,
                                      feature_names=feature_names, clusters_num=3, top_n_features=10)
print_cluster_details(cluster_details)

### Cluster 0
Top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', 'bathroom', 'clean', 'price', 'parking']
Reviews 파일명 : ['room_holiday_inn_london', 'rooms_swissotel_chicago', 'rooms_bestwestern_hotel_sfo', 'price_holiday_inn_london', 'parking_bestwestern_hotel_sfo', 'location_bestwestern_hotel_sfo', 'location_holiday_inn_london']


### Cluster 1
Top features: ['interior', 'seat', 'mileage', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']
Reviews 파일명 : ['quality_toyota_camry_2007', 'performance_honda_accord_2008', 'mileage_honda_accord_2008', 'interior_toyota_camry_2007', 'interior_honda_accord_2008', 'gas_mileage_toyota_camry_2007', 'comfort_toyota_camry_2007']


### Cluster 2
Top features: ['screen', 'battery', 'keyboard', 'battery life', 'life', 'kindle', 'direction', 'video', 'size', 'voice']
Reviews 파일명 : ['satellite_garmin_nuvi_255W_gps', 'performance_netbook_1005ha', 'price_amazon_kindle', 'navigation_amazon_kindle', 'key