In [1]:
import pandas as pd

import glob, os

In [19]:
path = './topics'

In [20]:
all_files = glob.glob(os.path.join(path, '*.data'))

In [37]:
file_names = []
opinions = []

In [38]:
for file in all_files :
    
    df = pd.read_table(file, index_col = None, header = 0, encoding = 'latin1')
    
    file_name = file.split('\\')[-1].split('.')[1].split('/')[-1]
    
    file_names.append(file_name)
    opinions.append(df.to_string())

In [66]:
doc_data = pd.DataFrame({'filename' : file_names, 'opnion_text' : opinions})

In [41]:
doc_data.head()

Unnamed: 0,filename,opnion_text
0,battery-life_ipod_nano_8gb,...
1,gas_mileage_toyota_camry_2007,...
2,room_holiday_inn_london,...
3,location_holiday_inn_london,...
4,staff_bestwestern_hotel_sfo,...


In [43]:
doc_data.shape

(51, 2)

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
tfidf = TfidfVectorizer(lowercase = True, ngram_range = (1, 2), min_df = 0.05, max_df = 0.9, stop_words = 'english')

In [54]:
features = tfidf.fit_transform(doc_data['opnion_text'])

In [55]:
from sklearn.cluster import KMeans

In [56]:
km = KMeans(n_clusters = 5, max_iter = 15000, random_state = 42)

In [57]:
km.fit(features)

KMeans(max_iter=15000, n_clusters=5, random_state=42)

In [58]:
cls = km.labels_

In [59]:
cls_center = km.cluster_centers_

In [67]:
doc_data['cluster'] = cls

In [68]:
doc_data.head()

Unnamed: 0,filename,opnion_text,cluster
0,battery-life_ipod_nano_8gb,...,1
1,gas_mileage_toyota_camry_2007,...,2
2,room_holiday_inn_london,...,0
3,location_holiday_inn_london,...,4
4,staff_bestwestern_hotel_sfo,...,4


In [70]:
doc_data[doc_data.cluster == 0]

Unnamed: 0,filename,opnion_text,cluster
2,room_holiday_inn_london,...,0
30,rooms_swissotel_chicago,...,0
31,bathroom_bestwestern_hotel_sfo,...,0
46,rooms_bestwestern_hotel_sfo,...,0


In [71]:
doc_data[doc_data.cluster == 1]

Unnamed: 0,filename,opnion_text,cluster
0,battery-life_ipod_nano_8gb,...,1
9,battery-life_amazon_kindle,...,1
11,battery-life_netbook_1005ha,...,1
15,performance_netbook_1005ha,...,1
24,sound_ipod_nano_8gb,headphone jack i got a clear case for it a...,1


In [76]:
km = KMeans(n_clusters = 3, max_iter = 15000, random_state = 42)

In [77]:
km.fit(features)

KMeans(max_iter=15000, n_clusters=3, random_state=42)

In [78]:
cls = km.labels_

In [79]:
cls_center = km.cluster_centers_

In [80]:
doc_data['cluster'] = cls

In [81]:
doc_data.head()

Unnamed: 0,filename,opnion_text,cluster
0,battery-life_ipod_nano_8gb,...,0
1,gas_mileage_toyota_camry_2007,...,1
2,room_holiday_inn_london,...,2
3,location_holiday_inn_london,...,2
4,staff_bestwestern_hotel_sfo,...,2


In [83]:
doc_data[doc_data.cluster == 0].sample(10)

Unnamed: 0,filename,opnion_text,cluster
37,screen_netbook_1005ha,...,0
38,navigation_amazon_kindle,...,0
10,satellite_garmin_nuvi_255W_gps,...,0
40,speed_windows7,...,0
19,updates_garmin_nuvi_255W_gps,...,0
36,eyesight-issues_amazon_kindle,...,0
7,size_asus_netbook_1005ha,...,0
6,speed_garmin_nuvi_255W_gps,...,0
8,screen_garmin_nuvi_255W_gps,...,0
41,price_amazon_kindle,...,0


In [84]:
doc_data[doc_data.cluster == 1].sample(10)

Unnamed: 0,filename,opnion_text,cluster
18,comfort_honda_accord_2008,...,1
45,interior_honda_accord_2008,...,1
35,mileage_honda_accord_2008,...,1
22,interior_toyota_camry_2007,...,1
42,quality_toyota_camry_2007,...,1
43,comfort_toyota_camry_2007,...,1
1,gas_mileage_toyota_camry_2007,...,1
23,transmission_toyota_camry_2007,...,1
47,performance_honda_accord_2008,...,1
29,seats_honda_accord_2008,...,1


In [85]:
doc_data[doc_data.cluster == 2].sample(10)

Unnamed: 0,filename,opnion_text,cluster
17,food_holiday_inn_london,...,2
3,location_holiday_inn_london,...,2
50,parking_bestwestern_hotel_sfo,...,2
32,food_swissotel_chicago,...,2
31,bathroom_bestwestern_hotel_sfo,...,2
13,service_swissotel_hotel_chicago,...,2
30,rooms_swissotel_chicago,...,2
27,service_holiday_inn_london,...,2
2,room_holiday_inn_london,...,2
28,price_holiday_inn_london,...,2


In [87]:
km.cluster_centers_.shape

(3, 4403)

In [102]:
word_result = pd.DataFrame(km.cluster_centers_, columns = tfidf.get_feature_names())

In [103]:
word_result

Unnamed: 0,00,000,000 miles,05,06,07,08,10 did,10 great,10 inch,...,year,year old,years,years ago,years old,yellow,yes,yes rooms,yields,zoom
0,0.001304,0.0,0.0,0.0,0.0,0.000551,0.0,0.0,0.0,0.003177,...,0.005708,0.003491,0.00524,0.00244,0.001773,0.0,0.002975,0.0,0.000411,0.007338
1,0.000931,0.007159,0.004859,0.003369,0.003248,0.004268,0.007,0.0,0.0,0.0,...,0.004121,0.002509,0.005442,0.0,0.0,0.002379,0.001886,0.0,0.002923,0.0
2,0.004466,0.0,0.0,0.0,0.000195,0.0,0.0,0.000857,0.001729,0.0,...,0.001365,0.001024,0.002556,0.000864,0.0007,0.000951,0.003261,0.001657,0.0,0.0


In [124]:
for n in range(3) :
    top10_idx = km.cluster_centers_[n].argsort()[::-1][:10]
    top10_features = [tfidf.get_feature_names()[ind] for ind in top10_idx]
    
    filenames = doc_data[doc_data.cluster == n].filename.tolist()
    
    print(f'### Cluster{n} ###\n')
    print(f'Top 10 Words = {top10_features}\n')
    print(f'Review File : {filenames[:5]}\n\n')

### Cluster0 ###

Top 10 Words = ['screen', 'battery', 'battery life', 'keyboard', 'kindle', 'life', 'directions', 'size', 'voice', 'speed']

Review File : ['battery-life_ipod_nano_8gb', 'voice_garmin_nuvi_255W_gps', 'speed_garmin_nuvi_255W_gps', 'size_asus_netbook_1005ha', 'screen_garmin_nuvi_255W_gps']


### Cluster1 ###

Top 10 Words = ['interior', 'mileage', 'seats', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']

Review File : ['gas_mileage_toyota_camry_2007', 'comfort_honda_accord_2008', 'interior_toyota_camry_2007', 'transmission_toyota_camry_2007', 'seats_honda_accord_2008']


### Cluster2 ###

Top 10 Words = ['hotel', 'service', 'rooms', 'staff', 'room', 'food', 'location', 'clean', 'bathroom', 'parking']

Review File : ['room_holiday_inn_london', 'location_holiday_inn_london', 'staff_bestwestern_hotel_sfo', 'service_swissotel_hotel_chicago', 'service_bestwestern_hotel_sfo']


