In [1]:
# 구글 드라이브 마운트 명령어
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import pandas as pd
import glob, os
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 700)

path = r'/content/drive/MyDrive/hs/2. ExampleData/dm_uci_opinion_data/data/OpinosisDataset1.0/topics'

# path로 지정한 디렉터리 밑에 있는 모든 .data 파일들의 파일명을 리스트로 취합
all_files = glob.glob(os.path.join(path, "*.data"))

filename_list = []
opinion_text = []

# 개별 파일들의 파일명은 filename_list 리스트로 취합,
# 개별 파일들의 파일 내용은 DataFrame 로딩 후 다시 string으로 변환하여 opinion_text 리스트로 취합
for file_ in all_files:
    # 개별 파일을 읽어서 DataFrame으로 생성
    df = pd.read_table(file_,index_col=None, header=0,encoding='latin1')

    # 절대경로로 주어진 file 명을 가공. 만일 Linux에서 수행시에는 아래 \\를 / 변경.
    # 맨 마지막 .data 확장자도 제거
    filename_ = file_.split('/')[-1]
    filename = filename_.split('.')[0]

    # 파일명 리스트와 파일 내용 리스트에 파일명과 파일 내용을 추가.
    filename_list.append(filename)
    opinion_text.append(df.to_string())

# 파일명 리스트와 파일 내용 리스트를  DataFrame으로 생성
document_df = pd.DataFrame({'filename':filename_list, 'opinion_text':opinion_text})
document_df.head()

Unnamed: 0,filename,opinion_text
0,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...
1,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...
2,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ..."
3,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi..."
4,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...


- 문서를 TF-IDF 형태로 피처 벡터화 수행
- 토근화는 Lemmarization을 구현하여 진행

In [18]:
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
nltk.download('wordnet')
import string

# ord() : 하나의 문자를 인자로 받고 해당 문자에 해당하는 유니코드를 반환.
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)
lemmar = WordNetLemmatizer()

# 입력으로 들어온 token단어들에 대해서 lemmatization 어근 변환.
def LemTokens(tokens):
    return [lemmar.lemmatize(token) for token in tokens]

# TfidfVectorizer 객체 생성 시 tokenizer인자로 해당 함수를 설정하여 lemmatization 적용
# 입력으로 문장을 받아서 stop words 제거-> 소문자 변환 -> 단어 토큰화 -> lemmatization 어근 변환.
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect =TfidfVectorizer(tokenizer=LemNormalize, stop_words='english',ngram_range=(1,2), min_df=0.05, max_df=0.85)

feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])

In [23]:
# ndarray에서 행렬의 요소를 알고 싶으면, print()를 이용
print(feature_vect)

  (0, 1994)	0.013742871303945654
  (0, 4094)	0.015506530926591771
  (0, 1949)	0.015506530926591771
  (0, 4385)	0.008091354338174084
  (0, 2326)	0.013742871303945654
  (0, 4397)	0.00947653884856341
  (0, 2588)	0.015506530926591771
  (0, 3026)	0.013072359034509491
  (0, 1468)	0.013072359034509491
  (0, 4563)	0.013072359034509491
  (0, 4308)	0.014535918995658993
  (0, 3242)	0.015506530926591771
  (0, 3828)	0.014535918995658993
  (0, 1324)	0.015506530926591771
  (0, 3777)	0.015506530926591771
  (0, 887)	0.015506530926591771
  (0, 2447)	0.013072359034509491
  (0, 2497)	0.01249153488757759
  (0, 2450)	0.015506530926591771
  (0, 1907)	0.015506530926591771
  (0, 2594)	0.013742871303945654
  (0, 4257)	0.015506530926591771
  (0, 2562)	0.015506530926591771
  (0, 691)	0.013742871303945654
  (0, 3824)	0.015506530926591771
  :	:
  (50, 942)	0.022116164438457844
  (50, 1127)	0.018590836275207984
  (50, 2291)	0.0039028606190831087
  (50, 4068)	0.006262096951525831
  (50, 1053)	0.005845037085387159
  (

- 문서 군집화 수행

In [25]:
from sklearn.cluster import KMeans

km_cluster=KMeans(n_clusters=5, max_iter=10000, random_state=0)

km_cluster.fit(feature_vect)
cluster_label=km_cluster.labels_
cluster_centers=km_cluster.cluster_centers_

In [27]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\n0 I love this ipod except for the battery life .\n1 ...,1
1,directions_garmin_nuvi_255W_gps,You also get upscale features like spoken directions including street names and programmable POIs .\n0 I used to hesitate to go out of my directions but no...,0
2,features_windows7,"I had to uninstall anti, virus and selected other programs, some of which did not have listings in the Programs and Features Control Panel section .\n0 This review briefly touches upon some of the key features and enhancements of Microsoft's latest OS .\n1 ...",0
3,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n0 but for the most part, we find that the Garmin software provides accurate directions, whereever we intend to go .\n1 This functi...",0
4,speed_garmin_nuvi_255W_gps,Another feature on the 255w is a display of the posted speed limit on the road which you are currently on right above your current displayed speed .\n0 I found myself not even looking at my car speedometer as I could easily see my current speed and the speed limit of my route at a glance .\n1 ...,0


- 군집별 핵심 단어 추출

In [28]:
cluster_centers = km_cluster.cluster_centers_

print('cluster_centers shape :', cluster_centers.shape)
print(cluster_centers)

cluster_centers shape : (5, 4611)
[[0.01251792 0.         0.         ... 0.01368445 0.         0.        ]
 [0.00973837 0.         0.         ... 0.00397274 0.         0.        ]
 [0.         0.00099499 0.00174637 ... 0.         0.00183397 0.00144581]
 [0.         0.00092551 0.         ... 0.         0.         0.        ]
 [0.00575352 0.         0.         ... 0.         0.         0.        ]]
