In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import collections
import re

In [2]:
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [90]:
f = open('archive/business/business_6.txt', 'r', encoding='UTF-8')

text = f.read()
print(text)

f.close()

US adds more jobs than expected

The US economy added 337,000 jobs in October - a seven-month high and far more than Wall Street expectations.

In a welcome economic boost for newly re-elected President George W Bush, the Labor Department figures come after a slow summer of weak jobs gains. Jobs were created in every sector of the US economy except manufacturing. While the separate unemployment rate went up to 5.5% from 5.4% in September, this was because more people were now actively seeking work.

The 337,000 new jobs added to US payrolls in October was twice the 169,000 figure that Wall Street economists had forecast. In addition, the Labor Department revised up the number of jobs created in the two previous months - to 139,000 in September instead of 96,000, and to 198,000 in August instead of 128,000. The better than expected jobs data had an immediate upward effect on stocks in New York, with the main Dow Jones index gaining 45.4 points to 10,360 by late morning trading. "It look

In [92]:
from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)
print(tokens)

['US', 'adds', 'more', 'jobs', 'than', 'expected', 'The', 'US', 'economy', 'added', '337,000', 'jobs', 'in', 'October', '-', 'a', 'seven-month', 'high', 'and', 'far', 'more', 'than', 'Wall', 'Street', 'expectations', '.', 'In', 'a', 'welcome', 'economic', 'boost', 'for', 'newly', 're-elected', 'President', 'George', 'W', 'Bush', ',', 'the', 'Labor', 'Department', 'figures', 'come', 'after', 'a', 'slow', 'summer', 'of', 'weak', 'jobs', 'gains', '.', 'Jobs', 'were', 'created', 'in', 'every', 'sector', 'of', 'the', 'US', 'economy', 'except', 'manufacturing', '.', 'While', 'the', 'separate', 'unemployment', 'rate', 'went', 'up', 'to', '5.5', '%', 'from', '5.4', '%', 'in', 'September', ',', 'this', 'was', 'because', 'more', 'people', 'were', 'now', 'actively', 'seeking', 'work', '.', 'The', '337,000', 'new', 'jobs', 'added', 'to', 'US', 'payrolls', 'in', 'October', 'was', 'twice', 'the', '169,000', 'figure', 'that', 'Wall', 'Street', 'economists', 'had', 'forecast', '.', 'In', 'addition', '

## 前処理編

### 例 : ストップワードリストの作成

### nltkのストップワードリスト

In [93]:
from nltk.corpus import stopwords

en_stop = stopwords.words('english')
print(en_stop[0:10])
print(len(en_stop))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179


### 例:【発展】記号や数字は正規表現で消してみる

In [94]:
en_stop= ["``","/",",.",".,",";","--",":",")","(",'"','&',"'",'),',',"','-','.,','.,"','.-',"?",">","<","$","%"]                  \
         +["say"] \
         +en_stop

In [95]:
print(tokens[38])
print(tokens[32])
pattern1 = '.*\d'
res = re.match(pattern1, tokens[32])
if res:
    print("マッチしました。")
else:
    print("マッチしませんでした。")

,
newly
マッチしませんでした。


### 前処理関数の作成

In [96]:
from nltk.corpus import wordnet as wn #lemmatize関数のためのimport

def preprocess_word(word, stopwordset):
    
    #1.make words lower ex: Python =>python
    word=word.lower()
    
    #2.remove "," and "."
    if word in [",","."]:
        return None
    
    #3.remove stopword  ex: the => (None) 
    if word in stopwordset:
        return None
    
    pattern1 = '.*\d'
    res = re.match(pattern1, word)
    if res:
        return None
    
    pattern2 = '.*\''
    res = re.match(pattern2, word)
    if res:
        return None
    
    #4.lemmatize  ex: cooked=>cook
    lemma = wn.morphy(word)
    if lemma is None:
        return word

    elif lemma in stopwordset: #lemmatizeしたものがstopwordである可能性がある
        return None
    else:
        return lemma

def preprocess_document(document):
    document=[preprocess_word(w, en_stop) for w in document]
    document=[w for w in document if w is not None]
    return document

def preprocess_documents(documents):
    return [preprocess_document(document) for document in documents]

In [97]:
print(preprocess_document(tokens))

['us', 'add', 'job', 'expect', 'us', 'economy', 'add', 'job', 'october', 'seven-month', 'high', 'far', 'wall', 'street', 'expectation', 'welcome', 'economic', 'boost', 'newly', 're-elected', 'president', 'george', 'w', 'bush', 'labor', 'department', 'figure', 'come', 'slow', 'summer', 'weak', 'job', 'gain', 'job', 'create', 'every', 'sector', 'us', 'economy', 'except', 'manufacturing', 'separate', 'unemployment', 'rate', 'go', 'september', 'people', 'actively', 'seeking', 'work', 'new', 'job', 'add', 'us', 'payroll', 'october', 'twice', 'figure', 'wall', 'street', 'economist', 'forecast', 'addition', 'labor', 'department', 'revise', 'number', 'job', 'create', 'two', 'previous', 'month', 'september', 'instead', 'august', 'instead', 'better', 'expect', 'job', 'data', 'immediate', 'upward', 'effect', 'stocks', 'new', 'york', 'main', 'dow', 'jones', 'index', 'gain', 'point', 'late', 'morning', 'trading', 'look', 'like', 'job', 'situation', 'improve', 'support', 'consumer', 'spending', 'goi

### 前処理の結果を出力してみる

### 前処理前

In [10]:
print(docs[0][:25]) 

['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears']


### 前処理後

In [11]:
print(preprocess_documents(docs)[0][:25])

['asian', 'exporter', 'fear', 'damage', 'japan', 'rift', 'mounting', 'trade', 'friction', 'japan', 'raise', 'fear', 'among', 'many', 'asia', 'exporting', 'nation', 'row', 'could', 'inflict', 'far', 'reaching', 'economic', 'damage', 'businessmen']


## クラスタリング編

### tf idfで上記の前処理済みの文章をベクトル化
### vectorizerを使用する（ハイパーパラメーターの設定）

In [12]:
pre_docs=preprocess_documents(docs)
pre_docs=[" ".join(doc) for doc in pre_docs]
print(pre_docs[0])

vectorizer = TfidfVectorizer(max_features=200, token_pattern=u'(?u)\\b\\w+\\b' )



### fitする

In [13]:
tf_idf = vectorizer.fit_transform(pre_docs)

### K-means
### kmeansの設定

In [14]:
num_clusters = 8
km = KMeans(n_clusters=num_clusters, random_state = 0)

### fitする

In [15]:
clusters = km.fit_predict(tf_idf)



### 出力結果

In [16]:
for doc, cls in zip(pre_docs, clusters):
    print(cls,doc)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## 応用
クラスタリング編でコードを以下に指示に従って変更する事で結果がどの様に変わるのかを確認してみましょう．<br>
    （１）講義で学んだ他の手法でベクトル化してみる(例：bag-of-words)<br>
    （２）kmeans以外の手法、又はkmeansを可視化してみる(例：階層型クラスタリング)


## ヒント

scikit-learnのvectorizerとkmeansにはたくさんのハイパーパラメータがあります。vectorizerのハイパーパラメータの中には前処理機能(例：stop_words)もあります。
    ハイパーパラメータの設定を変える事で最終的な結果は変わります。以下のURLにアクセスしてハイパーパラメータの独自で設定してみてください。<br>
    ・TF-IDFに関するパラメータ<br>
    https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html<br>
    ・Kmeansに関するパラメータ<br>
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html<br>

