# week6_hw PCA and K-means

In [1]:
import jieba
import os
import pandas as pd
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import timedelta, date
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
import matplotlib.pyplot as plt
import plotly
plotly.tools.set_credentials_file(username='hsiehkl', api_key='FXXF2GQeCytUbJX9YoB2')
import plotly.plotly as py
import plotly.graph_objs as go

In [3]:
def is_chinese(uchar):         
    if u'\u4e00' <= uchar<=u'\u9fff':
        return True
    else:
        return False

In [4]:
# 專有名詞
jieba.load_userdict('ProperN.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/bf/kxrhqhkd3s1263kmtf84x8zw0000gn/T/jieba.cache
Loading model cost 0.677 seconds.
Prefix dict has been built succesfully.


In [5]:
folders = ["大同20170103/", "大同20180102/", "大同20180709/"]

In [6]:
def get_txts(folder):
    all_files = os.listdir("News/" + folder)   # imagine you're one directory above test dir
    return all_files

In [7]:
# 讀取新聞檔案，並使用jieba切詞
def textMining(folder, file):
    with open("./News/" + folder + file, 'rt',encoding="utf-8") as txt:
        data = txt.read()
    #text segmentation
    seg_list = jieba.cut(data, cut_all=False)
    seg_list = list(seg_list)
    filter_list = []
    wordDict = {}
    jump = False
    for word in seg_list:
        for s in word:
            if not is_chinese(s):
                jump = True
                break
        if not jump:
            filter_list.append(word)
        jump = False
    return filter_list

In [8]:
def analyze(folder, corpus, txts_in_folder):
    # tfidf
    vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 0.2)
    tfidf = vectorizer.fit_transform(corpus)
    words = vectorizer.get_feature_names()
    print("tfidf.shape: ", tfidf.shape)
    for i in range(len(corpus)):
        print('----{0} NEWS----'.format(txts_in_folder[i]))
        for j in range(len(words)):
            if tfidf[i,j] > 0.2:
                print(words[j], tfidf[i,j], i ,j)
    
    X = tfidf.toarray()
    len(X)
    X_std = StandardScaler().fit_transform(X)
    sklearn_pca = sklearnPCA(n_components = 2)
    Y_sklearn = sklearn_pca.fit_transform(X_std)
    Y_sklearn[:3]
    
    return Y_sklearn

In [9]:
def plot_cpa_scatter(Y_sklearn, txts_in_folder):
    pca_data = [
        go.Scatter(
            x = Y_sklearn[:,0],
            y = Y_sklearn[:,1],
            mode = "markers",
            hoverinfo = 'text',
            text = txts_in_folder
        )
    ]

    fig = go.Figure(data = pca_data)
    return py.iplot(fig, filename = 'PCA Scatter Chart')

In [15]:
def plot_kmeans_scatter(Y_sklearn, txts_in_folder):
    kmeans = KMeans(n_clusters = 3)
    X_clustered = kmeans.fit_predict(Y_sklearn)
    
    #Define our own color map
    LABEL_COLOR_MAP = {0:'red', 1: 'green', 2: 'blue'}
    label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]
    
    pca_data = [
        go.Scatter(
            x = Y_sklearn[:,0],
            y = Y_sklearn[:,1],
            mode = "markers",
            hoverinfo = 'text',
            text = txts_in_folder,
            marker = dict(color = label_color)
        )
    ]

    fig = go.Figure(data = pca_data)
    return py.iplot(fig, filename = 'K-means Scatter Chart')

###  分析大同20180102資料夾

In [11]:
# get corpus from folder 大同20180102
txts_in_folder1 = get_txts(folders[1])
corpus1 = []

for file_name in txts_in_folder1:
    filter_list = textMining(folders[1], file_name)
    join_list = " ".join(filter_list)
    corpus1.append(join_list)

Y_sklearn1 = analyze(folders[1], corpus1, txts_in_folder1)

tfidf.shape:  (20, 185)
----大同今年展望_180226_經濟日報.txt NEWS----
今年 0.23048478393688057 0 12
去年 0.3250049774609603 0 26
太陽能 0.5712232323965375 0 48
資金 0.4284174242974031 0 161
郭文艷 0.29837078787496224 0 178
----大同換董座_180203_經濟日報.txt NEWS----
人士 0.2298169535210584 1 11
今年 0.20046889587003497 1 12
出現 0.24841659552937362 1 24
市場 0.28675871713329565 1 58
每股 0.3726248932940604 1 95
營運 0.24841659552937362 1 101
經營權 0.21409118027436747 1 127
股價 0.266557160732481 1 132
----大同現增價14元_180110_工商時報.txt NEWS----
價格 0.446099750441074 2 16
媒體 0.26999743383160724 2 50
林宏信 0.33457481283080553 2 87
每股 0.33457481283080553 2 95
發行 0.3095242652601372 2 112
虧損 0.24028694089586644 2 145
----大同現增價格市場派質疑_180104_udn新聞網.txt NEWS----
價格 0.40303754064179687 3 16
公司 0.4775666375198697 3 20
增資 0.20151877032089843 3 44
每股 0.20151877032089843 3 95
為何 0.20151877032089843 3 100
發行 0.27964574854852614 3 112
股東 0.31578187403186836 3 134
----大同現增股東不滿_180107_鉅亨網.txt NEWS----
公司 0.22445079051831848 4 20
去年 0.2566064208412195 4 26
投

In [12]:
plot_cpa_scatter(Y_sklearn1, txts_in_folder1)

In [16]:
plot_kmeans_scatter(Y_sklearn1, txts_in_folder1)

## 分析大同20180709資料夾

In [14]:
# get corpus from folder 大同20180102
txts_in_folder2 = get_txts(folders[2])
corpus2 = []

for file_name in txts_in_folder2:
    filter_list = textMining(folders[2], file_name)
    join_list = " ".join(filter_list)
    corpus2.append(join_list)

Y_sklearn2 = analyze(folders[2], corpus1, txts_in_folder2)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa6 in position 4: invalid start byte

In [None]:
plot_cpa_scatter(Y_sklearn2, txts_in_folder2)

In [None]:
plot_kmeans_scatter(Y_sklearn2, txts_in_folder2)