# week6_hw PCA and K-means

In [1]:
import jieba
import os
import pandas as pd
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import timedelta, date
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA as PCA
#from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [2]:
import matplotlib.pyplot as plt
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

In [3]:
def is_chinese(uchar):         
    if u'\u4e00' <= uchar<=u'\u9fff':
        return True
    else:
        return False

In [4]:
# 專有名詞
jieba.load_userdict('ProperN.txt')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/bf/kxrhqhkd3s1263kmtf84x8zw0000gn/T/jieba.cache
Loading model cost 0.680 seconds.
Prefix dict has been built succesfully.


In [5]:
folders = ["中電/", "金雨/", "大同/", "中興電/", "聲寶/", "東元/"]

In [6]:
imp_words = ["經營權",  "董事會",  "請辭", "接任", "出任", "兼任", "辭去", "入主", "推選", "另聘"]

In [7]:
def get_txts(folder):
    all_files = []
    for f in os.listdir("News/" + folder):
        if not f.startswith('.'):
            all_files.append(f)
    #os.listdir("News/" + folder)   # imagine you're one directory above test dir
    return all_files

In [8]:
# 讀取新聞檔案，並使用jieba切詞
def textMining(folder, file):
    with open("./News/" + folder + file, 'rt',encoding="utf-8", errors='ignore') as txt:
        data = txt.read()
    #text segmentation
    seg_list = jieba.cut(data, cut_all=False)
    seg_list = list(seg_list)
    filter_list = []
    wordDict = {}
    jump = False
    for word in seg_list:
        for s in word:
            if not is_chinese(s):
                jump = True
                break
        if not jump:
            filter_list.append(word)
        jump = False
    return filter_list

In [9]:
def analyze(folder, corpus, txts_in_folder):
    # tfidf
    vectorizer = TfidfVectorizer(max_df = 0.9, min_df = 0.2)
    tfidf = vectorizer.fit_transform(corpus)
    words = vectorizer.get_feature_names()
    print("tfidf.shape: ", tfidf.shape)
    for i in range(len(corpus)):
        print('----{0} NEWS----'.format(txts_in_folder[i]))
        for j in range(len(words)):
            if words[j] in imp_words:
                tfidf[i,j] = tfidf[i,j] * 2
            if tfidf[i,j] > 0.2:
                print(words[j], tfidf[i,j], i ,j)
    
    X = tfidf.toarray()
    sklearn_pca = PCA(n_components = 2)
    Y_sklearn = sklearn_pca.fit_transform(X)
    
    return Y_sklearn

In [10]:
def plot_cpa_scatter(Y_sklearn, txts_in_folder):
    pca_data = [
        go.Scatter(
            x = Y_sklearn[:,0],
            y = Y_sklearn[:,1],
            mode = "markers",
            hoverinfo = 'text',
            text = txts_in_folder
        )
    ]

    fig = go.Figure(data = pca_data)
    return py.iplot(fig, filename = 'PCA Scatter Chart')

In [11]:
def plot_kmeans_scatter(Y_sklearn, txts_in_folder):
    kmeans = KMeans(n_clusters = 3)
    X_clustered = kmeans.fit_predict(Y_sklearn)
    
    #Define our own color map
    LABEL_COLOR_MAP = {0:'red', 1: 'green', 2: 'blue'}
    label_color = [LABEL_COLOR_MAP[l] for l in X_clustered]
    
    pca_data = [
        go.Scatter(
            x = Y_sklearn[:,0],
            y = Y_sklearn[:,1],
            mode = "markers",
            hoverinfo = 'text',
            text = txts_in_folder,
            marker = dict(color = label_color)
        )
    ]

    fig = go.Figure(data = pca_data)
    return py.iplot(fig, filename = 'K-means Scatter Chart')

###  分析中電

In [12]:
# get corpus from folder 中電
txts_in_folder0 = get_txts(folders[0])
corpus0 = []

for file_name in txts_in_folder0:
    filter_list = textMining(folders[0], file_name)
    join_list = " ".join(filter_list)
    corpus0.append(join_list)

Y_sklearn0 = analyze(folders[0], corpus0, txts_in_folder0)

tfidf.shape:  (16, 101)
----20170303_1611_投保中心受理中電求償登記.txt NEWS----
利益 0.4729411124893359 0 11
市場 0.7459663196865982 0 29
指出 0.32552613792342705 0 35
符合 0.20069096455421243 0 67
資訊 0.21708295112169412 0 87
----20170327_1611_中電董事會決議出售台南樹谷廠土地.txt NEWS----
中國 0.24171695243718527 1 0
億元 0.4219235315104067 1 6
出售 0.30649610143579276 1 10
利益 0.30649610143579276 1 11
董事 0.4597441521536891 1 77
通過 0.28136728421332274 1 90
電器 0.2601211719805474 1 96
----20170412_1611_中電參加台灣科技展.txt NEWS----
中國 0.23718641370856203 2 0
今日 0.20050093130891167 2 4
參加 0.2552456800801645 2 14
展出 0.27609357310103627 2 26
智慧 0.2858743140627039 2 43
科技展 0.27609357310103627 2 65
系統 0.368124764134715 2 69
路燈 0.368124764134715 2 89
電力 0.20050093130891167 2 95
電器 0.2552456800801645 2 96
----20170628_1611_現任董座周麗真續掌中電經營權.txt NEWS----
市場 0.35381137716406313 3 29
董事 0.8075364676404763 3 77
----20170628_1611_電纜股市場派遭解任.txt NEWS----
今天 0.2645589644518114 4 2
市場 0.5303136167638954 4 29
董事 0.2881866363073466 4 77
通過 0.264558964451811

In [13]:
#plot_cpa_scatter(Y_sklearn0, txts_in_folder0)

In [14]:
plot_kmeans_scatter(Y_sklearn0, txts_in_folder0)

### 分析金雨

In [15]:
# get corpus from folder 金雨
txts_in_folder1 = get_txts(folders[1])
corpus1 = []

for file_name in txts_in_folder1:
    filter_list = textMining(folders[1], file_name)
    join_list = " ".join(filter_list)
    corpus1.append(join_list)

Y_sklearn1 = analyze(folders[1], corpus1, txts_in_folder1)

tfidf.shape:  (8, 187)
----20180223_4503_金雨世代交替.txt NEWS----
卓政 0.22492339831047106 0 36
接任 0.23782893162536808 0 85
業務 0.23782893162536808 0 111
自動 0.2787344127737447 0 158
董事長 0.22168077759790092 0 163
販賣機 0.238915210948924 0 176
----20180412_4503_借殼上市入主金雨.txt NEWS----
上市 0.26208385574944104 1 0
上櫃 0.2429745311717055 1 2
借殼 0.26208385574944104 1 20
兄弟 0.5241677114988821 1 24
入主 0.485949062343411 1 25
持股 0.2096670845995528 1 84
----20180412_4503_炒作金雨企業股票遭訴.txt NEWS----
上櫃 0.4502245863998873 2 2
交易 0.22200393006575206 2 11
兄弟 0.22200393006575206 2 24
卓燦然 0.3445950316321042 2 38
興亞 0.27750491258219007 2 160
----20180413_4503_金雨前董座涉炒股.txt NEWS----
兄弟 0.3046186344917602 3 24
入主 0.5295149453942358 3 25
卓燦然 0.413726051888636 3 38
董事 0.23641488679350625 3 162
起訴 0.23641488679350625 3 177
----20180530_4503_受理金雨股價操縱案.txt NEWS----
中心 0.29183570161946526 4 7
交易 0.2218997055946689 4 11
交易法 0.2218997055946689 4 12
台北 0.25715041079041717 4 49
投保 0.3214380134880215 4 80
投資人 0.243196418016221 4 82
求償


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.



In [16]:
plot_kmeans_scatter(Y_sklearn1, txts_in_folder1)

### 分析大同

In [17]:
# get corpus from folder 大同
txts_in_folder2 = get_txts(folders[2])
corpus2 = []

for file_name in txts_in_folder2:
    filter_list = textMining(folders[2], file_name)
    join_list = " ".join(filter_list)
    corpus2.append(join_list)

Y_sklearn2 = analyze(folders[2], corpus2, txts_in_folder2)

tfidf.shape:  (11, 47)
----237120180222.txt NEWS----
----237120180313.txt NEWS----
大同 0.46160774142622 1 10
掩埋 0.7581897554226065 1 15
未來 0.27570536560822057 1 19
發電 0.2067790242061654 1 23
----237120180324.txt NEWS----
中資 0.2275792297082095 2 0
公司 0.3490993735956248 2 3
大同 0.7620623123483616 2 10
管理 0.20338093553226538 2 25
經營 0.2275792297082095 2 26
----237120180325.txt NEWS----
完成 0.5773502691896258 3 11
發電 0.5773502691896258 3 23
綠能 0.5773502691896258 3 27
----237120180326.txt NEWS----
公司 0.3239750590763594 4 3
台北市 0.5280014801363824 4 7
完成 0.4400012334469854 4 11
本案 0.2640007400681912 4 20
發電 0.3520009867575883 4 23
開發 0.2640007400681912 4 42
----237120180328.txt NEWS----
大同 0.6049887900246818 5 10
市場 0.26910132349191745 5 12
股東 0.2509325327268058 5 28
蔚山 0.6022380785443339 5 30
----237120180330.txt NEWS----
大同 0.3827502011865852 6 10
指出 0.3073964247418583 6 14
股東 0.5715149177780524 6 28
董事 0.3073964247418583 6 29
----237120180621.txt NEWS----
大同 0.2822782610798094 7 10
提出 0.47084

In [18]:
plot_kmeans_scatter(Y_sklearn2, txts_in_folder2)

### 分析中興電 

In [19]:
# get corpus from folder 中興電
txts_in_folder3 = get_txts(folders[3])
corpus3 = []

for file_name in txts_in_folder3:
    filter_list = textMining(folders[3], file_name)
    join_list = " ".join(filter_list)
    corpus3.append(join_list)

Y_sklearn3 = analyze(folders[3], corpus3, txts_in_folder3)

tfidf.shape:  (7, 149)
----201803291513風電遴選題材發酵.txt NEWS----
上緯 0.29110420070934484 0 0
合作 0.27941102030537357 0 41
建置 0.26150852282160103 0 72
簽署 0.24258683392445404 0 106
遴選 0.24258683392445404 0 130
風電 0.3922627842324015 0 148
----201803311513中興電 配息率達86.21％.txt NEWS----
中興電 0.40273626274914875 1 4
今年 0.23193540281694808 1 11
億元 0.5291785562472412 1 16
去年 0.2713434697135612 1 34
工商 0.46387080563389615 1 67
時報 0.46387080563389615 1 86
----201804051513風電開發商 集結台灣打前哨戰.txt NEWS----
台灣 0.3452965903156897 2 39
日本 0.20717795418941382 2 84
發展 0.20717795418941382 2 101
開發 0.3556733341498701 2 134
離岸 0.20717795418941382 2 137
離岸風電 0.40404189863107576 2 138
----201804091513離岸風電題材續航 華城、大亞、中興電股價齊創波段新高.txt NEWS----
受惠 0.21864004060741668 3 36
大亞 0.24338416804679144 3 60
建置 0.21864004060741668 3 72
成為 0.21864004060741668 3 76
經濟部 0.2803293881204874 3 110
設備 0.21864004060741668 3 124
開發 0.2502339354453587 3 134
離岸風電 0.28426363371021274 3 138
----201806261513外商啟動離岸風電在地佈局 吹響與台廠簽約號角.txt NEWS----
供應商 0.2

In [20]:
plot_kmeans_scatter(Y_sklearn3, txts_in_folder3)

### 分析聲寶

In [21]:
# get corpus from folder 聲寶
txts_in_folder4 = get_txts(folders[4])
corpus4 = []

for file_name in txts_in_folder4:
    filter_list = textMining(folders[4], file_name)
    print(filter_list)
    join_list = " ".join(filter_list)
    corpus4.append(join_list)

Y_sklearn4 = analyze(folders[4], corpus4, txts_in_folder4)

['聲寶', '集團', '昨日', '舉行', '台南', '廠', '新建', '工程', '動土', '第一期', '將斥資', '億元', '作為', '節能', '健康', '及', '環保', '的', '物聯網', '智能', '生活', '家電', '以及', '人工智慧', '相關', '產品', '生產據點', '該廠', '預計', '年', '完工', '挹注', '未來', '聲寶業績', '動能', '第二期', '規劃則', '增設', '研發', '大樓', '及', '物流', '專用', '倉庫', '等', '設施', '一', '二期', '合計', '投資', '超過', '億元', '聲寶總裁', '陳', '盛', '沺', '昨日', '主持', '台南', '廠', '新建', '工程', '動土', '儀式', '他', '說', '投資', '越南', '不如', '投資', '台南', '投資', '台灣', '也', '才能', '創造', '台灣', '更', '大', '的', '利基', '他', '強調', '包括', '他', '自己', '與', '公司', '總', '經理', '副', '總', '經理', '都', '是', '成大', '畢業', '深知', '台南', '擁有', '許多', '人才', '為', '因應', '未來', '公司', '成長', '需求', '台南', '廠將', '以', '少量多', '樣化', '的', '高效率', '生產', '結', '合成', '大等', '當地', '學校', '高端', '人才', '聲寶', '表示', '台南', '廠有', '二大', '功能', '一為', '促進', '南台', '灣經濟', '發展', '創造', '當地', '就業', '機會', '二是強', '化生', '產供', '應鏈', '及', '深耕', '在', '地', '客戶', '及', '擴大市', '占', '聲寶', '去年', '選定', '台南', '科技', '工業', '園區', '作為', '新建', '廠', '房基', '地', '占地', '坪', '為', '集團', '布局', '南台', '灣的', '重要',

In [22]:
plot_kmeans_scatter(Y_sklearn4, txts_in_folder4)

### 分析東元 

In [23]:
# get corpus from folder 東元
txts_in_folder5 = get_txts(folders[5])
corpus5 = []

for file_name in txts_in_folder5:
    filter_list = textMining(folders[5], file_name)
    join_list = " ".join(filter_list)
    corpus5.append(join_list)

Y_sklearn5 = analyze(folders[5], corpus5, txts_in_folder5)

tfidf.shape:  (37, 105)
----20180110.txt NEWS----
去年 0.4643988080864487 0 12
效率 0.32431652605851097 0 49
營收 0.5952960193562552 0 60
發電 0.2976480096781276 0 65
----20180118.txt NEWS----
太陽能 0.7184257532650448 1 25
建置 0.41052900186573993 1 33
綠能 0.20526450093286996 1 75
----20180125.txt NEWS----
工業 0.3056613828689828 2 29
市場 0.22152048069189506 2 30
希望 0.3302976252035096 2 32
推出 0.412872031504387 2 43
提供 0.22104925182059398 2 45
產業 0.2750398406350506 2 63
製造 0.22104925182059398 2 81
透過 0.26991859903720866 2 88
需求 0.2292460371517371 2 99
領域 0.2292460371517371 2 102
----20180220.txt NEWS----
主要 0.5082365763095416 3 1
布局 0.5287795155334365 3 31
相關 0.4718423925076823 3 68
領域 0.48933890412065734 3 102
----20180227.txt NEWS----
企業 0.5832937117812887 4 4
東元電機 0.3544001880564301 4 57
相關 0.3328276279401698 4 68
記者 0.40640882660575645 4 84
----20180301.txt NEWS----
公司 0.22731650991884458 5 7
可以 0.2109390081285087 5 14
太陽能 0.5978217487644428 5 25
投資 0.22880452325651912 5 40
政府 0.20309569969052427 5

In [24]:
plot_kmeans_scatter(Y_sklearn5, txts_in_folder5)