In [None]:
# Mount Google Drive inside the Colab runtime; the TSV files read later in
# this notebook are loaded from paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install -U ckiptagger[tf,gdown]

In [1]:
import pandas as pd
import numpy as np
from ckiptagger import WS, POS, NER
from ckiptagger import construct_dictionary
from tqdm.notebook import tqdm

In [None]:
from ckiptagger import data_utils, WS
import os

# The CKIP model archive is about 1.9 GB, so only fetch it when the model
# directory that WS('./data/') / POS('./data/') read from is not there yet.
# Delete ./data manually to force a fresh download (e.g. after an interrupted
# or partial extraction).
if not os.path.isdir("./data"):
    data_utils.download_data_gdown("./")

Downloading...
From: https://drive.google.com/uc?id=1efHsY16pxK0lBD2gYCgCTnv1Swstq771
To: /content/data.zip
1.88GB [00:18, 100MB/s]


In [2]:
# Load the tab-separated train / test splits from the mounted Drive folder.
# The training frame displayed below has the columns `index`, `class`, `title`
# (plus an unnamed leading column).
df_train = pd.read_csv('/content/drive/MyDrive/NLP/news_clustering_train.tsv', sep='\t')
df_test = pd.read_csv('/content/drive/MyDrive/NLP/news_clustering_test.tsv', sep='\t')

In [3]:
# Preview the training frame (the rendered output elides the middle rows).
df_train

Unnamed: 0,index,class,title
0,0,體育,亞洲杯奪冠賠率：日本、伊朗領銜 中國竟與泰國並列
1,1,體育,9輪4球本土射手僅次武磊 黃紫昌要搶最強U23頭銜
2,2,體育,如果今年勇士奪冠，下賽季詹姆斯何去何從？
3,3,體育,超級替補！科斯塔本賽季替補出場貢獻7次助攻
4,4,體育,騎士6天里發生了啥？從首輪搶七到次輪3-0猛龍
...,...,...,...
1795,1795,遊戲,LOL：麻辣香鍋韓服Rank不合成打野刀？電刀巨魔新套路連勝中
1796,1796,遊戲,穩住，我們能贏！因為我們擁有這種強大的力量
1797,1797,遊戲,騰訊是怎樣毀掉《鬥戰神》這款可能成神的作品的？
1798,1798,遊戲,LOL你不知道的黑科技打法！


In [4]:
# Lookup tables keyed by the `index` column: index -> title and index -> class.
# They are built column-wise with zip() rather than four separate iterrows()
# passes, which materialise a Series per row and are far slower; the resulting
# dicts (keys, values and insertion order) are the same.
train_titles = dict(zip(df_train['index'], df_train['title']))
train_classes = dict(zip(df_train['index'], df_train['class']))

test_titles = dict(zip(df_test['index'], df_test['title']))
test_classes = dict(zip(df_test['index'], df_test['class']))

In [5]:
# The news categories; used further down as the keys of the per-class
# vector lists and of the prediction buckets.
all_news_class = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

# 斷詞 + POS

In [6]:
# Instantiate the CKIP word-segmentation (WS) and part-of-speech (POS) models
# from the model directory ./data/.
ws = WS('./data/')
pos = POS('./data/')



In [10]:
# Segment and POS-tag every training title.
# Result: train_title_cuts[index] -> list of (word, pos_tag) tuples.
train_title_cuts = {}
for index, title in tqdm(train_titles.items()):
    # sentence_segmentation=True makes the segmenter take delimiters into account.
    segmented = ws([title], sentence_segmentation=True)
    tagged = pos(segmented)

    # A single title is passed per call, so element 0 holds its words / tags.
    train_title_cuts[index] = list(zip(segmented[0], tagged[0]))

HBox(children=(FloatProgress(value=0.0, max=1800.0), HTML(value='')))




In [11]:
# Segment and POS-tag every test title, exactly as done for the training set.
# Result: test_title_cuts[index] -> list of (word, pos_tag) tuples.
test_title_cuts = {}
for index, title in tqdm(test_titles.items()):
    # sentence_segmentation=True makes the segmenter take delimiters into account.
    segmented = ws([title], sentence_segmentation=True)
    tagged = pos(segmented)

    # A single title is passed per call, so element 0 holds its words / tags.
    test_title_cuts[index] = list(zip(segmented[0], tagged[0]))

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




In [12]:
# Inspect the (word, POS tag) pairs produced for training title 120.
train_title_cuts[120]

[('國腳', 'Na'),
 ('張呈棟', 'Nb'),
 ('：', 'COLONCATEGORY'),
 ('從', 'D'),
 ('沒', 'D'),
 ('想', 'VE'),
 ('過', 'Di'),
 ('自己', 'Nh'),
 ('會', 'D'),
 ('出', 'VC'),
 ('一', 'Neu'),
 ('本', 'Nf'),
 ('書', 'Na')]

# Bag of Words (BOW)

In [13]:
# Build the vocabulary: word -> id and id -> word, with ids assigned in order
# of first appearance while walking the training titles by ascending key.
word2index = {}
index2word = {}
idx = 0
# Iterate over the stored items instead of range(len(...)): the keys come from
# the data's `index` column, so assuming they are exactly 0..n-1 would raise
# KeyError on any gap. For contiguous keys the visiting order is unchanged.
for _key, pairs in sorted(train_title_cuts.items()):
    for word, _pos_tag in pairs:
        if word not in word2index:
            word2index[word] = idx
            index2word[idx] = word
            idx += 1

In [14]:
# Look up the vocabulary id assigned to one word.
word2index['溫暖']

1512

In [15]:
# Reverse lookup of the id shown above; it maps back to the same word.
index2word[1512]

'溫暖'

In [16]:
def get_bow_vector(pairs, word2index):
    """Return the bag-of-words count vector for one tokenised title.

    Args:
        pairs: iterable of 2-tuples whose first element is the word; the
            second element (the POS tag) is ignored here.
        word2index: mapping from word to its position in the vector.

    Returns:
        A float numpy array of length ``len(word2index)`` where position
        ``word2index[w]`` holds the number of occurrences of ``w`` in
        ``pairs``. Words missing from ``word2index`` are skipped.
    """
    counts = np.zeros(shape=(len(word2index)))
    known_ids = [word2index[token] for token, _ in pairs if token in word2index]
    for token_id in known_ids:
        counts[token_id] += 1
    return counts

In [17]:
# Example: bag-of-words vector for training title 120.
get_bow_vector(train_title_cuts[120], word2index)

array([0., 0., 1., ..., 0., 0., 0.])

# 排除較無意義的詞性

In [18]:
# Group the distinct training-set words by POS tag so each tag can be judged
# for usefulness: pos_analysis[tag] -> set of words seen with that tag.
pos_analysis = {}
for pairs in train_title_cuts.values():
    for word, flag in pairs:
        pos_analysis.setdefault(flag, set()).add(word)

# Print at most 100 sample words per tag.
for flag in pos_analysis:
    print(flag, ':', list(pos_analysis[flag])[:100])
    print('=======================')

Nb : ['傑森', '唐培科', '老詹', '克羅托內 ', '本澤馬', '吉尼斯', '圍甲', '卡頓', '堯治河', '京新', '何雯娜', '巴爾韋德', '劉集', '拳皇', '切爾西', '詹姆斯', '昂科威', '史森明', '紫鑫', '德安東尼', '華為', '峽谷里', '安卓', '蓋倫', '馬競', '宇通', '高曉松', '台積電', '章澤天', '董明珠', '埃爾克森', '阿木', '姆巴佩', '劉歡', '沙特', '楊柳夏', '多浪', '老馬', 'S11', '瓜迪奧拉', '博時', '小明', '伊亞', '韋德', '英超', '樊振東', '瓦基弗', '高傭', '亨德森', '內馬爾', '霍金斯', '阿里雲', '子女農轉非', '西蒙斯', '郭春林', '蘇群', '劉偉', '2.0', '何享健', '騰訊', '卓爾', '喬丹', '劉德華', '波爾', '安徒恩比盧克', '德比', '張良咸', '詹皇', '伏爾加格勒', '鄭眼', '哈登', '馮天睿', '保羅', '金秀瑤族', '朱婷', '大神', '李暉', '布茲德里克', '沃爾瑪', '林員', '皮爾洛', '拜仁續', '大聶', '胤禛', '張大仙', '阿里舜宇', '武磊', '羅傑斯', '郭煒煒', '艷武漢', '小詹皇', '梅西納', '密子君', '桑德羅', '浦發', '穆里尼奧', '陶華碧', '萊昂納德', '梁建章', '雷軍']
Na : ['命運', '瓜', '新機', '越南語', '同行', '樹葉', '鐵漢', '迷宮', '名錄', '助教', '增幅', '排位', '感情', '接班人', '黑桃樹', '安全性', '高端', '搏擊', '皇后', '風光', '美', '戰火箭', '西紅柿', '大錢', '鵜鶘', '展品', '消費者', '逆市', '門', '標題黨', '地位', '剛性', '平台', '行情', '規劃', '總量', '真因', '人才', '水漂', '飲料', '淘寶客', '跆拳道', '因素', '錢包', '損失', '千姿百色', 'Bu', '樹莓', '青年', '莊園', '兒童'

|         Type        |     Description    |
|:-------------------:|:------------------:|
| A                   | 非謂形容詞         |
| Caa                 | 對等連接詞         |
| Cab                 | 連接詞，如：等等   |
| Cba                 | 連接詞，如：的話   |
| Cbb                 | 關聯連接詞         |
| D                   | 副詞               |
| Da                  | 數量副詞           |
| Dfa                 | 動詞前程度副詞     |
| Dfb                 | 動詞後程度副詞     |
| Di                  | 時態標記           |
| Dk                  | 句副詞             |
| DM                  | 定量式             |
| I                   | 感嘆詞             |
| Na                  | 普通名詞           |
| Nb                  | 專有名詞           |
| Nc                  | 地方詞             |
| Ncd                 | 位置詞             |
| Nd                  | 時間詞             |
| Nep                 | 指代定詞           |
| Neqa                | 數量定詞           |
| Neqb                | 後置數量定詞       |
| Nes                 | 特指定詞           |
| Neu                 | 數詞定詞           |
| Nf                  | 量詞               |
| Ng                  | 後置詞             |
| Nh                  | 代名詞             |
| Nv                  | 名物化動詞         |
| P                   | 介詞               |
| T                   | 語助詞             |
| VA                  | 動作不及物動詞     |
| VAC                 | 動作使動動詞       |
| VB                  | 動作類及物動詞     |
| VC                  | 動作及物動詞       |
| VCL                 | 動作接地方賓語動詞 |
| VD                  | 雙賓動詞           |
| VF                  | 動作謂賓動詞       |
| VE                  | 動作句賓動詞       |
| VG                  | 分類動詞           |
| VH                  | 狀態不及物動詞     |
| VHC                 | 狀態使動動詞       |
| VI                  | 狀態類及物動詞     |
| VJ                  | 狀態及物動詞       |
| VK                  | 狀態句賓動詞       |
| VL                  | 狀態謂賓動詞       |
| V_2                 | 有                 |
|                     |                    |
| DE                  | 的之得地           |
| SHI                 | 是                 |
| FW                  | 外文               |
|                     |                    |
| COLONCATEGORY       | 冒號               |
| COMMACATEGORY       | 逗號               |
| DASHCATEGORY        | 破折號             |
| DOTCATEGORY         | 點號               |
| ETCCATEGORY         | 刪節號             |
| EXCLAMATIONCATEGORY | 驚嘆號             |
| PARENTHESISCATEGORY | 括號               |
| PAUSECATEGORY       | 頓號               |
| PERIODCATEGORY      | 句號               |
| QUESTIONCATEGORY    | 問號               |
| SEMICOLONCATEGORY   | 分號               |
| SPCHANGECATEGORY    | 雙直線             |
| WHITESPACE          | 空白               |

In [19]:
# POS tags that are counted by default. Everything else (adverbs,
# conjunctions, determiners, pronouns, prepositions, particles, punctuation
# categories, ...) is ignored. Stored as a frozenset so the membership test is
# O(1) and the collection is built once rather than on every call.
DEFAULT_INCLUDED_FLAGS = frozenset([
    'VAC', 'Nv', 'VF', 'VI', 'A', 'Ncd', 'VK',
    'VHC', 'VE', 'FW', 'VCL', 'VC', 'VA', 'Nd',
    'VH', 'VD', 'VJ', 'Nf', 'VG', 'VB', 'Nc', 'Na', 'Nb',
])


def get_bow_vector_with_selection(pairs, word2index, included_flags=None):
    """Return a bag-of-words count vector that only counts selected POS tags.

    Args:
        pairs: iterable of ``(word, pos_tag)`` tuples for one title.
        word2index: mapping from word to its position in the vector.
        included_flags: optional collection of POS tags to keep. When omitted
            (or ``None``) ``DEFAULT_INCLUDED_FLAGS`` is used, which reproduces
            the original hard-coded selection.

    Returns:
        A float numpy array of length ``len(word2index)``. A word contributes
        to its position only when it is in ``word2index`` and its tag is one
        of the included flags.
    """
    if included_flags is None:
        included_flags = DEFAULT_INCLUDED_FLAGS
    vector = np.zeros(len(word2index))
    for word, flag in pairs:
        if flag in included_flags and word in word2index:
            vector[word2index[word]] += 1
    return vector

# Cosine Similarity

In [20]:
def cosine_similarity(bow1, bow2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        bow1: first vector (array-like of numbers).
        bow2: second vector, same length as ``bow1``.

    Returns:
        ``dot(bow1, bow2) / (|bow1| * |bow2|)``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned in that case
        instead of the NaN (and RuntimeWarning) that a 0/0 division produces.
    """
    denominator = np.linalg.norm(bow1) * np.linalg.norm(bow2)
    if denominator == 0:
        # An all-zero vector has no direction; treat it as sharing nothing.
        return 0.0

    similarity = np.dot(bow1, bow2) / denominator

    return similarity

In [21]:
# Sanity check: similarity between the unfiltered BOW vectors of training
# titles 100 and 130 (their tokens are shown in the next two cells).
bow1 = get_bow_vector(train_title_cuts[100], word2index)
bow2 = get_bow_vector(train_title_cuts[130], word2index)
cosine_similarity(bow1, bow2)

0.08703882797784893

In [22]:
# Tokens and POS tags of training title 100 (first vector compared above).
train_title_cuts[100]

[('山東', 'Nc'),
 ('魯能', 'Nb'),
 ('有沒有', 'D'),
 ('可能', 'D'),
 ('拿到', 'VC'),
 ('今年', 'Nd'),
 ('的', 'DE'),
 ('中', 'A'),
 ('超', 'A'),
 ('冠軍', 'Na'),
 ('？', 'QUESTIONCATEGORY')]

In [23]:
# Tokens and POS tags of training title 130 (second vector compared above).
train_title_cuts[130]

[('NBA', 'Nb'),
 ('和', 'Caa'),
 ('CBA', 'FW'),
 ('差距', 'Na'),
 ('在', 'P'),
 ('哪裡', 'Ncd'),
 ('？', 'QUESTIONCATEGORY'),
 ('6', 'Neu'),
 ('張', 'Nf'),
 ('圖', 'VF'),
 ('一目瞭然', 'VH'),
 ('！', 'EXCLAMATIONCATEGORY')]

# Group mean vector

In [24]:
#Use get_bow_vector_with_selection()
group_vectors = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(train_title_cuts.items()):
    vector = get_bow_vector_with_selection(pairs, word2index)
    news_class = train_classes[index]
    group_vectors[news_class].append(vector)

group_mean_vector = {}
for news_class, vectors in group_vectors.items():
    group_mean_vector[news_class] = np.mean(vectors, axis=0)
group_mean_vector

{'旅遊': array([0., 0., 0., ..., 0., 0., 0.]),
 '科技': array([0., 0., 0., ..., 0., 0., 0.]),
 '財經': array([0., 0., 0., ..., 0., 0., 0.]),
 '農業': array([0., 0., 0., ..., 0., 0., 0.]),
 '遊戲': array([0.        , 0.        , 0.        , ..., 0.00333333, 0.00333333,
        0.00333333]),
 '體育': array([0.04      , 0.00333333, 0.        , ..., 0.        , 0.        ,
        0.        ])}

In [27]:
#Use get_bow_vector()
group_vectors = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(train_title_cuts.items()):
    vector = get_bow_vector(pairs, word2index)
    news_class = train_classes[index]
    group_vectors[news_class].append(vector)

group_mean_vector = {}
for news_class, vectors in group_vectors.items():
    group_mean_vector[news_class] = np.mean(vectors, axis=0)
group_mean_vector

{'旅遊': array([0.        , 0.        , 0.08666667, ..., 0.        , 0.        ,
        0.        ]),
 '科技': array([0.        , 0.        , 0.07666667, ..., 0.        , 0.        ,
        0.        ]),
 '財經': array([0.        , 0.        , 0.12333333, ..., 0.        , 0.        ,
        0.        ]),
 '農業': array([0.        , 0.        , 0.09333333, ..., 0.        , 0.        ,
        0.        ]),
 '遊戲': array([0.        , 0.        , 0.27      , ..., 0.00333333, 0.00333333,
        0.00333333]),
 '體育': array([0.04      , 0.00333333, 0.15666667, ..., 0.        , 0.        ,
        0.        ])}

# Group mean vector: 測試

In [25]:
#Use get_bow_vector_with_selection()
classification = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(test_title_cuts.items()):
    vector = get_bow_vector_with_selection(pairs, word2index)
    if np.sum(np.square(vector)) == 0:
        continue

    max_val = -2.0
    max_class = None
    for news_class, ref_vector in group_mean_vector.items():
        val = cosine_similarity(ref_vector, vector)
        if val > max_val:
            max_class = news_class
            max_val = val

    classification[max_class].append(index)

In [28]:
#Use get_bow_vector()
classification = {news_class: [] for news_class in all_news_class}
for index, pairs in sorted(test_title_cuts.items()):
    vector = get_bow_vector(pairs, word2index)
    if np.sum(np.square(vector)) == 0:
        continue

    max_val = -2.0
    max_class = None
    for news_class, ref_vector in group_mean_vector.items():
        val = cosine_similarity(ref_vector, vector)
        if val > max_val:
            max_class = news_class
            max_val = val

    classification[max_class].append(index)

In [26]:
from collections import Counter

# For every predicted class, tally the true classes of the titles assigned to it.
# NOTE(review): this reads whatever `classification` currently holds. In file
# order the get_bow_vector() classification cell runs before this one, so the
# label printed here only matches if the selection variant was the last one run.
print('Use get_bow_vector_with_selection()')
for group, ids in classification.items():
    true_labels = Counter(test_classes[title_id] for title_id in ids)
    print('predict', group, ': ', true_labels)

Use get_bow_vector_with_selection()
predict 體育 :  Counter({'體育': 85, '遊戲': 8, '農業': 5, '科技': 4, '旅遊': 4, '財經': 2})
predict 財經 :  Counter({'財經': 74, '科技': 18, '農業': 8, '旅遊': 5, '體育': 3, '遊戲': 2})
predict 科技 :  Counter({'科技': 65, '財經': 14, '旅遊': 5, '農業': 5, '體育': 4, '遊戲': 4})
predict 旅遊 :  Counter({'旅遊': 74, '農業': 5, '財經': 2, '科技': 1, '遊戲': 1})
predict 農業 :  Counter({'農業': 72, '旅遊': 6, '財經': 5, '科技': 5, '遊戲': 4, '體育': 1})
predict 遊戲 :  Counter({'遊戲': 81, '科技': 6, '體育': 5, '旅遊': 5, '財經': 2, '農業': 2})


In [29]:
from collections import Counter

# For every predicted class, tally the true classes of the titles assigned to
# it, using the current contents of `classification`.
print('Use get_bow_vector()')
for group, ids in classification.items():
    true_labels = Counter(test_classes[title_id] for title_id in ids)
    print('predict', group, ': ', true_labels)

Use get_bow_vector()
predict 體育 :  Counter({'體育': 53, '財經': 16, '農業': 13, '旅遊': 11, '科技': 10, '遊戲': 8})
predict 財經 :  Counter({'財經': 27, '科技': 7, '體育': 6, '農業': 6, '旅遊': 4, '遊戲': 2})
predict 科技 :  Counter({'科技': 61, '旅遊': 29, '財經': 26, '農業': 23, '遊戲': 22, '體育': 19})
predict 旅遊 :  Counter({'旅遊': 41, '農業': 12, '財經': 11, '遊戲': 9, '體育': 4, '科技': 3})
predict 農業 :  Counter({'農業': 42, '科技': 13, '遊戲': 10, '體育': 9, '財經': 9, '旅遊': 9})
predict 遊戲 :  Counter({'遊戲': 49, '財經': 11, '體育': 8, '旅遊': 6, '科技': 5, '農業': 4})
