In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os

# Current directory
print(os.getcwd())

# change directory
os.chdir('/content/drive/MyDrive/python_training/NLP100Days/project_2_news_classf_3_PPMI_SVD')
print(os.getcwd())

/content
/content/drive/MyDrive/python_training/NLP100Days/project_2_news_classf_3_PPMI_SVD


In [3]:
!python --version

Python 3.7.10


In [4]:
!pip install -q -U pip
!pip install -q numpy
!pip install -q pandas
!pip install -q ckiptagger
!pip install -q tqdm
!pip install -q tensorflow==1.14.0
!pip install -q ipywidgets
!pip install -q matplotlib

[K     |████████████████████████████████| 1.5MB 7.7MB/s 
[K     |████████████████████████████████| 109.3 MB 37 kB/s 
[K     |████████████████████████████████| 50 kB 5.9 MB/s 
[K     |████████████████████████████████| 3.1 MB 60.2 MB/s 
[K     |████████████████████████████████| 488 kB 50.5 MB/s 
[?25h

In [5]:
import pandas as pd
import numpy as np

from ckiptagger import WS, POS
from tqdm.notebook import tqdm

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
!ls

news_clustering_test.tsv   project_2_分類類_PPMI_SVD_hw.ipynb
news_clustering_train.tsv  分類器：PPMI＋SVD_hw_ans.ipynb


In [7]:
df_train = pd.read_csv('news_clustering_train.tsv', sep='\t')
df_test = pd.read_csv('news_clustering_test.tsv', sep='\t')

In [8]:
train_titles = {row['index']: row['title'] for _, row in df_train.iterrows()}
train_classes = {row['index']: row['class'] for _, row in df_train.iterrows()}

test_titles = {row['index']: row['title'] for _, row in df_test.iterrows()}
test_classes = {row['index']: row['class'] for _, row in df_test.iterrows()}

In [9]:
all_news_class = ['體育', '財經', '科技', '旅遊', '農業', '遊戲']

# 斷詞 + POS

In [10]:
ws = WS('../ckip_data/')
pos = POS('../ckip_data/')



In [11]:
train_title_cuts = {}
for index, title in tqdm(train_titles.items()):
    word_s = ws([title])
    word_p = pos(word_s)
    train_title_cuts[index] = list(zip(word_s[0], word_p[0]))

HBox(children=(FloatProgress(value=0.0, max=1800.0), HTML(value='')))




In [12]:
test_title_cuts = {}
for index, title in tqdm(test_titles.items()):
    word_s = ws([title])
    word_p = pos(word_s)
    test_title_cuts[index] = list(zip(word_s[0], word_p[0]))

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




# 尋找降維的詞向量：PPMI + SVD

In [13]:
word2index = {}
index2word = {}
n = 0
for index in train_title_cuts: ## train_title_cuts: index,word,flag
    for word, flag in train_title_cuts[index]:
        if word in word2index:
            continue
        word2index[word] = n 
        index2word[n] = word
        n += 1

In [14]:
len(word2index)

6690

6690

train_title_cuts:
{0: [('亞洲杯', 'Nb'),
  ('奪', 'VC'),
  ('冠賠率', 'Na'),
  ('：', 'COLONCATEGORY'),
  ('日本', 'Nc'),
  ('、', 'PAUSECATEGORY'),
  ('伊朗', 'Nc'),
  ('領銜', 'VB'),
  (' ', 'WHITESPACE'),
  ('中國', 'Nc'),
  ('竟', 'D'),
  ('與', 'P'),
  ('泰國', 'Nc'),
  ('並列', 'VG')],

 1: [('9', 'Neu'),

如果使用one-hot就需要這麼大的維度的詞向量

In [15]:
# 建立Co-Matrix

vocab_size = len(word2index)
co_matrix = np.zeros(shape=(vocab_size, vocab_size), dtype=np.int32)

window_size = 1
# YOUR CODE HERE

for pairs in train_title_cuts.values():
    words, _ = list(zip(*pairs)) ## word,flag
    for idx, word in enumerate(words):
        left_idx = idx - window_size if idx - window_size >= 0 else 0
        left_words = words[left_idx:idx]

        word_id = word2index[word]
        for left_word in left_words:
            left_id = word2index[left_word]
            co_matrix[word_id, left_id] += 1
            co_matrix[left_id, word_id] += 1

# END YOUR CODE

In [16]:
co_matrix

array([[0, 2, 0, ..., 0, 0, 0],
       [2, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [17]:
# 建立PPMI

def get_ppmi(co_matrix: np.ndarray, eps: float=1e-8):
    # YOUR CODE HERE
    M = np.zeros_like(co_matrix, dtype=np.float32)
    N = np.sum(co_matrix)
    S = np.sum(co_matrix, axis=0)
    total = co_matrix.shape[0]*co_matrix.shape[1]
    cnt = 0
    for i in tqdm(range(co_matrix.shape[0])): #tqdm 進度提示資訊
      for j in range(co_matrix.shape[1]):
        pmi = np.log2(co_matrix[i, j]*N / (S[i]*S[j] + eps))
        M[i, j] = max(0, pmi)

    # END YOUR CODE
    return M

ppmi = get_ppmi(co_matrix)

HBox(children=(FloatProgress(value=0.0, max=6690.0), HTML(value='')))

  if sys.path[0] == '':





In [18]:
ppmi

array([[ 0.      ,  9.267157,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 9.267157,  0.      , 11.515084, ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      , 11.515084,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       ...,
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ],
       [ 0.      ,  0.      ,  0.      , ...,  0.      ,  0.      ,
         0.      ]], dtype=float32)

In [19]:
# 進行SVD分解，並得到降維的詞向量

from sklearn.decomposition import TruncatedSVD

# 使用`TruncatedSVD`進行降維，降維到dim=1000
# YOUR CODE HERE

svd = TruncatedSVD(n_components=1000, n_iter=10)
word_vectors = svd.fit_transform(ppmi)

# END YOUR CODE

In [None]:
word_vectors.shape

(6690, 1000)

# 新的詞向量 + Group mean vector: 測試

In [20]:
excluded_flags = [
    'Nh', 'Nep', 'Nes', 'DE', 'T', 'P', 'V_2', 'SHI',
    'Dfa', 'Dfb', 'Da', 'Di', 'Dk',
    'Caa', 'Cab', 'Cba', 'Cbb',
    'COLONCATEGORY', 'COMMACATEGORY', 'DASHCATEGORY', 'DOTCATEGORY', 'ETCCATEGORY', 'EXCLAMATIONCATEGORY',
    'PARENTHESISCATEGORY', 'PAUSECATEGORY', 'PERIODCATEGORY', 'QUESTIONCATEGORY', 'SEMICOLONCATEGORY',
    'SPCHANGECATEGORY', 'WHITESPACE'
]

In [21]:
train_svd_vectors = {}
for index, pairs in train_title_cuts.items():
    selected_word_vectors = []
    for word, flag in pairs:
        if word in word2index and flag not in excluded_flags:
            selected_word_vectors.append(word_vectors[word2index[word], :])
    vector = np.sum(selected_word_vectors, axis=0)
    if np.sum(np.square(vector)) == 0:
        continue
    train_svd_vectors[index] = vector
    

train_svd_vectors:

{
 0: array([1.72706242e+01, -4.66476154e+00,  4.12554741e+00,  4.98114204e+00,-2.98449516e+00,  2.09208131e-02,  1.26916957e+00,  2.65375018e+00, ....-6.83719516e-01,  3.83606732e-01, -1.95732772e-01, -6.21506833e-02],
       dtype=float32),
 
 1: array([ 2.46403713e+01, -6.52269959e-01,  3.54581356e+00,  4.57131672e+00,


In [22]:
test_svd_vectors = {}
for index, pairs in test_title_cuts.items():
    selected_word_vectors = []
    for word, flag in pairs:
        if word in word2index and flag not in excluded_flags:
            selected_word_vectors.append(word_vectors[word2index[word], :])
    vector = np.sum(selected_word_vectors, axis=0)
    if np.sum(np.square(vector)) == 0:
        continue
    test_svd_vectors[index] = vector

In [23]:
group_vectors = {news_class: [] for news_class in all_news_class}
for index, vector in sorted(train_svd_vectors.items()):
    news_class = train_classes[index]
    group_vectors[news_class].append(vector)

group_mean_vector = {}
for news_class, vectors in group_vectors.items():
    group_mean_vector[news_class] = np.mean(vectors, axis=0)

In [24]:
def cosine_similarity(bow1, bow2):
    len_bow1 = np.sqrt(np.sum(np.square(bow1)))
    len_bow2 = np.sqrt(np.sum(np.square(bow2)))
    return np.sum(bow1 * bow2) / (len_bow1 * len_bow2)

In [25]:
classification = {news_class: [] for news_class in all_news_class}
for index, vector in sorted(test_svd_vectors.items()):
    if np.sum(np.square(vector)) == 0:
        continue

    max_val = -2.0
    max_class = None
    for news_class, ref_vector in group_mean_vector.items():
        val = cosine_similarity(ref_vector, vector)
        if val > max_val:
            max_class = news_class
            max_val = val

    classification[max_class].append(index)

In [26]:
from collections import Counter

for group, ids in classification.items():
    counter = Counter([test_classes[id] for id in ids])
    print('predict', group, ': ', counter)

predict 體育 :  Counter({'體育': 62, '遊戲': 10, '旅遊': 9, '財經': 8, '科技': 5, '農業': 4})
predict 財經 :  Counter({'財經': 62, '科技': 25, '農業': 15, '體育': 8, '遊戲': 8, '旅遊': 7})
predict 科技 :  Counter({'科技': 51, '體育': 15, '財經': 14, '農業': 9, '遊戲': 9, '旅遊': 8})
predict 旅遊 :  Counter({'旅遊': 58, '農業': 12, '科技': 5, '財經': 4, '體育': 2, '遊戲': 2})
predict 農業 :  Counter({'農業': 58, '旅遊': 7, '體育': 4, '財經': 4, '遊戲': 4, '科技': 2})
predict 遊戲 :  Counter({'遊戲': 67, '科技': 11, '旅遊': 9, '財經': 8, '體育': 7, '農業': 1})
