# 第9章: ベクトル空間法 (I)

In [1]:
%system curl -O 'http://www.cl.ecei.tohoku.ac.jp/nlp100/data/enwiki-20150112-400-r100-10576.txt.bz2'

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 ' 40 21.0M   40 8605k    0     0  9555k      0  0:00:02 --:--:--  0:00:02 9551k',
 ' 92 21.0M   92 19.3M    0     0  10.1M      0  0:00:02  0:00:01  0:00:01 10.1M',
 '100 21.0M  100 21.0M    0     0  10.1M      0  0:00:02  0:00:02 --:--:-- 10.1M']

## 80. コーパスの整形

In [5]:
import bz2

sentences = []
for line in bz2.BZ2File('enwiki-20150112-400-r100-10576.txt.bz2'):
    sentences.append(' '.join([token.strip('.,!?;:()[]\'"').strip() for token in line.decode('utf8')[:-1].split(' ') if token.strip('.,!?;:()[]\'"').strip()]))

In [7]:
with open('corpus80.txt', 'w') as fd:
    for sentence in sentences:
        if sentence:
            fd.write(sentence + '\n')

## 81. 複合語からなる国名への対処

In [8]:
import pandas as pd


url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'
dataframe = pd.io.html.read_html(url)

In [21]:
country_names = {}
for i in dataframe[0][4:-2][0]:
    if i.startswith('ZZZ'):
        continue
    if '\xa0– ' in i:
        (country_name, long_country_name) = i.split('\xa0– ')
        country_names[country_name.split()[0]] = country_name
        country_names[long_country_name.split()[0]] = long_country_name
    else:
        country_names[i.split()[0]] = i
country_names['United'] = 'United States'

In [24]:
sentences = []

with open('corpus80.txt') as fd:
    for tokens in fd:
        new_tokens = []
        for (i, token) in enumerate(tokens.split()):
            if token in country_names:
                cn = country_names[token]
                if cn == tokens[i:i+len(cn)]:
                    cn = cn.replace(' ', '_')
                    new_tokens.append(cn)
            else:
                new_tokens.append(token)
        sentences.append(new_tokens)

## 82. 文脈の抽出

In [25]:
import random

with open('corpus82.txt', 'w') as fd:
    for sentence in sentences:
        for j in range(len(sentence)):
            t = sentence[j]
            d = random.randint(1, 5)
            for k in range(max(j - d, 0), min(j + d + 1, len(sentence))):
                if j != k:
                    fd.write('%s\t%s\n' % (t, sentence[k]))

## 83. 単語／文脈の頻度の計測

In [21]:
from collections import Counter

work_tc = []
work_t = []
work_c = []
counter_tc = Counter()
counter_t = Counter()
counter_c = Counter()

with open('corpus82.txt') as fd:
    for (i, line) in enumerate(fd, start=1):

        line = line.strip()
        tokens = line.split('\t')

        work_tc.append(line)
        work_t.append(tokens[0])
        work_c.append(tokens[1])

        if i % 1000000 == 0:
            counter_tc.update(work_tc)
            counter_t.update(work_t)
            counter_c.update(work_c)
            work_tc = []
            work_t = []
            work_c = []

    counter_tc.update(work_tc)
    counter_t.update(work_t)
    counter_c.update(work_c)

del work_tc, work_t, work_c

In [22]:
N = sum(counter_tc.values())
N

67519664

## 84. 単語文脈行列の作成

In [24]:
import math
import pickle
from collections import OrderedDict
from scipy import sparse

dict_index_t = OrderedDict((key, i) for i, key in enumerate(counter_t.keys()))
dict_index_c = OrderedDict((key, i) for i, key in enumerate(counter_c.keys()))

size_t = len(dict_index_t)
size_c = len(dict_index_c)
matrix_x = sparse.lil_matrix((size_t, size_c))

def calc_ppmi(N, f_tc, t, c):
    if f_tc < 10:
        return 0
    return max(math.log((N * f_tc) / (counter_t[t] * counter_c[c])), 0)

for (k, f_tc) in counter_tc.items():
    if f_tc >= 10:
        (t, c) = k.split('\t')
        ppmi = calc_ppmi(N, f_tc, t, c)
        if ppmi > 0:
            matrix_x[dict_index_t[t], dict_index_c[c]] = ppmi


with open('dict_index_t.pkl', 'wb') as data_file:
    pickle.dump(dict_index_t, data_file)
del counter_tc, counter_t, counter_c

In [6]:
matrix_x

<383413x383413 sparse matrix of type '<class 'numpy.float64'>'
	with 444172 stored elements in LInked List format>

## 85. 主成分分析による次元圧縮

In [8]:
import sklearn.decomposition
from scipy import io

clf = sklearn.decomposition.TruncatedSVD(300)
matrix_x300 = clf.fit_transform(matrix_x)
io.savemat('matrix_x300', {'matrix_x300': matrix_x300})
matrix_x300

array([[  1.38399554e-13,   2.80623862e-13,  -6.93487765e-13, ...,
          1.48299369e-12,  -4.60395097e-12,   2.09304182e-12],
       [  1.07918219e+01,  -9.79396744e+00,   1.39427786e+01, ...,
         -2.06871316e-01,   1.52703964e-01,  -7.67869430e-02],
       [  1.33383104e+01,  -1.10493041e+01,   2.16714974e+01, ...,
          4.74495582e-02,   7.17963788e-02,  -4.22875343e-02],
       ..., 
       [  0.00000000e+00,  -0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00],
       [  0.00000000e+00,  -0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00],
       [  0.00000000e+00,  -0.00000000e+00,   0.00000000e+00, ...,
          0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00]])

## 86. 単語ベクトルの表示

In [9]:
print(matrix_x300[dict_index_t['United_States']])

[ 0.17591139 -0.04139807 -0.02054507  0.47802428  0.28762109 -0.14161546
 -0.25939585  0.12568658  0.04667047 -0.01762385  0.00440574  0.23515148
  0.06984147  0.36296715  0.1945541   0.38576479  0.04448114 -0.50779538
 -0.00533639  0.07326178  0.0331656   0.12731005  0.03327558  0.0320777
 -0.00250625  0.04143549 -0.04168525  0.27192375 -0.28981559  0.65676299
  0.08531146  0.06546287  0.04940578 -0.02037115  0.05607366 -0.0011302
  0.52342221  0.22832209  0.52550436  0.04312807  0.17397772 -0.33513058
  0.20112464 -0.03150447  0.19837785  0.4667724   0.03302099 -0.1628859
  0.0663168  -0.13928939  0.01789095 -0.17317036 -0.08631318 -0.01488529
  0.06342893  0.12190213 -0.01892328  0.00207561 -0.05805681  0.05262019
 -0.05886119 -0.14166798  0.11959298  0.39347617  0.40175756  0.22269925
  0.35954049 -0.10378227 -0.20258596  0.12939298 -0.14465286  0.2667883
 -0.08647199 -0.27886252 -0.13528828  0.19597011 -0.02525106  0.10937388
 -0.11690274  0.09950905 -0.61639285 -0.31738436  0.431

## 87. 単語の類似度

In [19]:
import numpy as np

def cos_sim(a, b):
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm != 0:
        return np.dot(a, b) / norm
    else:
        return -1

cos_sim(matrix_x300[dict_index_t['United_States']], matrix_x300[dict_index_t['U.S']])

0.30363386324630748

## 88. 類似度の高い単語10件

In [33]:
import numpy as np

def cos_sim(a, b):
    norm = np.linalg.norm(a) * np.linalg.norm(b)
    if norm != 0:
        return np.dot(a, b) / norm
    else:
        return -1

vec_england = matrix_x300[dict_index_t['England']]
distances = [cos_sim(vec_england, matrix_x300[i]) for i in range(0, len(dict_index_t))]

index_sorted = np.argsort(distances)
keys = list(dict_index_t.keys())
for index in index_sorted[-2:-12:-1]:
    print('%s\t%s' % (keys[index], distances[index]))

Scotland	0.659968292937
Wales	0.600579664717
Ireland	0.498700869599
Patriots	0.494463110596
Africa	0.484252012028
Cheshire	0.470073555068
Zealand	0.46144041119
America	0.432902417873
Europe	0.428471506011
coast	0.418348492945


## 89. 加法構成性によるアナロジー

In [34]:
vec = matrix_x300[dict_index_t['Spain']] - matrix_x300[dict_index_t['Madrid']] + matrix_x300[dict_index_t['Athens']]
distances = [cos_sim(vec, matrix_x300[i]) for i in range(len(dict_index_t))]

index_sorted = np.argsort(distances)
keys = list(dict_index_t.keys())
for index in index_sorted[:-11:-1]:
    print('%s\t%s' % (keys[index], distances[index]))

Freiburg	0.57023386464
Tezpur	0.570186819117
Leiden	0.570126297876
McGill	0.569098322224
Nehru	0.56906406981
EMUNI	0.568647371521
Loyola	0.566776913062
Peking	0.565936583607
Yeshiva	0.565890244762
Emory	0.56575074229
