In [1]:
import nltk
import numpy as np
from nltk.corpus import stopwords
import pandas as pd
import math
from scipy.sparse.linalg import svds
import math
from IPython.display import clear_output
import scipy
from sklearn.metrics import pairwise_distances
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hasgrig\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the read finishes, even if the read raises.
# NOTE(review): the file is decoded with the platform's default encoding —
# confirm that this matches how wiki-text.txt was written.
with open("wiki-text.txt", "r") as file:
    Text = file.read()

# Prefixes of the corpus at several sizes (character counts, not word counts).
small_text = Text[0:500001]
smaller_text = Text[0:10000]
medium_text = Text[0:2000000]
tiny_text = Text[0:99]


In [3]:
def get_text_vocab(text, min_freq):
    """Split ``text`` on single spaces and build a frequency-filtered vocabulary.

    Parameters
    ----------
    text : str
        Raw corpus. Tokens are separated by the space character only; other
        whitespace (newlines, tabs) is not treated as a separator.
    min_freq : int
        A word is kept only if it occurs strictly more than ``min_freq`` times.

    Returns
    -------
    vocab : dict keys view
        The words that pass the frequency threshold and are not NLTK English
        stop words.
    words : list of str
        Every non-empty token of ``text`` in order. This list is NOT filtered
        by frequency or stop words.
    """
    # One filtering pass drops the empty tokens produced by consecutive
    # spaces. Calling list.remove('') in a loop rescans and shifts the list on
    # every removal, which is quadratic on a large corpus.
    words = [token for token in text.split(' ') if token != '']

    word_count = nltk.FreqDist(words)
    stop_words = set(stopwords.words('english'))
    # Keep words above the frequency threshold that are not stop words.
    word_count = {
        k: v
        for (k, v) in word_count.items()
        if v > min_freq and k not in stop_words
    }
    return word_count.keys(), words


In [4]:
def get_word_pair_matrix(wordlist, vocab, window=5):
    """Count co-occurrences of vocabulary words within a symmetric window.

    Parameters
    ----------
    wordlist : sequence of str
        The corpus tokens in order.
    vocab : iterable of str
        Words to count. Row/column ``k`` of the result corresponds to the
        k-th word produced by iterating ``vocab``.
    window : int, optional
        Maximum distance (in tokens) between two words for them to count as
        co-occurring. Defaults to 5.

    Returns
    -------
    numpy.ndarray
        Square float matrix where entry ``[a, b]`` is the number of times word
        ``b`` appears within ``window`` positions (before or after) of an
        occurrence of word ``a``. The matrix is symmetric by construction.

    Notes
    -----
    Progress is reported every 10000 tokens by clearing the cell output and
    printing ``len(wordlist)`` and the current position.
    """
    keys = list(vocab)
    # A dict gives O(1) membership and index lookup in a single step,
    # whatever container type ``vocab`` is.
    dicti = {word: idx for idx, word in enumerate(keys)}
    matrix = np.zeros((len(keys), len(keys)))
    total = len(wordlist)

    for i in range(total):
        if i % 10000 == 0:
            clear_output()
            print(total, i)

        row = dicti.get(wordlist[i])
        if row is None:
            # Out-of-vocabulary centre word: nothing to count at this position.
            continue

        # Only look forward. Each (i, i+j) pair is recorded in both
        # directions, which equals scanning forward and backward from every
        # position separately.
        for j in range(1, window + 1):
            if i + j >= total:
                break
            col = dicti.get(wordlist[i + j])
            if col is not None:
                matrix[row, col] += 1
                matrix[col, row] += 1

    return matrix

In [5]:
def get_M(pair_matrix):
    """Turn a co-occurrence count matrix into a smoothed log-ratio matrix.

    For ``j >= i`` the entry is::

        M[i, j] = log(((N_ij + 1) * N) / (N_i * N_j))

    where ``N_ij = pair_matrix[i, j]``, ``N`` is the sum of all counts,
    ``N_i`` is the sum of row ``i`` and ``N_j`` is the sum of column ``j``.
    Entries below the diagonal are copied from the upper triangle, so the
    result is symmetric even if ``pair_matrix`` is not.

    Parameters
    ----------
    pair_matrix : numpy.ndarray
        2-D array of co-occurrence counts (square in the notebook's usage).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``pair_matrix``.

    Notes
    -----
    A row or column that sums to zero makes the denominator zero. NumPy then
    yields ``inf``/``nan`` (with a RuntimeWarning) rather than raising.

    Progress is reported every 100 rows by clearing the cell output and
    printing the row count and the current row.
    """
    n_rows = pair_matrix.shape[0]
    M = np.zeros(pair_matrix.shape)
    NS = np.sum(pair_matrix)
    Nwi_vec = np.sum(pair_matrix, axis=1)
    Nwj_vec = np.sum(pair_matrix, axis=0)

    for i in range(n_rows):
        if i % 100 == 0:
            clear_output()
            print(n_rows, i)
        # Compute the diagonal and everything right of it for this row in one
        # vectorised step. Working row by row keeps temporaries small, where
        # a whole-matrix expression would allocate several V x V arrays.
        M[i, i:] = np.log(
            ((pair_matrix[i, i:] + 1) * NS) / (Nwi_vec[i] * Nwj_vec[i:])
        )
        # Rows 0..i-1 are already complete, so column i above the diagonal
        # holds the values to mirror into this row's lower-triangle part.
        M[i, :i] = M[:i, i]

    return M

In [6]:
%%time
vocab, wordlist = get_text_vocab(Text, 500)

Wall time: 3min 6s


In [7]:
# Co-occurrence counts of vocabulary words over the full token list.
pair_matrix = get_word_pair_matrix(wordlist=wordlist, vocab=vocab)

124301826 124300000


In [8]:
# Convert the raw co-occurrence counts into the log-ratio matrix.
M = get_M(pair_matrix=pair_matrix)

13201 13200


In [9]:
# Rank-50 truncated SVD of M. svds accepts a dense ndarray directly, so M is
# passed as-is. M is filled with logarithms and has essentially no zero
# entries, so a CSR copy would save no space and would add an index array.
U, s, V = svds(M, k=50)
S = np.diag(s)
# Word embeddings: each left singular vector scaled by its singular value.
# Broadcasting U * s gives the same product as np.matmul(U, S).
W = U * s

In [10]:
# Word -> row index lookup, following the iteration order of vocab (the same
# order used for the rows of W).
keys = [*vocab]
values = [*range(len(keys))]
dicti = {word: idx for word, idx in zip(keys, values)}
# All-pairs distances between embedding rows, using pairwise_distances'
# default metric.
dist_matrix = pairwise_distances(W)

In [11]:
print("The closes words to physics, republican, einstein, algebra, fish are ... \n")
# Build the index -> word array once rather than once per query word.
_vocab_words = np.asarray(list(vocab))
for _query in ('physics', 'republican', 'einstein', 'algebra', 'fish'):
    # argsort orders all words by increasing distance from the query; show
    # the first six entries of that ordering.
    print(_vocab_words[np.argsort(dist_matrix[dicti[_query]])[0:6]])



The closes words to physics, republican, einstein, algebra, fish are ... 

['physics' 'mathematics' 'mathematical' 'sciences' 'analysis' 'quantum']
['republican' 'presidential' 'secretary' 'senator' 'representative'
 'democratic']
['einstein' 'relativity' 'marx' 'astronomical' 'darwin' 'mathematicians']
['algebra' 'finite' 'geometry' 'dimensional' 'notation' 'matrix']
['fish' 'plants' 'wild' 'trees' 'animal' 'plant']


In [12]:
def get_analogies(a1, a2, b1, topn=5, exclude_inputs=False):
    """Answer the analogy ``a1 : a2 :: b1 : ?`` by vector arithmetic.

    The target point is ``W[a2] - W[a1] + W[b1]``; vocabulary words are ranked
    by Euclidean distance of their embedding row to that point.

    Parameters
    ----------
    a1, a2, b1 : str
        Query words. Each must be a key of the module-level ``dicti``.
    topn : int, optional
        Number of nearest words to return. Defaults to 5.
    exclude_inputs : bool, optional
        If True, ``a1``, ``a2`` and ``b1`` are removed from the candidates
        before the top ``topn`` are taken. Defaults to False, which can return
        the query words themselves.

    Returns
    -------
    numpy.ndarray
        Up to ``topn`` words, nearest first.

    Raises
    ------
    KeyError
        If any query word is not in ``dicti``.

    Notes
    -----
    Reads the module-level ``W`` (embedding rows), ``dicti`` (word -> row
    index) and ``vocab`` (words in row order).
    """
    row_a1, row_a2, row_b1 = dicti[a1], dicti[a2], dicti[b1]
    target = W[row_a2] - W[row_a1] + W[row_b1]
    order = np.argsort(np.linalg.norm(W - target, axis=1))
    if exclude_inputs:
        # Drop the query rows but keep the distance ordering of the rest.
        order = order[~np.isin(order, [row_a1, row_a2, row_b1])]
    return np.asarray(list(vocab))[order[:topn]]

In [13]:
# Each entry pairs the prompt line to print with the (a1, a2, b1) query it
# describes.
for _prompt, _query in (
    ("man : woman :: boy : ?", ("man", "woman", "boy")),
    ("die : death :: live : ?", ("die", "death", "live")),
    ("mother : father :: daughter : ?", ("mother", "father", "daughter")),
):
    print(_prompt)
    print(get_analogies(*_query))



man : woman :: boy : ?
['girls' 'boys' 'baby' 'broadway' 'cat']
die : death :: live : ?
['children' 'year' 'people' 'day' 'film']
mother : father :: daughter : ?
['son' 'daughter' 'father' 'wife' 'brother']
