In [1]:
import numpy as np
import nltk
from nltk import wordpunct_tokenize
import pandas as pd
#punctuation remove setting
import string
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine

#建立類別
class WordVector():
    def __init__(self, filename, window=2, dim=10):
        self.filename = filename
        self.window = window
        self.dim = dim

    def q21_cooccur_matrix(self):
        '''
        Arguments
            filename (str): the filename of an English article
            window (int): context window, define the range of the context
        Returns
            vocab (dict): map keyword into id
            inv_vocab (dict): map id back to keyword
            cooccur_matrix (np.ndarray): NxN co-occurrence matrix
        '''
      
        df = pd.read_table(self.filename,header =None)
        #全部變為小寫
        li_n=list()
        for line in df[0]:
        
            li_n.append(line.lower())
        allword = list()
        for article in li_n:
            punct_token = wordpunct_tokenize(article)
            #remove string.punctuation
            punct_token = [word for word in punct_token if word not in string.punctuation]
            allword.append(punct_token)
        #建一個list來放
        count = list()
        for words in allword:
            for i in range(len(words)-1):
                for j in range(i-self.window,i+1+self.window):
                    if (j<0 or j==i) :
                        continue
                    elif j >len(words)-1:
                        break
                    else:
                        count.append(([words[i],words[j]]))
        vocab = {}
        i=0
        for words in count:
            for word in words:
                if word not in vocab: 
                    vocab[word] = i
                    i+=1
        
        inv_vocab = {v: k for k, v in vocab.items()}
        #建立一個全0的array
        cooccur_matrix = np.zeros([len(vocab),len(vocab)])
        
        for sets in count:
            cooccur_matrix[vocab[sets[0]],vocab[sets[1]]]+=1
            cooccur_matrix[vocab[sets[1]],vocab[sets[0]]]+=1
        
    
    
        return vocab, inv_vocab, cooccur_matrix

    
    

    def q22_word_vectors(self):
        '''
        Arguments
            cooccur_matrix (np.ndarray): NxN co-occurrence matrix
            dim (int): dimension of PCA
        Returns
            word_vector (np.ndarray): Nxdim word-vector matrix
        '''
        cooccur_matrix =self.q21_cooccur_matrix()[2]
        
        # Use PCA to reduce dimension
        pca=PCA(n_components=self.dim)
        word_vector=pca.fit_transform(cooccur_matrix)
        return word_vector
        
    

    def q23_similarity(self,word):
        '''
        Arguments
            word (str): input keyword
            word_vectors (np.ndarray): Nxdim word-vector matrix
            vocab (dict): map keyword into id
            inv_vocab (dict): map id back to keyword
        Returns
            top3: list of 3 tuple, each tuple consists of (word, similarity)
        '''
        self.word = word
        word_vector = self.q22_word_vectors()
        vocab , inv_vocab =self.q21_cooccur_matrix()[0:2]
        #字詞間的相似度
        dist_out = 1-pairwise_distances(word_vector, metric="cosine")
        #透過argsort找到array排序的前三的位置, 有負號是表示降幕,[1:4],是因為位置0是自己
        pos = np.argsort(-dist_out[vocab[self.word]])[1:4]
        top3 = list([(inv_vocab[pos[0]],dist_out[vocab[self.word],pos[0]]),
                (inv_vocab[pos[1]],dist_out[vocab[self.word],pos[1]]),
                (inv_vocab[pos[2]],dist_out[vocab[self.word],pos[2]])])
        return top3
    
    def most_similar(self,word):
        self.word = word
        top3 = self.q23_similarity(self.word)
        return top3
    
    

        




In [2]:
wv = WordVector('raw_sentences.txt')
for word, sim in wv.most_similar('office'):
    print(word, sim)


group 0.99987421894631
director 0.9998301731167207
center 0.9998272397288703
