In [1]:
import numpy as np
from scipy.linalg import svd, inv
import re, random

In [2]:
# Read the corpus file and keep one entry per non-empty line.
with open('kerajaan', 'r') as fopen:
    kerajaan = [line for line in fopen.read().split('\n') if line]

In [3]:
def clearstring(string):
    """Normalise a sentence to lower-case alphanumeric words.

    Every character other than ASCII letters, digits and the space
    character is removed, runs of spaces are collapsed to a single space,
    leading/trailing spaces are dropped, and the result is lower-cased.
    """
    cleaned = re.sub('[^A-Za-z0-9 ]+', '', string)
    # Splitting on a single space yields empty pieces for repeated spaces;
    # those are discarded before the words are joined back together.
    words = [piece.strip() for piece in cleaned.split(' ') if piece]
    return ' '.join(words).lower()

# Normalise every corpus line with clearstring.
kerajaan = list(map(clearstring, kerajaan))

In [4]:
class LSA:
    """Latent semantic analysis of a small corpus through an SVD.

    A document-term matrix is built from ``corpus`` (raw counts, or TF-IDF
    weights when ``tfidf=True``) and its transpose is factorised with
    ``scipy.linalg.svd(..., full_matrices=False)``.

    Parameters:
        corpus: list of document strings; tokens are separated by whitespace.
        tfidf: when true, weight counts by inverse document frequency;
            otherwise keep the raw bag-of-words counts.

    Attributes:
        corpus: the list passed in.
        vocabulary: sorted list of the distinct tokens in ``corpus``;
            column ``j`` of ``tfidf`` corresponds to ``vocabulary[j]``.
        tfidf: array of shape ``(len(corpus), len(vocabulary))``. Despite
            the name it holds raw counts when ``tfidf=False``.
        U, S, Vt: the factors of ``self.tfidf.T``.
    """

    def __init__(self, corpus, tfidf=False):
        self.corpus = corpus
        # Sorted rather than list(set(...)): set iteration order for strings
        # is not stable between interpreter runs, which made the column
        # order (and therefore U) irreproducible.
        self.vocabulary = sorted(set(' '.join(self.corpus).split()))
        if tfidf:
            self._tfidf()
        else:
            self._bow()
        self._calc_svd()

    def _calc_svd(self):
        """Factorise the term-document matrix (the transpose of ``tfidf``)."""
        self.U, self.S, self.Vt = svd(self.tfidf.T, full_matrices=False)

    def _term_counts(self):
        """Return the (documents x vocabulary) matrix of raw term counts."""
        # Dict lookup instead of list.index(), which scanned the whole
        # vocabulary for every token.
        column_of = {token: col for col, token in enumerate(self.vocabulary)}
        counts = np.zeros((len(self.corpus), len(self.vocabulary)))
        for row, document in enumerate(self.corpus):
            for token in document.split():
                counts[row, column_of[token]] += 1
        return counts

    def _bow(self):
        """Store raw term counts in ``self.tfidf``."""
        self.tfidf = self._term_counts()

    def _tfidf(self):
        """Store ``count * log(N / df)`` in ``self.tfidf``.

        ``N`` is the number of documents and ``df`` the number of documents
        containing the term. Each cell is weighted exactly once, however
        many times the term occurs in the document.
        """
        counts = self._term_counts()
        # Every vocabulary term comes from the corpus, so df >= 1 and the
        # division below cannot hit zero.
        doc_freq = (counts > 0).sum(axis=0)
        idf = np.log(len(self.corpus) / doc_freq)
        self.tfidf = counts * idf

In [5]:
def find_sentences(keyword, corpus):
    """Collect the words of every corpus entry that contains ``keyword``.

    Matching is by substring: an entry is selected whenever ``keyword``
    occurs anywhere inside it. From the selected entries only tokens made
    up solely of ASCII letters, underscores and hyphens, and longer than
    one character, are kept. The kept tokens are returned as a single
    space-separated string, in corpus order; the result is ``''`` when
    nothing matches.
    """
    kept = []
    for sentence in corpus:
        if keyword not in sentence:
            continue
        for token in sentence.split():
            if re.match("^[a-zA-Z_-]*$", token) and len(token) > 1:
                kept.append(token)
    return ' '.join(kept)

def compare(string1, string2, corpus, tfidf=False):
    """Score how similar the contexts of two keywords are within ``corpus``.

    For each keyword the matching corpus entries are gathered with
    ``find_sentences``; the two resulting pseudo-documents are embedded
    with ``LSA`` and the angle between their ``S . Vt`` columns is mapped
    to ``|1 - angle / (pi / 2)|``, so an angle of 0 gives 1.0 and a right
    angle gives 0.0.

    Parameters:
        string1, string2: keywords to look up in ``corpus``.
        corpus: list of document strings.
        tfidf: forwarded to ``LSA``.

    Returns:
        The similarity score, or nan when the score is undefined: a keyword
        yields no usable tokens, or one of the document vectors has zero
        length.
    """
    queries = [find_sentences(string1, corpus), find_sentences(string2, corpus)]
    if not all(queries):
        # An empty query is a zero vector (undefined angle); two empty
        # queries would hand an empty matrix to the SVD.
        return np.float64(np.nan)
    lsa = LSA(queries, tfidf=tfidf)
    S = np.diag(lsa.S)
    # Index the two document columns directly. Vt has a single row when the
    # combined vocabulary is one token, and iterating over its rows then
    # produced no second vector at all.
    first = np.dot(S, lsa.Vt[:, 0])
    second = np.dot(S, lsa.Vt[:, 1])
    scale = np.linalg.norm(first, 2) * np.linalg.norm(second, 2)
    if scale == 0:
        return np.float64(np.nan)
    # Rounding can push the cosine marginally outside [-1, 1], for which
    # arccos returns nan; clamp it first.
    cosine = np.clip(np.dot(first, second) / scale, -1.0, 1.0)
    angle = np.arccos(cosine)
    return np.abs(1 - float(angle) / float(np.pi / 2))

In [6]:
# Sanity check: the same keyword on both sides, so both queries are identical.
compare('kedah', 'kedah', kerajaan)

1.0

In [7]:
# Compare the corpus entries containing 'kedah' with those containing 'dap'.
compare('kedah', 'dap', kerajaan)

0.18372139960335687