In [1]:
import re, random
import numpy as np

In [2]:
# Load the raw corpus: one entry per non-empty line of the local file 'kerajaan'.
with open('kerajaan','r') as fopen:
    kerajaan = [line for line in fopen.read().split('\n') if line]

In [3]:
def clearstring(string):
    """Normalise ``string`` to lower-case words separated by single spaces.

    Every character other than ASCII letters, ASCII digits and the space
    character is deleted (not replaced by a space), so text joined only by
    punctuation is merged into one word. Runs of spaces collapse to one and
    leading/trailing spaces disappear.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    kept_chars = re.sub('[^A-Za-z0-9 ]+', '', string)
    # Splitting on a literal space produces empty pieces for repeated spaces;
    # those are discarded before re-joining.
    words = [piece.strip() for piece in kept_chars.split(' ') if piece]
    return ' '.join(words).lower()

# Replace every raw line with its cleaned, lower-cased form (same name is reused).
kerajaan = list(map(clearstring, kerajaan))

In [4]:
def penalty(M, mu):
    """Elementwise shortfall of ``M`` below the threshold ``mu``.

    Entries that are at least ``mu`` map to 0; an entry below ``mu`` maps to
    its own (negative) difference ``M - mu``.

    The earlier version used ``np.min(M - mu, 0)``. The second positional
    argument of ``np.min`` is ``axis``, so that call reduced over axis 0 and
    every below-threshold entry received the minimum of its whole column
    instead of its own shortfall.

    Parameters
    ----------
    M : numpy.ndarray
        Array to test against the threshold.
    mu : float
        Lower threshold.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``M``.
    """
    return np.where(M >= mu, 0, M - mu)

def grads(M, W, H, lam, mu):
    """Return the update directions ``(dW, dH)`` for the factors ``W`` and ``H``.

    With the residual ``R = W.dot(H) - M``, the direction for ``W`` is
    ``R.dot(H.T)`` and the direction for ``H`` is ``W.T.dot(R)``; each has
    ``lam`` times ``penalty(factor, mu)`` added to it.

    Parameters
    ----------
    M : numpy.ndarray
        Matrix being approximated by ``W.dot(H)``.
    W, H : numpy.ndarray
        Current factors.
    lam : float
        Weight of the penalty term.
    mu : float
        Threshold forwarded to ``penalty``.

    Returns
    -------
    tuple
        ``(dW, dH)``, in that order.
    """
    residual = W.dot(H) - M
    grad_W = residual.dot(H.T) + penalty(W, mu)*lam
    grad_H = W.T.dot(residual) + penalty(H, mu)*lam
    return grad_W, grad_H

def upd(M, W, H, lr, lam, mu):
    """Apply one in-place descent step to ``W`` and ``H``.

    Both directions are computed by ``grads`` from the current factors before
    either factor is modified. Nothing is returned: the caller's arrays are
    updated directly.

    Parameters
    ----------
    M : numpy.ndarray
        Matrix being approximated.
    W, H : numpy.ndarray
        Factors, modified in place.
    lr : float
        Step size.
    lam, mu : float
        Penalty weight and threshold, forwarded to ``grads``.
    """
    step_W, step_H = grads(M, W, H, lam, mu)
    H -= lr*step_H
    W -= lr*step_W
    
def tfidf(corpus):
    """Build a TF-IDF matrix for whitespace-tokenised documents.

    The weight of word ``w`` in document ``d`` is
    ``count(w in d) * log(N / df(w))`` where ``N`` is the number of documents
    and ``df(w)`` the number of documents containing ``w``. A word present in
    every document therefore gets weight 0, and all weights are non-negative.

    Fixes relative to the earlier version:

    * the IDF was ``log(df / N)``, the negation of the usual definition, which
      made every weight non-positive;
    * the IDF was multiplied in once per *occurrence*, so a word seen ``k``
      times in a document ended up as ``k * idf**k`` instead of ``k * idf``;
    * the vocabulary is now sorted, so column order is reproducible between
      runs (set iteration order of strings is not);
    * column lookup uses a dict instead of ``list.index`` inside the loops.

    Parameters
    ----------
    corpus : sequence of str
        Documents; each is split on whitespace.

    Returns
    -------
    vocabulary : list of str
        Sorted distinct words; position ``j`` labels column ``j``.
    matrix : numpy.ndarray
        Float array of shape ``(len(corpus), len(vocabulary))``.
    """
    tokenized = [doc.split() for doc in corpus]
    vocabulary = sorted({tok for toks in tokenized for tok in toks})
    column = {tok: j for j, tok in enumerate(vocabulary)}
    n_docs = len(tokenized)

    # Document frequency: each document counts at most once per word.
    doc_freq = dict.fromkeys(vocabulary, 0)
    for toks in tokenized:
        for tok in set(toks):
            doc_freq[tok] += 1

    # Every vocabulary word occurs in at least one document, so doc_freq >= 1.
    idf = np.array([np.log(n_docs / doc_freq[tok]) for tok in vocabulary])

    weights = np.zeros((n_docs, len(vocabulary)))
    for row, toks in enumerate(tokenized):
        for tok in toks:
            weights[row, column[tok]] += 1

    # Scale each column by its IDF exactly once.
    weights *= idf
    return vocabulary, weights

def bow(corpus):
    """Build a bag-of-words count matrix for whitespace-tokenised documents.

    Changes relative to the earlier version: the vocabulary is sorted so that
    column order is reproducible between runs (set iteration order of strings
    is not), and column lookup uses a dict instead of ``list.index`` inside
    the token loop. The counts themselves are unchanged.

    Parameters
    ----------
    corpus : sequence of str
        Documents; each is split on whitespace.

    Returns
    -------
    vocabulary : list of str
        Sorted distinct words; position ``j`` labels column ``j``.
    matrix : numpy.ndarray
        Float array of shape ``(len(corpus), len(vocabulary))`` where entry
        ``[i, j]`` is the number of times word ``j`` occurs in document ``i``.
    """
    tokenized = [doc.split() for doc in corpus]
    vocabulary = sorted({tok for toks in tokenized for tok in toks})
    column = {tok: j for j, tok in enumerate(vocabulary)}
    counts = np.zeros((len(tokenized), len(vocabulary)))
    for row, toks in enumerate(tokenized):
        for tok in toks:
            counts[row, column[tok]] += 1
    return vocabulary, counts

In [5]:
def find_sentences(keyword, corpus):
    """Collect the words of every corpus entry that contains ``keyword``.

    An entry is selected when ``keyword`` occurs anywhere in it as a
    substring. From the selected entries, only whitespace-separated tokens
    longer than one character and made up solely of ASCII letters, ``_`` and
    ``-`` are kept.

    Parameters
    ----------
    keyword : str
        Substring to look for.
    corpus : iterable of str
        Entries to search.

    Returns
    -------
    str
        The kept tokens, in corpus order, joined by single spaces (empty
        string when nothing is kept).
    """
    word_pattern = re.compile("^[a-zA-Z_-]*$")
    kept = []
    for sentence in corpus:
        if keyword not in sentence:
            continue
        for token in sentence.split():
            if len(token) > 1 and word_pattern.match(token):
                kept.append(token)
    return ' '.join(kept)

In [15]:
def compare(string1, string2, corpus, use_tfidf=True, epoch=50, learning_rate=1e-6, lam=1e3, penalty=1e-6, seed=None):
    """Score how similar the corpus contexts of two keywords are.

    For each keyword the matching corpus entries are merged into one query
    document (``find_sentences``). The two documents are vectorised, the
    2-row matrix is approximated by a rank-2 product ``W.dot(H)`` using
    ``epoch`` calls to ``upd``, and the angle between the two rows of that
    product is mapped to a score.

    Changes relative to the earlier version:

    * the cosine is clipped to ``[-1, 1]`` before ``arccos``; floating-point
      round-off slightly above 1 previously produced NaN;
    * a zero-norm row (for example when the vocabulary is empty) now returns
      NaN explicitly instead of dividing 0 by 0 with a runtime warning;
    * optional ``seed`` makes the random initialisation reproducible.

    Parameters
    ----------
    string1, string2 : str
        Keywords to compare.
    corpus : iterable of str
        Cleaned corpus entries.
    use_tfidf : bool
        Vectorise with ``tfidf`` when true, otherwise with ``bow``.
    epoch : int
        Number of update steps.
    learning_rate : float
        Step size passed to ``upd``.
    lam : float
        Penalty weight passed to ``upd``.
    penalty : float
        Threshold passed to ``upd`` as ``mu``. Note that inside this function
        the name shadows the module-level ``penalty`` function.
    seed : int or None
        When given, the initial factors are drawn from
        ``np.random.RandomState(seed)``; when ``None`` the global
        ``np.random`` state is used, as before.

    Returns
    -------
    float
        ``abs(1 - angle / (pi / 2))``, which lies in ``[0, 1]``, or NaN when
        either reconstructed row has zero norm.
    """
    queries = [find_sentences(string1, corpus), find_sentences(string2, corpus)]
    vectorise = tfidf if use_tfidf else bow
    _, vectors = vectorise(queries)
    m, n = vectors.shape

    rng = np.random if seed is None else np.random.RandomState(seed)
    # W is drawn before H, matching the original draw order.
    W = np.abs(rng.normal(scale=0.01, size=(m, 2)))
    H = np.abs(rng.normal(scale=0.01, size=(2, n)))
    for _ in range(epoch):
        upd(vectors, W, H, learning_rate, lam, penalty)

    approx = W.dot(H)
    first, second = approx[0], approx[1]
    denom = np.linalg.norm(first) * np.linalg.norm(second)
    if denom == 0:
        # The angle is undefined for a zero vector.
        return float('nan')
    # Guard against round-off pushing the cosine just outside arccos' domain.
    cosine = np.clip(np.dot(first, second) / denom, -1.0, 1.0)
    angle = float(np.arccos(cosine))
    return abs(1 - angle / float(np.pi / 2))

In [16]:
# Self-comparison baseline: both keywords are 'kedah'. The score differs from
# run to run because compare() starts from randomly drawn factors.
compare(string1='kedah', string2='kedah', corpus=kerajaan)

0.896504454896407

In [17]:
# Context similarity of 'kedah' and 'dap'; keywords match as substrings.
compare(string1='kedah', string2='dap', corpus=kerajaan)

0.923361608472873

In [18]:
# Context similarity of 'kedah' and 'bn'; keywords match as substrings.
compare(string1='kedah', string2='bn', corpus=kerajaan)

0.7958667334387592