In [1]:
import re, random
import numpy as np

In [2]:
with open('kerajaan','r') as fopen:
    kerajaan = list(filter(None, fopen.read().split('\n')))

In [3]:
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()

kerajaan = [clearstring(i) for i in kerajaan]

In [5]:
def penalty(M, mu):
    return np.where(M>=mu,0, np.min(M - mu, 0))

def grads(M, W, H, lam, mu):
    R = W.dot(H) - M
    return R.dot(H.T) + penalty(W, mu)*lam, W.T.dot(R) + penalty(H, mu)*lam

def upd(M, W, H, lr, lam, mu):
    dW,dH = grads(M,W,H,lam,mu)
    W -= lr*dW
    H -= lr*dH
    
def tfidf(corpus):
    vocabulary = list(set(' '.join(corpus).split()))
    idf = {}
    for i in vocabulary:
        idf[i] = 0
        for k in corpus:
            if i in k.split():
                idf[i] += 1
        idf[i] = np.log(idf[i] / len(corpus))
    tfidf = np.zeros((len(corpus),len(vocabulary)))
    for no, i in enumerate(corpus):
        for text in i.split():
            tfidf[no, vocabulary.index(text)] += 1
        for text in i.split():
            tfidf[no, vocabulary.index(text)] = tfidf[no, vocabulary.index(text)] * idf[text]
    return vocabulary, tfidf

def bow(corpus):
    vocabulary = list(set(' '.join(corpus).split()))
    bow = np.zeros((len(corpus),len(vocabulary)))
    for no, i in enumerate(corpus):
        for text in i.split():
            bow[no, vocabulary.index(text)] += 1
    return vocabulary, bow

In [7]:
def show_topics(corpus, count=10, k_words=10, use_tfidf=True, penalty=1e-6, learning_rate=1e-6,
               lam=1e3,epoch=50):
    if use_tfidf:
        vocab, vectors = tfidf(corpus)
    else:
        vocab, vectors = bow(corpus)
    print('vectors shape:',vectors.shape)
    m, n = vectors.shape
    W = np.abs(np.random.normal(scale=0.01, size=(m,count)))
    H = np.abs(np.random.normal(scale=0.01, size=(count,n)))
    for i in range(epoch):
        upd(vectors,W,H,learning_rate,lam,penalty)
        print('epoch:',i, W.min(), H.min())
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-k_words-1:-1]]
    topic_words = ([top_words(t) for t in H])
    return [' '.join(t) for t in topic_words]

In [8]:
show_topics(kerajaan)

vectors shape: (6957, 16212)
epoch: 0 -4.1044924381583935e-07 -2.90772193167458e-06
epoch: 1 -7.8695576390063e-06 -6.253696457086038e-06
epoch: 2 -3.4228037418379374e-05 -9.596089298466051e-06
epoch: 3 -6.055980410656732e-05 -1.3333561301495036e-05
epoch: 4 -8.686499916818119e-05 -2.0944640100188828e-05
epoch: 5 -0.00011314376380983153 -2.8547641497971757e-05
epoch: 6 -0.00013939623898103027 -3.6142573639530124e-05
epoch: 7 -0.00016562256537509216 -4.3729444662373384e-05
epoch: 8 -0.0001918228834300349 -5.130826269686379e-05
epoch: 9 -0.00021799733332947895 -5.887903586627732e-05
epoch: 10 -0.0002441460550035473 -6.64417722868597e-05
epoch: 11 -0.00027026918812976373 -7.39964800678866e-05
epoch: 12 -0.0002963668721339562 -8.154316731171842e-05
epoch: 13 -0.00032243924619115133 -8.908184211389151e-05
epoch: 14 -0.0003484864492264663 -9.661251256313463e-05
epoch: 15 -0.0003745086199160063 -0.00010413518674145421
epoch: 16 -0.0004005058966877604 -0.00011164987272422302
epoch: 17 -0.000426

['jpm acucomei tilawah aadkkedah singapore mnjadi matches gerus support sukacita',
 'yef2016 chill bersinar saudara2 episod harmed rm26j pu3uampang bnm blake',
 'cth peneraju two pemandngan fasal meru witnessing petrol tersebut triples',
 'theyve buloh merbokjaguar nasionalfm wawrinka kepada harithiskander exert providing rencana',
 'bistro prktanjongdatu betul ringankan pilatus pergigian february bakul ayaq rai',
 'airforcenextgen peneraju beijingtianjin gulai perkuburan jwtn ummah sibu dvm funding',
 'peneraju cleanliness stiap ahead lembu pangkalan beserta computers insan denyutan',
 'almari 754 cities programs 132 penyelenggaraan penerangan totally vivekananda simptom',
 'declaration 012017 akademi jkpd card perlis ditetapkan 4852 dipandu sai',
 'padi tudung didahului herman mbm jannahhishammuddinh2o tapah hartanah sebat jadilah']