In [1]:
import numpy as np
from scipy.linalg import svd, inv
import re, random

In [2]:
with open('kerajaan','r') as fopen:
    kerajaan = list(filter(None, fopen.read().split('\n')))

In [3]:
def clearstring(string):
    string = re.sub('[^A-Za-z0-9 ]+', '', string)
    string = string.split(' ')
    string = filter(None, string)
    string = [y.strip() for y in string]
    string = ' '.join(string)
    return string.lower()

kerajaan = [clearstring(i) for i in kerajaan]

In [4]:
class LSA:
    def __init__(self, corpus, tfidf=False):
        self.corpus = corpus
        self.vocabulary = list(set(' '.join(self.corpus).split()))
        if tfidf:
            self._tfidf()
        else:
            self._bow()
        self._calc_svd()
        
    def _calc_svd(self):
        self.U, self.S, self.Vt = svd(self.tfidf.T, full_matrices =False)
        
    def _bow(self):
        self.tfidf = np.zeros((len(self.corpus),len(self.vocabulary)))
        for no, i in enumerate(self.corpus):
            for text in i.split():
                self.tfidf[no, self.vocabulary.index(text)] += 1
    
    def _tfidf(self):
        idf = {}
        for i in self.vocabulary:
            idf[i] = 0
            for k in self.corpus:
                if i in k.split():
                    idf[i] += 1
            idf[i] = np.log(idf[i] / len(self.corpus))
        self.tfidf = np.zeros((len(self.corpus),len(self.vocabulary)))
        for no, i in enumerate(self.corpus):
            for text in i.split():
                self.tfidf[no, self.vocabulary.index(text)] += 1
            for text in i.split():
                self.tfidf[no, self.vocabulary.index(text)] = self.tfidf[no, self.vocabulary.index(text)] * idf[text]

In [5]:
def show_topics(corpus, count=10, k_words=10, tfidf=False):
    lsa = LSA(corpus,tfidf=tfidf)
    vectors = lsa.Vt[:count]
    top_words = lambda t: [lsa.vocabulary[i] for i in np.argsort(t)[:-k_words-1:-1]]
    topic_words = ([top_words(t) for t in vectors])
    return [' '.join(t) for t in topic_words]

In [6]:
show_topics(kerajaan)

['wanitaumnomsia saudarafitri sejam ydpa bndr doping isteri dijadikan berkisar menempatkan',
 'wanitaumnomsia sejam ydpa bndr doping saudarafitri isteri berkisar dijadikan menempatkan',
 'brgn mnjlnkn anthem jim hate underwood patutlah unprecedented elias seed',
 'institut 39yo wohoo wajar mengiringi impact onehomeless gapena kedaulatan pipit',
 'queen terhdp hantar salute consist kebrangkatan perbadanan elektrik pertikaian dayabumi',
 'bombardierdauphin global car barangan belibelah modul armizanameer kjg rachel betterpeacekeeping',
 'sekecil ketibaan penguasaan practices bomba bajet2017 duli menyerikan belasah sachs',
 'terhdp munliv menyempurnakan dilarikan menerajuiperubahan pandan themthis pendekar fiskal berpendidikan',
 'kritikal haizzz theirs pjm appreciation dahulu malaysiansunitedrun2017 diberikan seorng boarding',
 'seringgit jomcny list cleaner sidek waterfront hamidi thorough bpmonline hassanrouhani']