In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import sklearn
from gensim.models import ldamodel
import gensim.corpora
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
import pickle

In [2]:
with open('data/stop_words.txt', 'r') as f:
    stopwords = [line.strip() for line in f]
stopwords += [' ', '的', '是', ',', '。', '_', '-', '、', '，', '“', '”', '.', '》', '《', '']

In [3]:
data = []
with open('data/n_output.txt', 'r') as f:
    for line in f:
        text = [x.strip() for x in line.split(' ') if x not in stopwords]
        data.append(text)

In [18]:
num_topics = 10

In [19]:
id2word = gensim.corpora.Dictionary(data)
corpus = [id2word.doc2bow(text) for text in data]
lda = ldamodel.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

In [20]:
def get_lda_topics(model, num_topics):
    word_dict = {}
    for i in range(num_topics):
        words = model.show_topic(i, topn = 20)
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = [i[0] for i in words]
    return pd.DataFrame(word_dict)

In [21]:
get_lda_topics(lda, num_topics)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,～,。,了,？,喜欢,dw,—,…,=,觉得
1,爱,粉,rs,说,吗,xw,！,-,~,说
2,[,楼,蓝海,黑,会,。,;,:,哈哈哈,。
3,],惹,啊,踩,dxw,说,&#,转,【,的
4,·,快,太,麻袋,唱,没有,&,转发,】,感觉
5,（,wf,真的,掐,想,一个,@,http,48,没
6,）,红,声音,水产,说,现在,年,唯饭,DW,挺
7,！,肝,真是,属性,？,粉丝,好听,美帝,咯,喜欢
8,么,反正,恶心,买,听,公司,期待,rr,图片,真的
9,你家,镜头,死,贴,low,没,肝肝,唯,截图,视频


In [22]:
train_sentences = [' '.join(text) for text in data]

In [23]:
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
x_counts = vectorizer.fit_transform(train_sentences)

In [24]:
transformer = TfidfTransformer(smooth_idf=False)
x_tfidf = transformer.fit_transform(x_counts)

In [25]:
xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)

In [26]:
#obtain a NMF model.
model = NMF(n_components=num_topics, init='nndsvd');
#fit the model
model.fit(xtfidf_norm)

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0,
  max_iter=200, n_components=10, random_state=None, shuffle=False,
  solver='cd', tol=0.0001, verbose=0)

In [27]:
def get_nmf_topics(model, n_top_words):
    
    #the word ids obtained need to be reverse-mapped to the words so we can print the topic names.
    feat_names = vectorizer.get_feature_names()
    
    word_dict = {};
    for i in range(num_topics):
        
        #for each topic, obtain the largest values, and add the words they map to into the dictionary.
        words_ids = model.components_[i].argsort()[:-20 - 1:-1]
        words = [feat_names[key] for key in words_ids]
        word_dict['Topic # ' + '{:02d}'.format(i+1)] = words;
    
    return pd.DataFrame(word_dict);

In [28]:
get_nmf_topics(model, 20)

Unnamed: 0,Topic # 01,Topic # 02,Topic # 03,Topic # 04,Topic # 05,Topic # 06,Topic # 07,Topic # 08,Topic # 09,Topic # 10
0,哈哈哈,xw,图片,真的,dw,喜欢,一个,好看,没有,哈哈哈哈
1,哈哈,gg,一下,觉得,gg,感觉,知道,觉得,楼主,知道
2,可爱,可爱,看到,知道,知道,觉得,觉得,今天,知道,看到
3,卧槽,觉得,可怕,感觉,觉得,知道,楼主,衣服,现在,可爱
4,搞笑,感觉,知道,现在,现在,这种,看到,dxw,觉得,楼主
5,好笑,今天,微博,gg,今天,楼主,现在,感觉,今天,卧槽
6,尴尬,看到,今天,看到,感觉,现在,不会,gg,感觉,搞笑
7,xswl,知道,真是,不要,不要,gg,不要,现在,不要,好好笑
8,妈呀,真是,这是,可爱,看到,特别,感觉,长得,看到,好像
9,真是,应该,感觉,不会,不会,这张,两个,眼睛,gg,真是
