In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np
import sklearn.datasets
import re

In [2]:
def clearstring(string):
    """Normalise one line of text to quote/alphanumeric tokens joined by single spaces.

    Every character other than ASCII letters, digits, spaces, and single or
    double quotes is deleted. The remainder is split on spaces, empty pieces
    are dropped, and the surviving tokens are re-joined with one space each,
    so the result has no leading, trailing, or repeated spaces.

    Args:
        string: The raw text to clean.

    Returns:
        The cleaned text; an empty string when nothing survives.
    """
    kept = re.sub('[^\'\"A-Za-z0-9 ]+', '', string)
    # Empty pieces come from consecutive, leading, or trailing spaces.
    tokens = (piece.strip() for piece in kept.split(' ') if piece)
    return ' '.join(tokens)

def separate_dataset(trainset):
    """Explode each document into cleaned lines, repeating its label per line.

    Every entry of ``trainset.data`` is split on newlines; empty lines are
    discarded and the rest are passed through ``clearstring``. The matching
    entry of ``trainset.target`` is emitted once for every kept line.

    Note that empty lines are filtered *before* cleaning, so a line made up
    only of characters that ``clearstring`` removes is kept as an empty string.

    Args:
        trainset: Object exposing parallel ``data`` (strings) and ``target``
            sequences.

    Returns:
        A ``(lines, labels)`` tuple of two equally long lists.
    """
    datastring, datatarget = [], []
    for doc_idx, document in enumerate(trainset.data):
        cleaned_lines = [clearstring(raw) for raw in document.split('\n') if raw]
        datastring.extend(cleaned_lines)
        # One label per line; skipped entirely when the document has no lines.
        if cleaned_lines:
            datatarget.extend([trainset.target[doc_idx]] * len(cleaned_lines))
    return datastring, datatarget

In [3]:
# Read the corpus from the local `data` directory, decoding files as UTF-8.
trainset = sklearn.datasets.load_files(container_path='data', encoding='UTF-8')

# Replace the per-file documents/labels with one entry per cleaned line.
trainset.data, trainset.target = separate_dataset(trainset)

# Sanity check: the label names, and that texts and labels stay aligned.
print(trainset.target_names)
print(len(trainset.data))
print(len(trainset.target))

['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']
416809
416809


In [4]:
# TF-IDF features (input for NMF): drop English stop words, terms appearing in
# more than 95% of the lines, and terms appearing in fewer than 2 lines.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(trainset.data)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back on releases that predate it (< 1.0).
tfidf_feature_names = (
    tfidf_vectorizer.get_feature_names_out()
    if hasattr(tfidf_vectorizer, 'get_feature_names_out')
    else tfidf_vectorizer.get_feature_names()
)

In [6]:
# Raw term counts (input for LDA): drop English stop words, terms appearing in
# more than 95% of the lines, and terms appearing in fewer than 2 lines.
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
tf = tf_vectorizer.fit_transform(trainset.data)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its replacement
# `get_feature_names_out()` and fall back on releases that predate it (< 1.0).
tf_feature_names = (
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)

In [9]:
# Factorise the TF-IDF matrix into 5 topics with deterministic NNDSVD init.
# NOTE(review): newer scikit-learn releases replaced `alpha` with
# `alpha_W` / `alpha_H` — confirm against the installed version.
nmf = NMF(
    n_components=5,
    random_state=1,
    alpha=.1,
    l1_ratio=.5,
    init='nndsvd',
).fit(tfidf)

In [8]:
# Fit a 5-topic LDA model on the raw term counts using online variational Bayes.
# `n_components` replaces the former `n_topics` keyword, which scikit-learn
# deprecated in 0.19 and removed in 0.21 (passing it now raises TypeError).
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
).fit(tf)



In [10]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` this prints a ``Topic <index>:``
    header followed by one line holding the ``no_top_words`` terms with the
    largest weights, in descending weight order, separated by spaces.

    Args:
        model: Object with a ``components_`` attribute whose rows support
            ``argsort()`` (one row per topic).
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many terms to print per topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        top_columns = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[col] for col in top_columns]
        print("Topic %d:" % (topic_idx))
        print(" ".join(top_terms))

In [11]:
# Show the 20 strongest terms of each NMF topic.
display_topics(nmf, tfidf_feature_names, no_top_words=20)

Topic 0:
feel know want time people don think life make dont way love little things need help did happy good didnt
Topic 1:
feeling little bit ive pretty today left time day remember quite lately overwhelmed woke know stressed think started good got
Topic 2:
im sure right going feeling starting today generous doing ill tired glad dont trying happy pretty getting sorry moment mellow
Topic 3:
like feel feels ive person punished missed people hated dont supporting doing fake ve boring life going things felt tortured
Topic 4:
just really want dont know feels right feelings guess think weird didnt wanted maybe thinking need say wish bad talk


In [12]:
# Show the 20 strongest terms of each LDA topic.
display_topics(lda, tf_feature_names, no_top_words=20)

Topic 0:
feel feeling love days world times home guilty just pain day heart went know horrible homesick awful blog woke sleep
Topic 1:
feel like just im really ive time life make things want people pretty think dont help need doing person feels
Topic 2:
feel feeling like right sad hate lonely emotional tired night wanted miserable unhappy missed terrible place guess crappy god felt
Topic 3:
feeling im little bit just today really left quite stressed started going look sure try day morning got shitty groggy
Topic 4:
feel know way feelings don feeling think ive time people want hurt dont depressed hated sorry long come week angry
