In [1]:
# Imports

import numpy as np
import pandas as pd

In [2]:
# Load the cleaned corpus and report its dimensions before previewing it.
df = pd.read_csv('./data/cleaned_text.csv')
n_rows, n_cols = df.shape
print('Dataframe has {} rows and {} columns.'.format(n_rows, n_cols))
df.head(5)

Dataframe has 1403 rows and 1 columns.


Unnamed: 0,text
0,try find Qualia Mind sleep Oura ring mind shar...
1,food app ability retrieve nutritional info tex...
2,audiovideo record life capture bit obtrusive t...
3,major privacy concern little actual benefit pr...
4,introductory stat know study design affect con...


In [3]:
# Count missing values per column, then count rows whose text is exactly "".
null_counts = df.isnull().sum()
print("Checking for null values...\n{}\n".format(null_counts))

empty_text_rows = df['text'][df['text'] == ""]
print('Checking how many empty string values in text column of dataframe...\n{}'.format(empty_text_rows.shape[0]))

Checking for null values...
text    4
dtype: int64

Checking how many empty string values in text column of dataframe...
0


In [4]:
# Replace every missing value with a placeholder string so no NaN remains
# in the frame, then repeat the null / empty-string checks to confirm.
# NOTE(review): the placeholder is itself a word-like token, so a
# bag-of-words step run on this column will count it as a vocabulary
# term for the filled rows -- confirm that is intended.
PLACEHOLDER_TEXT = "supercalifragilisticexpialidocious"
df = df.fillna(PLACEHOLDER_TEXT)

null_counts = df.isnull().sum()
print("Checking for null values...\n{}\n".format(null_counts))

empty_text_rows = df['text'][df['text'] == ""]
print('Checking how many empty string values in text column of dataframe...\n{}'.format(empty_text_rows.shape[0]))

Checking for null values...
text    0
dtype: int64

Checking how many empty string values in text column of dataframe...
0


### Topic Modeling

In [5]:
# The text column is the corpus used for the steps below; preview the
# first five entries.
train = df['text']
train.head(5)

0    try find Qualia Mind sleep Oura ring mind shar...
1    food app ability retrieve nutritional info tex...
2    audiovideo record life capture bit obtrusive t...
3    major privacy concern little actual benefit pr...
4    introductory stat know study design affect con...
Name: text, dtype: object

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: cap the vocabulary at 10,000 terms and ignore any
# term whose document frequency exceeds 15% of the corpus.
cvect = CountVectorizer(max_features=10000, max_df=.15)
X = cvect.fit_transform(train)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement and fall back only on older installs.
if hasattr(cvect, "get_feature_names_out"):
    feature_names = cvect.get_feature_names_out()
else:
    feature_names = cvect.get_feature_names()

# Preview only: densify just the first five rows rather than the whole
# document-term matrix.
pd.DataFrame(X[:5].toarray(), columns=feature_names).head()

Unnamed: 0,a_st,a_xz,aaa,aaaamazing,aaaand,aaand,aaron,ab,abab,abandon,...,zinc,zip,zombie,zone,zoo,zoom,zpg,zw,ótimo,ಠ_ಠ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.decomposition import LatentDirichletAllocation

# Five-topic LDA using the batch learning method; the fixed random_state
# keeps the fitted topics repeatable between runs.
lda = LatentDirichletAllocation(
    n_components=5,
    max_iter=25,
    learning_method="batch",
    random_state=0,
)

# Fit on the document-term matrix and keep the transformed output.
document_topics = lda.fit_transform(X)

In [8]:
# Shape of the fitted topic-term matrix.
components_shape = lda.components_.shape
print("lda.components_.shape: {}".format(components_shape))

lda.components_.shape: (5, 9176)


In [10]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``, an iterable of 1-D
        arrays (one per topic) holding a weight for each feature.
    feature_names : sequence of str
        Vocabulary indexed the same way as the columns of
        ``model.components_``.
    n_top_words : int
        How many words to show per topic, strongest first.

    Prints one ``Topic #<idx>: <words>`` line per topic followed by a
    blank line. Returns ``None``.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending, so walk the tail backwards to get the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_indices)
        # Space after the colon keeps the label separate from the first word.
        print("Topic #%d: %s" % (topic_idx, top_words))
    print()

# `get_feature_names` was removed in scikit-learn 1.2; use the replacement
# when the installed version provides it.
vocab = (cvect.get_feature_names_out()
         if hasattr(cvect, "get_feature_names_out")
         else cvect.get_feature_names())
print_top_words(lda, vocab, 15)

Topic #0:quantified self read personal survey help share com model life research people theory year video
Topic #1:account message quantifiedself subreddit review automatically spam compose moderator karma comment bot hello request contact
Topic #2:google mood start export people help need lot add android log etc year tool activity
Topic #3:blood test gyroscope health battery glucose monitor apple arc measure need device lab run accurate
Topic #4:sleep heart rate fitbit device watch tracker hr basis measure wearable monitor wear hrv activity

