In [None]:
# Topic modeling uses statistical models to discover the topics present in a text or group of texts.
# We're going to use the LDA (Latent Dirichlet Allocation) approach for topic modeling in this module.
# But first, we're going to explore some visualization techniques we can apply to our text.

In [None]:
# First, we'll load our CSV, look at the data and see what information we can glean from it,
# then summarize the parts of speech recorded in the CSV we created earlier.

In [None]:
import pandas
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt

# Load the CSV created earlier into a DataFrame
document = pandas.read_csv(filepath_or_buffer='text_data.csv')

# Sanity check: show the first five rows to confirm the file parsed as expected
print(document.head(n=5))

# Bucket the rows by the value of their part-of-speech ('pos') column
pos_group = document.groupby(by='pos')

# Report summary statistics for every column within each part-of-speech group
print(pos_group.describe(include='all'))

In [None]:
# Now that we've learned how to manipulate the data a bit, we'll create a wordcloud with our data

In [None]:
# Build one comma-separated string from the 'text' column for WordCloud to tokenize.
# pandas.read_csv turns blank cells (and tokens such as "NA" or "null") into NaN floats,
# which str.join rejects with a TypeError, so drop missing values and coerce the rest to str.
text_tokens = document['text'].dropna().astype(str)
long_string = ','.join(text_tokens)

# Configure the word cloud (800x800 pixels, white background, smallest font size 10)
# and generate it from the joined text
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      min_font_size=10).generate(long_string)

# Draw the word cloud as an image on a square figure, without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

In [None]:
# Finally, we'll graph the 10 most common words in the text

In [None]:
from collections import Counter
import numpy as np

# Use the non-missing entries of the 'text' column as the word list.
# read_csv represents empty cells as NaN, which would otherwise be counted as a "word".
word_list = document['text'].dropna().astype(str).tolist()

# Counter.most_common returns (word, count) pairs ordered from most to least frequent,
# so the pairs can be plotted in the order given without re-sorting.
top_words = Counter(word_list).most_common(10)
if not top_words:
    raise ValueError("the 'text' column contains no words to plot")
counts = dict(top_words)

# Split the ordered pairs into parallel arrays of bar labels and bar heights
labels, values = zip(*top_words)
labels = np.array(labels)
values = np.array(values)

# One x position per bar
indexes = np.arange(len(labels))

# plt.bar centres each bar on its x position by default, so the tick labels belong at
# those same positions; offsetting them by a bar width would push them off-centre.
plt.bar(indexes, values)
plt.xticks(indexes, labels)
plt.ylabel('Occurrences')
plt.title('10 most common words')
plt.show()

In [None]:
# Now we'll use the LDA model in gensim to conduct our topic modeling.
# You can read more about topic modeling in Python at this link:
# https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
# The article above uses SciKit Learn rather than gensim, and also provides code on how to
# perform NMF topic modeling on a text, which might be used and compared to our LDA results.

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel, LdaMulticore
import logging
# Log line layout: timestamp, severity level, then the message text
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
# Set the root logger's threshold to INFO so informational messages are emitted
logging.getLogger().setLevel(logging.INFO)

# Build a gensim Dictionary and a bag-of-words corpus from plain-text files,
# treating each line of text as one document.

# Files to read; every path listed here is loaded, in order.
our_files = ['cleaned_text.txt']

content = []
for path in our_files:
    with open(path) as f:
        # Strip surrounding whitespace (including the trailing newline) from each line
        content.extend(line.strip() for line in f)

# Tokenize on whitespace: one list of words per line
words = [line.split() for line in content]

# Create the dictionary from the tokenized lines, then convert each line
# to its bag-of-words form with doc2bow
dictionary = corpora.Dictionary(words)
corpus = [dictionary.doc2bow(line) for line in words]

# Get information about the dictionary
print(dictionary)

# The triple-quoted string below is a bare expression statement, so none of the code
# inside it is executed. It is kept as a reference for saving the dictionary and corpus
# to disk and loading them back.
'''
# Save the Dict and Corpus
dictionary.save('my_dict.dict')  # save dict to disk
corpora.MmCorpus.serialize('my_corpus.mm', corpus)  # save corpus to disk

# Load them back
loaded_dict = corpora.Dictionary.load('my_dict.dict')

corpus = corpora.MmCorpus('my_corpus.mm')
'''

# Train the LDA topic model on the bag-of-words corpus built above
lda_model = LdaMulticore(
    # Input data and vocabulary
    corpus=corpus,
    id2word=dictionary,
    # Number of topics and prior settings
    num_topics=5,
    alpha='asymmetric',
    eta=None,
    # Training schedule
    passes=10,
    iterations=100,
    chunksize=1000,
    batch=False,
    decay=0.5,
    offset=64,
    gamma_threshold=0.001,
    eval_every=0,
    # Fixed seed and output options
    random_state=100,
    per_word_topics=True,
)

# Persist the trained model to disk
lda_model.save('lda_model.model')

# Show the topics (-1 requests all of them)
lda_model.print_topics(-1)

# Retrieve the model's top topics for this corpus; each entry is a 2-tuple that is
# formatted below as two space-separated fields.
topics = lda_model.top_topics(corpus)

# Append one line per topic to the output file. The block is terminated with a newline
# because the file is opened in append mode: without it, the first topic of a later run
# would be glued onto the last line written by this run.
with open("lda_output.txt", "a") as lda_txt:
    lda_txt.write('\n'.join('%s %s' % topic for topic in topics) + '\n')