## Latent Dirichlet Allocation (LDA)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords

from sklearn.decomposition import LatentDirichletAllocation as LDA

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Fit an LDA topic model on whitespace-tokenised documents and print the
# highest-weighted words of every topic.
#
# Reads `data.json` from the working directory; the code below requires it to
# load into a frame with a 'text' column whose values are strings.

N_TOPICS = 5       # number of LDA components; also drives the report loop
N_TOP_WORDS = 10   # how many words to print per topic

# import data.json
data = pd.read_json('data.json')

# Pull the documents out positionally. Label-based `data['text'][i]` breaks
# when the index built by read_json is not 0..n-1.
documents = data['text'].tolist()

# Tokenise every document exactly once (plain whitespace split).
tokenized = [document.split() for document in documents]

# NLTK's English stop-word list is lowercase, so compare on the lowercased
# token; otherwise capitalised stop words ("The", "And") slip through.
stop_words = set(stopwords.words('english'))

# create a list of all the non-stop words in the data (duplicates kept)
words = []
for tokens in tokenized:
    words.extend(token for token in tokens if token.lower() not in stop_words)

# Sorted so the column order is the same on every run; a bare set's order
# depends on string hash randomisation.
unique_words = sorted(set(words))

# Map each vocabulary word to its column so the matrix is filled in a single
# pass per document instead of one `list.count` scan per (document, word).
column_of = {word: column for column, word in enumerate(unique_words)}

# document-term matrix: rows are documents, columns are vocabulary words
word_matrix = np.zeros((len(documents), len(unique_words)))
for row, tokens in enumerate(tokenized):
    for token in tokens:
        column = column_of.get(token)
        if column is not None:  # stop words have no column
            word_matrix[row, column] += 1

# create a dataframe from the matrix
word_df = pd.DataFrame(word_matrix, columns=unique_words)

# create an LDA model; the seed is fixed so the topics are reproducible
lda = LDA(n_components=N_TOPICS, random_state=0)

# fit the model to the data
lda.fit(word_df)

# create a dataframe of the topics (one row per topic, one column per word)
topic_df = pd.DataFrame(lda.components_, columns=unique_words)

# print the top words for each topic
for topic_index, topic_weights in enumerate(topic_df.to_numpy()):
    top_words = pd.Series(topic_weights, index=unique_words)
    print('Topic', topic_index)
    print(top_words.sort_values(ascending=False).head(N_TOP_WORDS))
    print()


