In [1]:
# Load the essays corpus and preview its text column.
import pandas as pd

dataset = pd.read_csv(
    '/kaggle/input/essayscsv/essays.csv',
    encoding='ISO 8859-1',
)
dataset.loc[:, "TEXT"]

0       Well, right now I just woke up from a mid-day ...
1       Well, here we go with the stream of consciousn...
2       An open keyboard and buttons to push. The thin...
3       I can't believe it!  It's really happening!  M...
4       Well, here I go with the good old stream of co...
                              ...                        
2462         I'm home. wanted to go to bed but remembe...
2463         Stream of consiousnesssskdj. How do you s...
2464    It is Wednesday, December 8th and a lot has be...
2465    Man this week has been hellish. Anyways, now i...
2466    I have just gotten off the phone with brady. I...
Name: TEXT, Length: 2467, dtype: object

In [2]:
# Build a bag-of-words vocabulary over the essays and count each term per document.
from sklearn.feature_extraction.text import CountVectorizer

# Skip English stop words, terms found in more than 80% of the documents, and
# terms found in fewer than 2 documents.
count_vect = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.8,
)
# Cast the text column to a unicode array before vectorizing.
essay_texts = dataset['TEXT'].values.astype('U')
doc_term_matrix = count_vect.fit_transform(essay_texts)
doc_term_matrix



<2467x14098 sparse matrix of type '<class 'numpy.int64'>'
	with 397484 stored elements in Compressed Sparse Row format>

In [3]:
# Fit a 5-topic LDA model on the document-term matrix; each topic is learned as
# a weighting over every word in the vocabulary.
from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(
    n_components=5,
    random_state=42,  # fixed seed so the fit is repeatable
).fit(doc_term_matrix)
LDA

In [4]:
# Print the 10 highest-weighted words for each of the five topics.
# The vocabulary is fetched once up front: get_feature_names_out() rebuilds the
# full array of vocabulary terms on every call, so calling it inside the
# comprehension repeated that work for every single word printed.
feature_names = count_vect.get_feature_names_out()
for topic_idx, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{topic_idx}:')
    # argsort() is ascending, so the last 10 indices are the 10 largest
    # weights, listed from lowest to highest.
    print([feature_names[word_idx] for word_idx in topic.argsort()[-10:]])
    print('\n')

Top 10 words for topic #0:
['right', 'friends', 'college', 'school', 'things', 'going', 'people', 'life', 'want', 'feel']


Top 10 words for topic #1:
['love', 'miss', 'home', 'good', 'friends', 'school', 'feel', 'people', 'going', 'want']


Top 10 words for topic #2:
['writing', 'thoughts', 'god', 'feel', 'things', 'world', 'mind', 'life', 'way', 'people']


Top 10 words for topic #3:
['write', 'guess', 'hope', 'good', 'right', 'class', 'need', 'wonder', 'minutes', 'going']


Top 10 words for topic #4:
['maybe', 'going', 'love', 'wish', 'good', 'man', 'oh', 'wonder', 'want', 'need']




In [5]:
# Infer the topic distribution of every document: one row per document, one
# column per topic.
topic_values = LDA.transform(doc_term_matrix)
# A bare `topic_values.shape` in the middle of a cell displays nothing (only the
# last expression of a cell is rendered), so the shape is checked explicitly.
assert topic_values.shape == (len(dataset), LDA.n_components), topic_values.shape

# Label each document with its most probable topic, stored in a new TOPIC
# column on the original data frame.
dataset['TOPIC'] = topic_values.argmax(axis=1)
dataset.head()


Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,TOPIC
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y,0
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n,3
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y,3
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n,3
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y,0
