In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import pyLDAvis.sklearn
import tqdm

In [3]:
# read the csv file and extract the title column as a Dask DataFrame
df = pd.read_csv("Heart_disease.csv")
titles = df['Title']
# make into a list
titles = titles.tolist()

In [4]:
# define a function to preprocess the titles
def preprocess(title):
    # remove any non-alphabetical characters
    title = re.sub('[^a-zA-Z]', ' ', title)
    # convert all characters to lowercase
    title = title.lower()
    # remove stopwords
    title = ' '.join([word for word in title.split() if word not in stopwords.words('english')])
    # replace multiple spaces with a single space
    title = re.sub(' +', ' ', title)
    return title

In [5]:
# preprocess the titles
processed_titles = [preprocess(title) for title in tqdm.tqdm(titles)]

100%|██████████| 86904/86904 [02:15<00:00, 640.91it/s]


In [6]:
# create a CountVectorizer object
vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
matrix = vectorizer.fit_transform(processed_titles)

In [18]:
lda = LDA(n_components=16, max_iter=15, learning_method='batch', random_state=42)
lda.fit(matrix)

In [8]:
for idx, topic in enumerate(lda.components_):
    print(f"Top 5 words in Topic #{idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]])
    print('')

Top 5 words in Topic #0:
['ii', 'lung', 'angiotensin', 'ca', 'role']

Top 5 words in Topic #1:
['carotid', 'association', 'study', 'adults', 'stress']

Top 5 words in Topic #2:
['disease', 'atherosclerosis', 'cardiovascular', 'risk', 'study']

Top 5 words in Topic #3:
['renal', 'novel', 'arterial', 'hypertension', 'pulmonary']

Top 5 words in Topic #4:
['acute', 'infarction', 'left', 'ventricular', 'myocardial']

Top 5 words in Topic #5:
['therapy', 'vivo', 'women', 'nitric', 'oxide']

Top 5 words in Topic #6:
['development', 'tissue', 'response', 'activation', 'cardiac']

Top 5 words in Topic #7:
['remodeling', 'cell', 'vascular', 'receptor', 'endothelial']

Top 5 words in Topic #8:
['kinase', 'dependent', 'factor', 'blood', 'protein']

Top 5 words in Topic #9:
['low', 'high', 'increased', 'rats', 'associated']

Top 5 words in Topic #10:
['ischemia', 'cells', 'smooth', 'effects', 'muscle']

Top 5 words in Topic #11:
['specific', 'rat', 'human', 'model', 'cells']

Top 5 words in Topic 

In [19]:
for idx, topic in enumerate(lda.components_):
    print(f"Top 5 words in Topic #{idx}:")
    print([vectorizer.get_feature_names_out()[i] for i in topic.argsort()[-5:]])
    print('')

Top 5 words in Topic #0:
['matrix', 'resonance', 'magnetic', 'cardiomyopathy', 'cardiac']

Top 5 words in Topic #1:
['gene', 'receptor', 'expression', 'cardiac', 'protein']

Top 5 words in Topic #2:
['study', 'diabetes', 'disease', 'risk', 'cardiovascular']

Top 5 words in Topic #3:
['aortic', 'imaging', 'using', 'left', 'ventricular']

Top 5 words in Topic #4:
['multi', 'association', 'genetic', 'study', 'atherosclerosis']

Top 5 words in Topic #5:
['tissue', 'stem', 'human', 'cells', 'cell']

Top 5 words in Topic #6:
['ii', 'induced', 'angiotensin', 'stress', 'mice']

Top 5 words in Topic #7:
['arteries', 'artery', 'arterial', 'hypertension', 'pulmonary']

Top 5 words in Topic #8:
['ischemic', 'myocardial', 'lung', 'ischemia', 'injury']

Top 5 words in Topic #9:
['nitric', 'oxide', 'growth', 'factor', 'endothelial']

Top 5 words in Topic #10:
['patients', 'coronary', 'disease', 'failure', 'heart']

Top 5 words in Topic #11:
['rat', 'mouse', 'ca', 'model', 'effects']

Top 5 words in T

In [20]:
# create a table with each document and its top word (i.e. the topic with the highest probability
table = pd.DataFrame()
table['title'] = titles
table['topic'] = lda.transform(matrix).argmax(axis=1)
# go through the topic list and replace the topic number with the top word in that topic
for i in range(len(table['topic'])):
    table['topic'][i] = vectorizer.get_feature_names_out()[lda.components_[table['topic'][i]].argmax()]








A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table['topic'][i] = vectorizer.get_feature_names_out()[lda.components_[table['topic'][i]].argmax()]
