In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import requests
from bs4 import BeautifulSoup
from pymedtermino import *
from pymedtermino.all import *

In [2]:
# read the csv file and extract the title column
df = pd.read_csv("Heart_disease.csv")
titles = df['Title']
regex = re.compile('[^a-zA-Z]')
for index, title in enumerate(tqdm(titles)):
    # remove any non-alphabetical characters
    # title = regex.sub(' ', title)
    # convert all characters to lowercase
    title = title.lower()
    # remove stopwords
    title = ' '.join([word for word in title.split() if word not in stopwords.words('english')])
    # replace multiple spaces with a single space
    title = re.sub(' +', ' ', title)
    # add to the list
    titles[index] = title
data = list(titles)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles[index] = title
100%|██████████| 86904/86904 [02:26<00:00, 591.22it/s]


In [3]:
# Create a TfidfVectorizer object to transform the article titles into a feature matrix
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
X = vectorizer.fit_transform(data)

In [4]:
# Use KMeans clustering to group the article titles into clusters
kmeans = KMeans(n_clusters=16, random_state=0).fit(X)



# Print the top terms for each cluster
order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()
for i in tqdm(range(kmeans.n_clusters)):
    print(f"Cluster {i} top terms:")
    for j in order_centroids[i, :5]:
        print(f"    {terms[j]}")
    print()

100%|██████████| 16/16 [00:00<00:00, 31941.39it/s]

Cluster 0 top terms:
    cardiac
    cardiac arrest
    arrest
    hypertrophy
    cardiac myocytes

Cluster 1 top terms:
    nitric
    nitric oxide
    oxide
    oxide synthase
    synthase

Cluster 2 top terms:
    atrial
    atrial fibrillation
    fibrillation
    patients
    patients atrial

Cluster 3 top terms:
    heart
    induced
    mice
    receptor
    vascular

Cluster 4 top terms:
    endothelial
    endothelial cells
    cells
    endothelial cell
    cell

Cluster 5 top terms:
    heart failure
    failure
    heart
    patients
    ejection

Cluster 6 top terms:
    coronary
    disease
    coronary artery
    artery
    heart disease

Cluster 7 top terms:
    blood pressure
    pressure
    blood
    heart study
    study

Cluster 8 top terms:
    cardiovascular
    cardiovascular disease
    risk
    disease
    study

Cluster 9 top terms:
    myocardial infarction
    infarction
    myocardial
    acute myocardial
    acute

Cluster 10 top terms:
    associated
  




In [8]:
# Print the cluster assignments for each article title
pairings = {}
for i in tqdm(range(len(data))):
    label = kmeans.labels_[i]
    topic = terms[order_centroids[label, 0]]
    pairings[data[i]] = topic
df['Topic'] = df['Title'].map(pairings)

100%|██████████| 86904/86904 [00:00<00:00, 2118395.92it/s]


In [9]:
# save the dataframe to a csv file
df.to_csv('Heart_disease_with_topics.csv', index=False)