Preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Mount Google Drive so the training CSV stored there can be read (Colab-only import)
from google.colab import drive
drive.mount('/content/drive')

# Training corpus; later cells read its 'ABSTRACT' and 'TITLE' columns
doc_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/My Folder/topic_model_train_new.csv')

import nltk
from nltk.tokenize import word_tokenize
import re

# Download and load NLTK's English stopword list
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): stop_words is not referenced again in the visible notebook;
# all_stopwords is built directly from stopwords.words('english') further down.
stop_words = set(stopwords.words('english'))

# Download the punkt_tab tokenizer data
# (presumably the resource word_tokenize relies on -- confirm for the installed NLTK version)
import string # string.punctuation is used to drop punctuation tokens
nltk.download('punkt_tab')

# Load spaCy for lemmatization and POS tags; the NER and parser components are disabled
import spacy
nlp = spacy.load("en_core_web_sm", disable = ['ner','parser'])

# Generic academic vocabulary that dominated the pyLDAvis visualization;
# treated as extra stopwords (entries are listed once, alphabetically).
custom_stopwords = {
    "analysis", "approach", "author", "description", "differ", "different",
    "find", "gross", "information", "learn", "method", "model", "new",
    "paper", "present", "problem", "process", "propose", "research",
    "result", "set", "show", "study", "use", "work",
}

# Final stopword set = NLTK English stopwords + the custom terms above
all_stopwords = set(stopwords.words('english')) | custom_stopwords


def my_preprocessing(text):
  """Clean one abstract and return its content lemmas as a space-separated string.

  Pipeline: flatten line breaks -> lower-case -> strip inline ``$...$`` math and
  digits -> tokenize -> drop punctuation and stopwords -> lemmatize with spaCy,
  keeping only nouns, adjectives, verbs and adverbs longer than two characters
  whose lemma is not itself a stopword.

  Args:
    text: raw abstract. Anything that is not a ``str`` (e.g. NaN/None for a
      missing abstract) is treated as an empty document.

  Returns:
    str: the kept lemmas joined by single spaces ('' when nothing survives).
  """
  # Missing abstracts arrive as NaN/None; return an empty document rather than
  # failing on .replace() in the middle of vectorizing the corpus.
  if not isinstance(text, str):
    return ''
  #step 0: remove the extra line separators
  text_one_line = text.replace('\n', ' ')
  #step 1: lower case
  text_lower = text_one_line.lower()
  #step 2: remove math expressions and numbers
  text_wo_math = re.sub(r'\$.*?\$', '', text_lower)
  text_wo_numbers = re.sub(r'\d+', '', text_wo_math)
  #step 3: remove stopwords and punctuations
  tokens = word_tokenize(text_wo_numbers)
  tokens_processed = [
    token for token in tokens
    if (token not in string.punctuation) and (token not in all_stopwords)
  ]
  #step 4: lemmatization using SpaCy and also remove short words
  allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
  doc = nlp(' '.join(tokens_processed))
  # The custom stopwords are lemma forms ("propose", "model", ...), so inflected
  # tokens such as "proposed" or "models" pass step 3 untouched. Checking the
  # lemma here is what actually keeps those words out of the output.
  token_lemma = [
    token.lemma_ for token in doc
    if (token.pos_ in allowed_postags)
    and (len(token) > 2)
    and (token.lemma_ not in all_stopwords)
  ]
  return ' '.join(token_lemma)

In [None]:
sample_corpus = doc_df['ABSTRACT'].sample(5)

# Eyeball a few random abstracts next to their preprocessed form
for text in sample_corpus:
  print(text, my_preprocessing(text), '\n', sep='\n')

LDA Model

In [None]:
!pip install numpy --upgrade
!pip install gensim --upgrade --force-reinstall

# Use TF-IDF vectorizer to turn abstracts into vectors
from sklearn.feature_extraction.text import TfidfVectorizer
my_vectorizer = TfidfVectorizer(preprocessor=my_preprocessing, max_features = 5000)
abstract_vectorized = my_vectorizer.fit_transform(doc_df['ABSTRACT'])

from sklearn.decomposition import LatentDirichletAllocation
from gensim.models import CoherenceModel
from gensim import corpora


def get_sklearn_topics(model, feature_names, n_top_words=10):
    """Return, for each topic of a fitted scikit-learn LDA model, its top words.

    Args:
        model: fitted estimator exposing ``components_`` (one weight row per topic).
        feature_names: vocabulary, indexable by column position of ``components_``.
        n_top_words: how many of the highest-weighted words to keep per topic.

    Returns:
        list[list]: one list of words per topic, highest weight first.
    """
    topics = []
    for topic in model.components_:
        top_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        topics.append(top_words)
    return topics


# Everything in this section depends only on the corpus, not on the number of
# topics, so it is computed once instead of on every iteration of the sweep
# (re-running the spaCy preprocessing over the corpus is the slow part).
feature_names = my_vectorizer.get_feature_names_out()
doc_df['Processed'] = doc_df['ABSTRACT'].apply(lambda x: my_preprocessing(x).split())  # list of lemmas per abstract
dictionary = corpora.Dictionary(doc_df['Processed'])
corpus = [dictionary.doc2bow(text) for text in doc_df['Processed']]
processed_texts = doc_df['Processed'].tolist()  # plain list of token lists for gensim

coherence_scores = {}

# Try different numbers of topics and record the c_v coherence of each model
for num_topics in [5, 7, 9, 11]:
    print(f"Training LDA with {num_topics} topics...")

    lda_model = LatentDirichletAllocation(n_components=num_topics, # Number of topics
                                    doc_topic_prior = None, # None -> scikit-learn default of 1 / n_components
                                    topic_word_prior = None, # None -> scikit-learn default of 1 / n_components
                                    learning_method='online',  # Update the model per mini-batch ("online") rather than per full pass ("batch")
                                    random_state= 42,
                                    max_iter=10) # Number of passes over the corpus

    lda_top = lda_model.fit_transform(abstract_vectorized)

    # Top words of each topic, scored against the tokenized corpus
    sklearn_topics = get_sklearn_topics(lda_model, feature_names)

    coherence_model_sklearn = CoherenceModel(topics=sklearn_topics, texts=processed_texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_sklearn.get_coherence()
    coherence_scores[num_topics] = coherence_score

    print(f"Coherence Score for {num_topics} topics: {coherence_score}")

print("Final Coherence Scores:")
for k, v in coherence_scores.items():
    print(f"{k} Topics: {v}")


In [None]:
# Refit LDA on the TF-IDF matrix with the topic count chosen from the coherence sweep
best_num_topics = 5  # Since 5 had the highest coherence score

lda_model_best = LatentDirichletAllocation(
    n_components=best_num_topics,
    max_iter=10,
    random_state=42,
    learning_method='online',
)

# Document-topic distribution for every training abstract
lda_top_best = lda_model_best.fit_transform(abstract_vectorized)

In [None]:
def get_sklearn_topics(model, feature_names, n_top_words=10):
    """Return the highest-weighted words of every topic in a fitted model.

    Args:
        model: object exposing ``components_`` (one weight row per topic).
        feature_names: vocabulary, indexable by column position of ``components_``.
        n_top_words: number of words to keep per topic.

    Returns:
        list[list]: one list per topic, words ordered from highest weight down.
    """
    return [
        [feature_names[idx] for idx in word_weights.argsort()[::-1][:n_top_words]]
        for word_weights in model.components_
    ]

# Vocabulary of the TF-IDF vectorizer and the top words of each topic in the chosen model
feature_names = my_vectorizer.get_feature_names_out()
sklearn_topics = get_sklearn_topics(model=lda_model_best, feature_names=feature_names)

# Perplexity of the chosen model on the training matrix
perplexity_score = lda_model_best.perplexity(abstract_vectorized)
print(f"Scikit-Learn LDA Perplexity Score: {perplexity_score}")

In [None]:
# Show the 10 highest-weighted vocabulary terms of every topic
vocab = my_vectorizer.get_feature_names_out()
for i, comp in enumerate(lda_model_best.components_):
    # Pair each term with its weight and keep the ten heaviest (stable for ties)
    sorted_terms = sorted(zip(vocab, comp), key=lambda pair: pair[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    print("".join(f"{term} " for term, weight in sorted_terms) + "\n")


In [None]:
!pip install pyLDAvis

import pyLDAvis
import pyLDAvis.lda_model as sklearnvis

# Switch pyLDAvis to notebook rendering before anything is displayed
pyLDAvis.enable_notebook()
# Build the visualization data from the chosen LDA model, the TF-IDF matrix it was
# fitted on, and the vectorizer that produced that matrix
vis_data = sklearnvis.prepare(lda_model_best, abstract_vectorized, my_vectorizer)
# Render the interactive topic view as the cell output
pyLDAvis.display(vis_data)

Testing of Embedding + Clustering Techniques

In [None]:
# Sample df
# A fixed seed keeps the embedding/clustering comparison below on the same abstracts
# across runs; min() avoids a ValueError when the corpus has fewer than 2000 rows.
sample_df = doc_df.sample(min(2000, len(doc_df)), ignore_index = True, random_state=42)
sample_docs = sample_df['ABSTRACT'].tolist()

docs = sample_docs

# Model df: every abstract, used when building the final model
model_docs = doc_df['ABSTRACT'].tolist()

Doc2Vec

In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

# Tokenize every sampled abstract once, with the same preprocessing for training and
# inference. The model was previously trained on raw lower-cased tokens but asked to
# infer vectors from preprocessed lemmas, so inference saw a different vocabulary
# from the one the model had learned.
doc_tokens = [word_tokenize(my_preprocessing(doc)) for doc in docs]

tagged_docs = [
  TaggedDocument(words=tokens, tags=[str(i)])
  for i, tokens in enumerate(doc_tokens)
]

# train the Doc2vec model
model = Doc2Vec(vector_size=20,
                min_count=2,
                epochs=50)
model.build_vocab(tagged_docs)
model.train(tagged_docs,
            total_examples=model.corpus_count,
            epochs=model.epochs)

# get the document vectors
document_vectors = [model.infer_vector(tokens) for tokens in doc_tokens]

In [None]:
import scipy.cluster.hierarchy as sc

# Ward-linkage hierarchy over the Doc2Vec vectors, drawn as a dendrogram
doc2vec_linkage = sc.linkage(document_vectors, method='ward')
sc.dendrogram(doc2vec_linkage)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering

# Silhouette score of Ward agglomerative clustering on the Doc2Vec vectors for k = 5..9
for k in range(5, 10):
    my_clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)
    cluster_labels = my_clustering.fit_predict(document_vectors)
    score = silhouette_score(X=document_vectors, labels=cluster_labels)
    print(f"Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
from sklearn.cluster import KMeans

# Silhouette score of K-Means on the Doc2Vec vectors for k = 5..9
for k in range(5, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels_kmeans = kmeans.fit_predict(document_vectors)
    score = silhouette_score(X=document_vectors, labels=cluster_labels_kmeans)
    print(f"K-Means Clusters: {k}, Silhouette Score: {score:.4f}")

SBERT

In [None]:
!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer

# Pretrained sentence-transformer checkpoint used to embed the sampled abstracts
sbert_checkpoint = "all-MiniLM-L6-v2"
sbert_model = SentenceTransformer(sbert_checkpoint)

# One embedding per raw (unpreprocessed) abstract in the sample
sbert_embeddings = sbert_model.encode(docs)

In [None]:
import scipy.cluster.hierarchy as sc

# Ward-linkage hierarchy over the SBERT embeddings, drawn as a dendrogram
sbert_linkage = sc.linkage(sbert_embeddings, method='ward')
sc.dendrogram(sbert_linkage)
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of Ward agglomerative clustering on the SBERT embeddings for k = 5..9
for k in range(5, 10):
    my_clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)
    cluster_labels = my_clustering.fit_predict(sbert_embeddings)
    score = silhouette_score(X=sbert_embeddings, labels=cluster_labels)
    print(f"Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
# Silhouette score of K-Means on the SBERT embeddings for k = 5..9
for k in range(5, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels_kmeans = kmeans.fit_predict(sbert_embeddings)
    score = silhouette_score(X=sbert_embeddings, labels=cluster_labels_kmeans)
    print(f"K-Means Clusters: {k}, Silhouette Score: {score:.4f}")

Universal Sentence Embedding

In [None]:
import tensorflow_hub as hub

# Load the Universal Sentence Encoder from TF Hub and embed the sampled abstracts
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
use_embeddings = use_model(docs)

# Ward-linkage hierarchy over the USE embeddings, drawn as a dendrogram
use_linkage = sc.linkage(use_embeddings, method='ward')
sc.dendrogram(use_linkage)
plt.show()

In [None]:
# Silhouette score of Ward agglomerative clustering on the USE embeddings for k = 5..9
for k in range(5, 10):
    my_clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)
    cluster_labels = my_clustering.fit_predict(use_embeddings)
    score = silhouette_score(X=use_embeddings, labels=cluster_labels)
    print(f"Clusters: {k}, Silhouette Score: {score:.4f}")

In [None]:
# Silhouette score of K-Means on the USE embeddings for k = 5..9
for k in range(5, 10):
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels_kmeans = kmeans.fit_predict(use_embeddings)
    score = silhouette_score(X=use_embeddings, labels=cluster_labels_kmeans)
    print(f"K-Means Clusters: {k}, Silhouette Score: {score:.4f}")

Model Build of Doc2Vec + K Means

In [None]:
# Preprocess the whole corpus once and reuse the tokens for both training and
# inference; running the spaCy-based preprocessing a second time doubled the
# cost of this cell without changing the tokens.
model_tokens = [my_preprocessing(doc).split() for doc in doc_df['ABSTRACT']]

model_tagged_docs = [
    TaggedDocument(words=tokens, tags=[str(i)]) for i, tokens in enumerate(model_tokens)
]

doc_model = Doc2Vec(vector_size=20, min_count=2, epochs=50, workers=4)

doc_model.build_vocab(model_tagged_docs)

doc_model.train(model_tagged_docs, total_examples=doc_model.corpus_count, epochs=doc_model.epochs)

# Vectors are inferred (rather than read from the trained tags) so that training
# documents are embedded the same way unseen test documents will be later.
model_document_vectors = np.array([doc_model.infer_vector(tokens) for tokens in model_tokens])


In [None]:
# Fit the final K-Means model (6 clusters) on the Doc2Vec vectors of the full corpus
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
kmeans.fit(model_document_vectors)
cluster_labels_kmeans = kmeans.labels_

# Clustering quality on the training vectors
silhouette_kmeans_doc2vec = silhouette_score(model_document_vectors, cluster_labels_kmeans)
print(f"Final K-Means Doc2Vec Silhouette Score: {silhouette_kmeans_doc2vec:.4f}")

# Attach each abstract's cluster id to the training frame
doc_df['Cluster'] = cluster_labels_kmeans

# Show some results
print(doc_df[['TITLE', 'Cluster']].head())

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

model_docs = doc_df['ABSTRACT'].tolist()

# Separate TF-IDF model (raw abstracts, built-in English stopwords, 1000 terms)
# used only to describe the clusters with words
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(model_docs)

# Vocabulary in column order of tfidf_matrix
feature_names = vectorizer.get_feature_names_out()

# Function to get top words for each cluster
def get_top_words_per_cluster(tfidf_matrix, cluster_labels, feature_names, n_words=10):
    """Describe each cluster by the words with the highest mean TF-IDF weight.

    Args:
        tfidf_matrix: document-term matrix, one row per document.
        cluster_labels: cluster id of every row of ``tfidf_matrix``.
        feature_names: vocabulary, indexable by column position.
        n_words: number of words to return per cluster.

    Returns:
        list[list]: one word list per distinct label (labels in sorted order),
        each ordered from highest mean weight down.
    """
    ranked_words = []

    for label in np.unique(cluster_labels):
        # Rows of the documents that belong to this cluster
        member_rows = np.flatnonzero(cluster_labels == label)

        # Mean TF-IDF weight of every term over those documents, as a flat array
        mean_weights = np.asarray(tfidf_matrix[member_rows].mean(axis=0)).ravel()

        # Last n_words positions of the ascending sort, flipped to descending
        best_columns = np.argsort(mean_weights)[-n_words:][::-1]
        ranked_words.append([feature_names[col] for col in best_columns])

    return ranked_words

# Top TF-IDF words of every K-Means cluster
top_words_per_cluster = get_top_words_per_cluster(
    tfidf_matrix=tfidf_matrix,
    cluster_labels=cluster_labels_kmeans,
    feature_names=feature_names,
)

# One line per cluster, numbered by position in the returned list
for i, words in enumerate(top_words_per_cluster):
    print(f"Cluster {i}: {', '.join(words)}")

In [None]:
import matplotlib.pyplot as plt

# Number of training abstracts assigned to each cluster
cluster_counts = doc_df['Cluster'].value_counts()

# Bar chart of cluster sizes, ordered by cluster id
fig, ax = plt.subplots(figsize=(8,5))
cluster_counts.sort_index().plot(kind='bar', color='steelblue', edgecolor='black', ax=ax)
ax.set_title("Number of Articles Per Cluster")
ax.set_xlabel("Cluster")
ax.set_ylabel("Number of Articles")
plt.xticks(rotation=0)
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

Exploration of Test Dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


doc_df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/My Folder/topic_model_test_new.csv')

# Articles per subject: column totals from position 3 onwards
# (assumes the first three columns are not subject indicators -- confirm against the CSV)
subject_counts = doc_df_test.iloc[:, 3:].sum()
print(subject_counts)

# Number of subjects tagged on each article (row totals over the same columns)
doc_df_test['num_subjects'] = doc_df_test.iloc[:, 3:].sum(axis=1)

# Articles with exactly one subject vs. more than one
single_topic_count = (doc_df_test['num_subjects'] == 1).sum()
multi_topic_count = (doc_df_test['num_subjects'] > 1).sum()

# Add the multi-subject total as an extra entry for the per-subject chart
subject_counts["Multiple Subjects"] = multi_topic_count

# Two-row frame for the single vs. multiple comparison
topic_distribution = pd.DataFrame(
    {'Count': [single_topic_count, multi_topic_count]},
    index=['Single Subject', 'Multiple Subjects'],
)

# Plot
topic_distribution.plot(kind='bar', color=['lightcoral', 'steelblue'], edgecolor='black', figsize=(8,5))
plt.title("Distribution of Articles by Single vs. Multiple Subject Assignment")
plt.xlabel("Category")
plt.ylabel("Number of Articles")
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

# Bar chart of articles per subject (plus the multi-subject total), smallest first
fig, ax = plt.subplots(figsize=(10,6))
subject_counts.sort_values().plot(kind='bar', color='skyblue', edgecolor='black', ax=ax)

ax.set_title("Number of Articles in Each Subject (Including Multi-Topic Articles)")
ax.set_xlabel("Subjects")
ax.set_ylabel("Number of Articles")
plt.xticks(rotation=45)
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Show plot
plt.show()

# Subject indicator columns only: position 3 up to, but not including, the last
# column (num_subjects, added earlier in this cell)
subject_columns = doc_df_test.iloc[:, 3:-1]

# Pairwise correlation between subject tags
correlation_matrix = subject_columns.corr()

# Annotated heatmap of the correlations
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)

plt.title("Correlation Between Subject Tags")
plt.xlabel("Subjects")
plt.ylabel("Subjects")

plt.show()

Testing LDA Model

In [None]:
# Vectorize the test abstracts with the vectorizer fitted on the training set
abstract_vectorized_test = my_vectorizer.transform(doc_df_test['ABSTRACT'])

# Document-topic distribution of every test abstract under the chosen LDA model
lda_top_test = lda_model_best.transform(abstract_vectorized_test)

# One 'topic_<i>' column per topic. The frame is given doc_df_test's own index so
# the column-wise concat below pairs each row with its article; the previous
# reset_index(drop=True) call discarded its result and therefore did nothing.
topic_df_test = pd.DataFrame(
    lda_top_test,
    columns=['topic_' + str(i) for i in range(lda_top_test.shape[1])],
    index=doc_df_test.index,
)

# Name of the topic column with the largest weight for each article
topic_df_test['topic_with_highest_score'] = topic_df_test.idxmax(axis=1)

doc_df_test_with_topics = pd.concat([doc_df_test, topic_df_test], axis=1)

doc_df_test_with_topics.info()
doc_df_test_with_topics.head()

In [None]:
# Ground-truth subject columns of the test set
topic_labels = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# For every LDA topic, count how many of its articles carry each real subject tag
for i in range(best_num_topics):
  topic_lda = 'topic_' + str(i)
  # Articles whose strongest topic is this one
  temp_df = doc_df_test_with_topics[doc_df_test_with_topics['topic_with_highest_score'] == topic_lda]
  for topic_label in topic_labels:
    print(topic_lda, '->', topic_label, temp_df[topic_label].sum())
  print('\n')

In [None]:
# Map every LDA topic to the real subject that is most frequent among its articles
topic_to_label = {}

for i in range(best_num_topics):
    topic_lda = 'topic_' + str(i)

    # Articles whose strongest topic is this one
    in_topic = doc_df_test_with_topics['topic_with_highest_score'] == topic_lda
    temp_df = doc_df_test_with_topics[in_topic]

    # Subject column with the largest total among those articles
    label_totals = temp_df[topic_labels].sum()
    most_common_label = label_totals.idxmax()

    topic_to_label[topic_lda] = most_common_label

# Display assigned topic labels
print("Assigned Topic Labels:", topic_to_label)

In [None]:
# Translate each article's strongest LDA topic into its assigned subject label
strongest_topic = doc_df_test_with_topics['topic_with_highest_score']
doc_df_test_with_topics['Predicted_Subject'] = strongest_topic.map(topic_to_label)

# Show some results
preview_columns = ['TITLE', 'topic_with_highest_score', 'Predicted_Subject']
print(doc_df_test_with_topics[preview_columns].head())

In [None]:
from sklearn.metrics import classification_report

# One true subject per article: the subject column holding the row maximum
# NOTE(review): for multi-subject articles this keeps a single subject only -- confirm this is intended
true_labels = doc_df_test_with_topics[topic_labels].idxmax(axis=1)

# Subject predicted through the LDA topic -> label mapping
predicted_labels = doc_df_test_with_topics['Predicted_Subject']

# Per-subject precision / recall / F1
lda_report = classification_report(true_labels, predicted_labels)
print(lda_report)

Testing Doc2Vec Model

In [None]:
from scipy.spatial.distance import cdist
from sklearn.metrics import classification_report


test_docs = doc_df_test['ABSTRACT'].tolist()

# Embed every test abstract with the trained Doc2Vec model (same preprocessing as training)
test_vectors = np.array([doc_model.infer_vector(my_preprocessing(doc).split()) for doc in test_docs])

# Distance from every test vector to every K-Means centroid; pick the nearest centroid
centroid_distances = cdist(test_vectors, kmeans.cluster_centers_)
closest_clusters = centroid_distances.argmin(axis=1)

# Store test cluster assignments
doc_df_test['Predicted_Cluster'] = closest_clusters

# Show results
print(doc_df_test[['TITLE', 'Predicted_Cluster']].head())


In [None]:
# Subject columns of the test set. Defined in this cell because the loop below needs
# them and no earlier cell defines topic_columns (a fresh Restart & Run All would
# otherwise stop here with a NameError).
topic_columns = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# cluster id -> {subject: number of test articles in that cluster tagged with it}
cluster_topic_counts = {}

# Loop through each cluster
for cluster in np.unique(closest_clusters):
    print(f"\nCluster {cluster}")  # Print cluster header

    # Subset only documents assigned to this cluster
    temp_df = doc_df_test[doc_df_test['Predicted_Cluster'] == cluster]

    # Record and print the count of every subject label within the cluster
    cluster_topic_counts[cluster] = {}
    for topic_label in topic_columns:
        topic_count = temp_df[topic_label].sum()
        cluster_topic_counts[cluster][topic_label] = topic_count
        print(f"Cluster {cluster} -> {topic_label}: {topic_count}")

In [None]:
topic_columns = ['Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']

# Majority subject of every cluster: the subject column with the largest total
# among the test articles assigned to that cluster
cluster_topic_mapping = {
    cluster: doc_df_test.loc[doc_df_test['Predicted_Cluster'] == cluster, topic_columns].sum().idxmax()
    for cluster in np.unique(closest_clusters)
}

# Assign mapped topics
doc_df_test['Mapped_Topic'] = doc_df_test['Predicted_Cluster'].map(cluster_topic_mapping)

# Show some results
preview_columns = ['TITLE', 'Predicted_Cluster', 'Mapped_Topic']
print(doc_df_test[preview_columns].head())

In [None]:
# One true subject per article: the subject column holding the row maximum
true_labels = doc_df_test[topic_columns].idxmax(axis=1)
# Subject predicted through the cluster -> majority-subject mapping
predicted_labels = doc_df_test['Mapped_Topic']

# Per-subject precision / recall / F1
doc2vec_report = classification_report(true_labels, predicted_labels)
print(doc2vec_report)