# Initialization and Testing File

### Initialization

In [None]:
# Import new libraries
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import re
import spacy
import string
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Mount Google Drive at /content/drive so data files can be read from and saved to it.
# NOTE(review): google.colab is presumably only available inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the example data file (a set of abstracts of academic articles) into doc_df.
# The path is an absolute location inside the mounted Google Drive; adjust it if the project folder moves.
doc_df = pd.read_csv('/content/drive/MyDrive/2024 Spring/Text Mining/Projects/Project2/topic_model_train_wo_label.csv')

### Testing Data File

In [None]:
# Add a column with the number of words per abstract, then preview the first 3 rows.
# str.split() with no separator splits on any run of whitespace (spaces, tabs,
# newlines), so repeated spaces and line breaks do not inflate or deflate the count.
# Missing abstracts are treated as empty text and therefore count as 0 words.
doc_df['ABSTRACT_word_count'] = (
    doc_df['ABSTRACT']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)
doc_df.head(3)

In [None]:
# Histogram of abstract lengths, measured in words
fig, ax = plt.subplots()
ax.hist(doc_df['ABSTRACT_word_count'])
ax.set_xlabel("Words per Abstract")
ax.set_ylabel("Abstracts")
plt.show()

# Making A Preprocessor (For LDA. May have to be changed for sentence embedding)

In [None]:
# Reading a few raw abstracts shows which patterns the preprocessor should strip out.

# Slice with .iloc[:12] instead to see the first 12 abstracts; .iloc[:1] shows only the first.
for position, abstract in enumerate(doc_df['ABSTRACT'].iloc[:1], start=1):
    print(f'Abstract {position}:')
    print(abstract, "\n" + "-"*80, "\n")

In [None]:
# Load the spaCy English pipeline once; my_preprocessor reuses it for every document.
spacy_lemma = spacy.load("en_core_web_sm")

# Lookup tables are built once here rather than inside my_preprocessor, which is
# called once per abstract; rebuilding them on every call is wasted work.
_STOP_WORDS = frozenset(stopwords.words('english')) | frozenset(stopwords.words('spanish'))
_WORD_TOKENIZER = RegexpTokenizer(r'\w+')
_KEPT_POS_TAGS = frozenset({'NOUN', 'ADJ', 'VERB', 'ADV'})
# Lemmas excluded from the output because they carry little meaning.
_COMMON_WORDS = frozenset({
    'approach',
    'consider',
    'define',
    'different',
    'feature',
    'first',
    'general',
    'however',
    'known',
    'method',
    'network',
    'number',
    'obtain',
    'present',
    'problem',
    'propose',
    'provide',
    'result',
})

def my_preprocessor(text):
  """
  Clean one document and return it as a space-separated string of lemmas.

  Parameters:
    text: (str) raw document text. None / NaN is treated as an empty document.

  Changes (in order):
    Converts text to lowercase
    Removed parentheses (and everything inside them, including across line breaks)
    Removed numbers
    Removed Markdown / LaTeX code (tokens starting with '\\' or '$', or containing '_')
    Removed punctuation (by tokenizing on word characters)
    Removed stop words (english and spanish)
    Lemmatizes with spaCy, keeping only nouns, adjectives, verbs and adverbs
    Drops lemmas of 5 characters or fewer and lemmas listed in _COMMON_WORDS

  Returns:
    (str) the surviving lemmas joined by single spaces ('' if nothing survives)
  """

  # Missing values (None, or float NaN coming from pandas) become an empty
  # document instead of raising on .lower(). NaN is the only value != itself.
  if text is None or (isinstance(text, float) and text != text):
    return ''

  # Makes text lowercase
  text_lower = str(text).lower()

  # Remove parentheses and anything inside them. [^)]* is used instead of .*?
  # because '.' does not match a newline, so a parenthetical wrapped across
  # lines would otherwise be left in the text.
  text_paren = re.sub(r'\([^)]*\)', '', text_lower)

  # Remove numbers
  text_num = re.sub(r'\d+', '', text_paren)

  # Remove markdown / LaTeX code (starts with '\' or '$' or contain underscores)
  text_clean = re.sub(r'[\$\\]\S+|\b\w*_\w*\b', '', text_num)

  # Split text into words (also gets rid of punctuation), then drop stop words
  tokens = _WORD_TOKENIZER.tokenize(text_clean)
  kept_tokens = [token for token in tokens if token not in _STOP_WORDS]

  # Re-join so spaCy can tag and lemmatize the remaining words
  text_spacy = spacy_lemma(' '.join(kept_tokens))

  # Lemmatization, restricted to the content-bearing parts of speech
  lemma_tokens = [token.lemma_ for token in text_spacy if token.pos_ in _KEPT_POS_TAGS]

  # Keep lemmas longer than 5 characters and get rid of common words with little meaning
  final_tokens = [
      lemma for lemma in lemma_tokens
      if len(lemma) > 5 and lemma not in _COMMON_WORDS
  ]

  # Make one string again
  return ' '.join(final_tokens)

In [None]:
# Test the preprocessor on the first abstract: the processed text is printed
# first, followed by the original for comparison.
# .iloc[0] is the first row (.iloc[1] would be the second abstract).
abstract = doc_df['ABSTRACT'].iloc[0]
print(my_preprocessor(abstract))
print(abstract)


# LDA Topic Modeling


### Initialization

In [None]:
# Import new libraries
!pip install pyLDAvis
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis.lda_model as sklearnvis
import seaborn as sns
from gensim.models import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

### LDA Function from SKLearn

##### Vectorize Text

In [None]:
# Use TF-IDF vectorizer to turn abstracts into vectors (may take 6+ minutes)
# my_preprocessor is passed as the vectorizer's preprocessing step; max_features caps the vocabulary at 7500 terms.
# NOTE(review): the LDA model fitted below is usually applied to raw term counts rather than TF-IDF weights — confirm TF-IDF is intended.
my_vectorizer = TfidfVectorizer(preprocessor=my_preprocessor, max_features = 7500)
abstract_vectorized = my_vectorizer.fit_transform(doc_df['ABSTRACT'])

In [None]:
# Define the LDA Model
# After running 10+ topics, using 4 seems realistic because it always separates the data

lda_model = LatentDirichletAllocation(n_components= 4, # Number of topics
                                    doc_topic_prior = None, # None -> defaults to 1/n_components
                                    topic_word_prior = None, # None -> defaults to 1/n_components
                                    learning_method='batch',  # 'batch' uses all the training data in each update; 'online' updates incrementally from mini-batches
                                    random_state= 10,
                                    max_iter=10) # Maximum number of passes over the training data (epochs)

In [None]:
# Fit LDA Model to TF-IDF Vectors (will take 1+ minutes)
# lda_top holds the fitted document-topic distribution: one row per abstract, one column per topic.
lda_top=lda_model.fit_transform(abstract_vectorized)

##### Explore Output

In [None]:
# Print out the top 10 word tokens in each topic
# The output of the LDA model can be accessed through either lda_top (topic weights
# per document) or lda_model.components_ (token weights per topic); the latter is used here.
vocab = my_vectorizer.get_feature_names_out()
for topic_number, weights in enumerate(lda_model.components_, start=1):
    # Pair every vocabulary term with its weight and keep the 10 heaviest
    ranked_terms = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)
    top_terms = [term for term, _ in ranked_terms[:10]]
    print("Topic "+str(topic_number)+": ")
    # Terms are space-separated, followed by a trailing space and a blank line
    print(*top_terms, end=" \n\n")

In [None]:
# Visualize the results using TSNE (May take 2+ minutes)
TSNE_model = TSNE(
    n_components=2,
    verbose=1,
    random_state=10,
    angle=.99,
    init='pca',
)
tsne_lda = TSNE_model.fit_transform(lda_top)

# Each abstract is coloured by its highest-weighted topic
TSNE_df = pd.DataFrame(tsne_lda, columns=['TSNE1', 'TSNE2']).assign(
    topic=lda_top.argmax(axis=1)
)
sns.scatterplot(data=TSNE_df, x="TSNE1", y="TSNE2", hue="topic")
plt.show()

In [None]:
# Interactive pyLDAvis view of the scikit-learn LDA model, built from the fitted model,
# the TF-IDF document-term matrix and the vectorizer, and rendered inline in the notebook.
pyLDAvis.enable_notebook()
vis_data = sklearnvis.prepare(lda_model, abstract_vectorized, my_vectorizer)
pyLDAvis.display(vis_data)

### LDA with Gensim

##### Vectorize Text

In [None]:
# Build the Gensim dictionary from the TF-IDF vectorizer's vocabulary (one
# single-word "document" per term) so both LDA implementations share the same terms.
id2word = corpora.Dictionary([[word] for word in my_vectorizer.get_feature_names_out()])

def gensim_prep(preprocessed_texts):
    """
    Converts preprocessed text to Gensim's corpus format.

    Parameters:
        preprocessed_texts: iterable of str, each already cleaned by
            my_preprocessor (space-separated lemmas).

    Returns:
        (id2word, corpus, preprocessed_texts, tokens) where
        id2word is the module-level Gensim dictionary,
        corpus is the bag-of-words form of each text (tokens missing from
        id2word are ignored by doc2bow),
        preprocessed_texts is the input, returned unchanged, and
        tokens is the whitespace-split token list for each text.
    """
    tokens = [text.split() for text in preprocessed_texts]
    corpus = [id2word.doc2bow(words) for words in tokens]
    return id2word, corpus, preprocessed_texts, tokens

# Run every abstract through my_preprocessor before tokenizing (may take 6+ minutes).
# Splitting the raw abstracts on whitespace would yield cased, punctuated,
# un-lemmatized tokens that mostly fail to match the lemma vocabulary in id2word.
preprocessed_texts = [my_preprocessor(text) for text in doc_df['ABSTRACT']]
id2word, corpus, doc_preprocessed, doc_tokens = gensim_prep(preprocessed_texts)

In [None]:
# Build Model (May take 2+ minutes)
Num_of_Topics = 4
lda_model_gensim = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=Num_of_Topics,
    random_state=10,  # same seed as the scikit-learn LDA model above
    update_every=1,  # online (iterative) learning; 0 would mean batch learning
    chunksize=250,  # documents per training chunk
    passes=30,  # passes through the corpus during training
    iterations = 40,  # max iterations when inferring the topic distribution
    alpha='auto',  # learn the document-topic prior from the corpus
    eta='auto',  # learn the topic-word prior from the corpus
    minimum_probability=0.0001,  # topics below this probability are filtered out
    per_word_topics=True  # also compute the most likely topics for each word
)

##### Explore Output

In [None]:
# Show each topic on its own line together with its 10 highest-weighted words
topics_gensim = lda_model_gensim.print_topics(num_words=10)
print(*topics_gensim, sep="\n")

In [None]:
# Compute coherence score
# Uses the 'c_v' coherence measure over the tokenized documents (doc_tokens) and the shared dictionary.
coherence_model_lda = CoherenceModel(model=lda_model_gensim, texts=doc_tokens, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
# The score is multiplied by 100 and printed with a '%' sign.
# NOTE(review): c_v coherence is a score rather than a true percentage — confirm this presentation is intended.
print('Coherence Score: ', coherence_lda*100, '%')

In [None]:
# Visualize LDA
# Interactive pyLDAvis view of the Gensim LDA model, built from the model, its corpus and its dictionary.
# vis_data is reassigned here, replacing the scikit-learn visualization data from the earlier cell.
pyLDAvis.enable_notebook()
vis_data = gensimvis.prepare(lda_model_gensim, corpus, id2word)
pyLDAvis.display(vis_data)

# Sentence Embedding + Clustering

### Initialization

In [None]:
# Import new libraries
!pip install -U sentence-transformers
import scipy.cluster.hierarchy as sch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler

### Doc2Vec, Hierarchical Clustering via SKLearn, and Output via Dendrogram and TSNE

In [None]:
# Process all the text (May take 6+ minutes)
# doc_processed holds one cleaned, space-separated string of lemmas per abstract, in doc_df row order;
# it is reused by both the Doc2Vec and the SBERT sections below.
doc_processed = [my_preprocessor(text) for text in doc_df['ABSTRACT']]

In [None]:
# (May take 2+ minutes)
# Wrap every processed abstract as a TaggedDocument; the tag is the abstract's position as a string.
tagged_docs = [TaggedDocument(words=word_tokenize(doc), tags=[str(i)]) for i, doc in enumerate(doc_processed)]

# Make and train the model
model = Doc2Vec(vector_size=40,  # dimensionality of each document vector
                min_count=3, # filter out the infrequent tokens, whose term frequency is lower than min_count
                epochs=30)  # number of training passes over the corpus


# The vocabulary must be built from the tagged documents before training starts
model.build_vocab(tagged_docs)
model.train(tagged_docs,
            total_examples=model.corpus_count,
            epochs=model.epochs)

In [None]:
# Infer one Doc2Vec vector per processed abstract (May take 2+ minutes)
doc_vectors = [model.infer_vector(tokens) for tokens in map(word_tokenize, doc_processed)]

# Group the document vectors with Ward-linkage agglomerative clustering
num_of_clusters = 4

my_clustering = AgglomerativeClustering(linkage='ward', n_clusters=num_of_clusters)
# fit_predict fits the model and returns its labels_ array
cluster_assignment = my_clustering.fit_predict(doc_vectors)

In [None]:
# Display Results: one row per abstract with its processed text and Doc2Vec cluster label.
# (The earlier one-column results_df was overwritten immediately, so it is no longer built.)

# Not used in the visible notebook; retained in case other cells reference it.
cluster_assignment_series = pd.Series(cluster_assignment, name='Cluster')
results_df = pd.DataFrame({
    'ABSTRACT': doc_df['ABSTRACT'],
    'PROCESSED_TEXT': doc_processed,
    'Cluster': cluster_assignment
})
results_df.head()


In [None]:
# Display Dendrogram (May take 2+ minutes)
# Ward linkage over the Doc2Vec document vectors, the same linkage criterion used for the clustering above.
sch.dendrogram(sch.linkage(doc_vectors, method='ward'))
plt.show()

In [None]:
# Convert vectors into numpy array for tsne (May take 4+ minutes)
embeddings = np.array(doc_vectors)
TSNE_model = TSNE(n_components=2, verbose=1, random_state=10, angle=.85, init='pca', perplexity=30)
reduced_embeddings = TSNE_model.fit_transform(embeddings)

# Display Results
TSNE_df = pd.DataFrame(reduced_embeddings, columns=['TSNE1', 'TSNE2'])
# .to_numpy() assigns the labels by position, so the result does not depend on
# results_df and TSNE_df happening to share the same index.
TSNE_df['topic'] = results_df['Cluster'].to_numpy()
# Plot from TSNE_df's own columns (as the LDA t-SNE cell does) so the axes are labeled.
ax = sns.scatterplot(x="TSNE1", y="TSNE2", hue="topic", data=TSNE_df)
# The hue values are cluster labels, so the legend is titled accordingly.
ax.get_legend().set_title("Cluster")
plt.show()

### SBERT (all-MiniLM-L6-v2), Hierarchical Clustering via SKLearn, and Output via Dendrogram and TSNE

In [None]:
# Load the sentence-embedding model used for this section.
# Alternative model (swap in to compare):
# sbert_model = SentenceTransformer("allenai-specter")
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# May take 16+ min
# Use sbert model to embed, using doc_processed from Doc2Vec test
# NOTE(review): doc_processed is lemmatized, stop-word-free text; sentence-embedding models are
# normally fed natural sentences — confirm this input is intended.
# This reassigns `embeddings`, replacing the Doc2Vec array stored under the same name.
embeddings = sbert_model.encode(doc_processed, batch_size = 128, show_progress_bar=True)

In [None]:
# Display Dendrogram (May take 2+ minutes)
# Ward linkage over the SBERT embeddings as produced by encode().
# NOTE(review): the clustering cell that follows standardizes the embeddings first, so this
# dendrogram is computed on unscaled vectors — confirm the two are meant to differ.
sch.dendrogram(sch.linkage(embeddings, method='ward'))
plt.show()

In [None]:
# May take 2+ minutes
# Standardize every embedding dimension, then group the abstracts with
# Ward-linkage agglomerative clustering.
embeddings = StandardScaler().fit_transform(embeddings)
num_of_clusters = 4
my_clustering = AgglomerativeClustering(linkage='ward', n_clusters=num_of_clusters)
# fit_predict fits the model and returns its labels_ array
cluster_assignment = my_clustering.fit_predict(embeddings)

In [None]:
# Display Results: one row per abstract with its processed text and SBERT cluster label.
# (The earlier one-column results_df was overwritten immediately, so it is no longer built.)

# Not used in the visible notebook; retained in case other cells reference it.
cluster_assignment_series = pd.Series(cluster_assignment, name='Cluster')
results_df = pd.DataFrame({
    'ABSTRACT': doc_df['ABSTRACT'],
    'PROCESSED_TEXT': doc_processed,
    'Cluster': cluster_assignment
})
results_df.head()

In [None]:
# Convert vectors into numpy array for tsne (May take up to 3+ minutes)
embeddings = np.array(embeddings)
TSNE_model = TSNE(n_components=2, verbose=1, random_state=10, angle=.85, init='pca', perplexity=30)
reduced_embeddings = TSNE_model.fit_transform(embeddings)

# Display Results
TSNE_df = pd.DataFrame(reduced_embeddings, columns=['TSNE1', 'TSNE2'])
# .to_numpy() assigns the labels by position, so the result does not depend on
# results_df and TSNE_df happening to share the same index.
TSNE_df['topic'] = results_df['Cluster'].to_numpy()
# Plot from TSNE_df's own columns (as the LDA t-SNE cell does) so the axes are labeled.
ax = sns.scatterplot(x="TSNE1", y="TSNE2", hue="topic", data=TSNE_df)
# The hue values are cluster labels, so the legend is titled accordingly.
ax.get_legend().set_title("Cluster")
plt.show()