<a href="https://colab.research.google.com/github/jockylover/LSTM/blob/main/Dataset_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Dataset preparation

In this notebook, we build and pre-process the arXiv paper dataset, and then fit a BERTopic topic model on the processed documents.

In [1]:
# imports
import sys
sys.path.insert(0, "../")
from dataset import ArXivDataset

In [2]:
# Mount Google Drive at /content/drive (Colab-only API). The metadata file used
# when building the dataset is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [3]:
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus into the local nltk_data directory.
nltk.download('stopwords')

# Load the English stopword list as a set and confirm that loading worked.
stop_words = set(stopwords.words('english'))
print("Stopwords downloaded and loaded successfully.")


Stopwords downloaded and loaded successfully.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


I used a variety of techniques to pre-process the text: removal of LaTeX equations, whitespace clean-up, tokenization, stopword removal, n-gram phrase detection, and lemmatization.

In [4]:
# build and pre-process texts
# The metadata snapshot is read from the mounted Google Drive. The six
# pre-processing stages listed in the cell output appear to run inside
# ArXivDataset.from_metadata (defined in the project's dataset module, not shown
# here) -- confirm there before relying on the exact stage order.
metadata_filepath = "/content/drive/MyDrive/arxiv-metadata-oai-snapshot.json"
dataset = ArXivDataset.from_metadata(metadata_filepath)
# Report how many papers the dataset contains.
print("# papers: {n}".format(n=len(dataset)))

 [1/6] Removing LaTex equations...
 [2/6] Removing newlines and extra spaces...
 [3/6] Tokenizing documents...
 [4/6] Removing stopwords...
 [5/6] Identifying n-gram phrases...
 [6/6] Lemmatizing...
 Done.
# papers: 390151


Now that the texts have been pre-processed, they can be exported as a dataset object.

In [5]:
# export dataset
import os

# Create the output directory first so the export does not depend on ./data
# already existing in the working directory (it does not on a fresh runtime
# unless something else created it).
os.makedirs("./data", exist_ok=True)
dataset.save("./data/dataset.obj")

In [6]:
!pip install bertopic sentence-transformers umap-learn hdbscan


Collecting bertopic
  Downloading bertopic-0.16.2-py2.py3-none-any.whl (158 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/158.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/227.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting umap-learn
  Downloading umap_learn-0.5.6-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting hdbscan
  Downloading hdbscan-0.8.37-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [

In [None]:
import sys
sys.path.insert(0, "../")
import os
import numpy as np
import pandas as pd
from dataset import ArXivDataset
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import seaborn as sns
import logging

# Plot styling shared by every figure in this notebook.
sns.set_context("talk")
sns.set_style("ticks")

# Set up logging
# logging.basicConfig() does nothing when the root logger already has handlers,
# which is commonly the case inside a notebook kernel; force=True (Python 3.8+)
# removes the existing handlers so the INFO-level progress messages below are
# actually emitted.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

def _document_to_text(doc):
    """Return ``doc`` as a single plain-text string for the sentence encoder.

    A list or tuple is treated as a sequence of tokens and joined with single
    spaces, so the encoder sees text rather than the Python repr of a list.
    Any other value is converted with ``str``.
    """
    if isinstance(doc, (list, tuple)):
        return " ".join(str(token) for token in doc)
    return str(doc)


def main():
    """Fit a BERTopic model on the exported arXiv dataset, save it, and plot topic sizes.

    Steps:
      1. Load the dataset from ``./data/dataset.obj``.
      2. Embed every document with the ``all-MiniLM-L6-v2`` sentence transformer.
      3. Fit BERTopic on the documents and their pre-computed embeddings.
      4. Save the fitted model to ``./models/bertopic_model``.
      5. Report a coherence score if the model object provides one.
      6. Draw a bar chart of the number of documents per topic.

    Any exception is logged with its traceback and not re-raised, so the
    function always returns ``None``.
    """
    try:
        # Load dataset
        dataset_path = "./data/dataset.obj"
        dataset = ArXivDataset.load(dataset_path)
        logging.info(f"# papers: {len(dataset)}")

        # Ensure all documents are strings.
        # NOTE(review): assumes pre-processed documents may be token lists --
        # confirm against ArXivDataset.documents.
        documents = [_document_to_text(doc) for doc in dataset.documents]

        # Load sentence transformer model
        sentence_model = SentenceTransformer("all-MiniLM-L6-v2")

        # Embed documents
        logging.info("Embedding documents...")
        embeddings = sentence_model.encode(documents, show_progress_bar=True)

        # Create BERTopic model
        # NOTE(review): calculate_probabilities=True builds a documents x topics
        # matrix; on a corpus this large that is slow and memory-hungry, and the
        # probabilities are not used below. Consider turning it off.
        topic_model = BERTopic(nr_topics="auto", calculate_probabilities=True)

        # Fit the model (the returned topic assignments are not needed here).
        logging.info("Fitting BERTopic model...")
        topic_model.fit_transform(documents, embeddings)

        # Save the model right after fitting, so a failure in the reporting
        # steps below cannot throw away the expensive embedding + fitting work.
        model_path = "./models/bertopic_model"
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        topic_model.save(model_path)
        logging.info(f"Model saved to {model_path}")

        # Get topic coherence.
        # BERTopic's documented API has no get_coherence(); calling it
        # unconditionally raised AttributeError and aborted the run. Use it only
        # when the installed model object actually provides it.
        get_coherence = getattr(topic_model, "get_coherence", None)
        if callable(get_coherence):
            logging.info("Calculating coherence score...")
            coherence_score = get_coherence("c_v")
            logging.info(f"Coherence score: {coherence_score:.3f}")
        else:
            logging.warning(
                "Skipping coherence score: this BERTopic version has no get_coherence()."
            )

        # Plot topic sizes.
        # get_topic_freq() returns a DataFrame with "Topic" and "Count" columns
        # (topic -1 is BERTopic's outlier topic).
        topic_freq = topic_model.get_topic_freq()
        fig, ax = plt.subplots(1, 1, figsize=(6, 4))
        ax.set_title("Topic sizes")
        sns.barplot(x=topic_freq["Topic"], y=topic_freq["Count"], color="#DB5461", ax=ax)
        ax.set_xlabel("Topic")
        ax.set_ylabel("Size")
        plt.show()

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)

# Run the pipeline when this cell is executed as the top-level program.
if __name__ == '__main__':
    main()


Batches:   0%|          | 0/12193 [00:00<?, ?it/s]

  pid = os.fork()
