## Jonatas Cesar 

### Objective
The task is to develop an unsupervised model that assigns each abstract to a topic.
In other words, the goal is to group abstracts based on their semantic similarity.

### Download documents and install dependencies

In [1]:
%%capture
!pip install bertopic lxml

In [2]:
%%capture
%%sh
# Download the NSF 2020 award archive and extract its files into ./data.
# Stop at the first failing command so a broken download is never unzipped.
set -eu
# -p: do not fail when the directory already exists (cell can be re-run).
mkdir -p data
wget "https://www.nsf.gov/awardsearch/download?DownloadFileName=2020&All=true" -O temp.zip
# -o: overwrite previously extracted files instead of prompting on re-run.
unzip -o -d data temp.zip
rm -f temp.zip

## Resolution of the task

In [3]:
%%capture
from pathlib import Path

import lxml.etree as ET
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

In [4]:
def get_abstract_from_xml(xml_path: str) -> str:
    """Return the cleaned abstract text stored in one award XML file.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        The text of the first ``AbstractNarration`` element, with ``<br/>``
        markers replaced by spaces and whitespace runs collapsed. The string
        ``"None"`` is returned when the file cannot be parsed, the element is
        missing, or the abstract is empty, so callers can drop such files
        with a ``!= "None"`` check.
    """
    missing = "None"  # placeholder value the loading cell filters on

    try:
        tree = ET.parse(xml_path)
    except Exception as e:  # best effort: report the bad file and keep going
        print("Error: ", xml_path, e)
        return missing

    node = tree.find(".//AbstractNarration")
    if node is None or node.text is None:
        return missing

    # The XML parser has already unescaped entities, so line breaks appear as
    # a literal "<br/>"; the escaped form is handled too in case the input was
    # double-escaped. A space is inserted so adjacent sentences are not glued.
    abstract = node.text
    for marker in ("&lt;br/&gt;", "<br/>"):
        abstract = abstract.replace(marker, " ")

    # Collapse whitespace runs (also trims leading/trailing whitespace).
    abstract = " ".join(abstract.split())
    return abstract or missing

In [5]:
# Collect every XML file under data/ (sorted so the document order is stable).
files = sorted(Path("data").rglob("*.xml"))

# Keep only usable abstracts: drop failed parses (None), the "None"
# placeholder produced for files without an abstract, and empty strings.
abstracts = [
    text
    for text in (get_abstract_from_xml(str(path)) for path in files)
    if text and text != "None"
]

# Fail early with a clear message instead of deep inside the model fit.
assert abstracts, "No abstracts were loaded from data/ - did the download cell run?"

In [6]:
# Seed for the stochastic UMAP step so the topics are comparable between runs.
RANDOM_SEED = 42

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# A fixed random_state makes UMAP reproducible (at the cost of its parallelism).
umap_model = UMAP(
    n_neighbors=30,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=RANDOM_SEED,
)

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(
    min_cluster_size=50,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(
    stop_words="english",
    ngram_range=(1, 3),
    min_df=30,
    max_df=0.9,
)

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - Fine-tune topic representations with a
# `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
    top_n_words=10,
)

In [7]:
# Fit the topic model on all abstracts. `topics` is reused further down when
# the outlier documents are reassigned.
topics, probs = topic_model.fit_transform(abstracts)

In [8]:
# Summary table of the fitted topics (Topic, Count, Name, Representation, ...).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,5735,-1_molecular_chemistry_chemical_devices,"[molecular, chemistry, chemical, devices, data...",[The maturation of energy-harvesting (EH) tech...
1,0,592,0_matter_discoveries_physics_observations,"[matter, discoveries, physics, observations, m...",[There are astrophysical objects named neutron...
2,1,549,1_surface_phenomena_shape_physics,"[surface, phenomena, shape, physics, mathemati...",[This research addresses problems emerging fro...
3,2,461,2_stem education_education research_stem field...,"[stem education, education research, stem fiel...",[Morgan State University proposes to implement...
4,3,369,3_climate_simulations_project use_observations,"[climate, simulations, project use, observatio...",[The geological history of tropical glaciers c...
5,4,352,4_molecular_biology_biological_pathways,"[molecular, biology, biological, pathways, pat...",[One of the great challenges of modern science...
6,5,341,5_regions_depth_ground_structural,"[regions, depth, ground, structural, boundarie...",[There is a 99.7% chance a magnitude 6.7 earth...
7,6,338,6_ecosystem_climate_chemistry_chemical,"[ecosystem, climate, chemistry, chemical, post...",[Dissolved organic carbon (DOC) is a key compo...
8,7,296,7_species_ecosystem_biology_biological,"[species, ecosystem, biology, biological, evol...",[Species are a fundamental unit of diversity i...
9,8,251,8_machine learning_deep learning_algorithms_da...,"[machine learning, deep learning, algorithms, ...",[Exciting empirical breakthroughs have emerged...


In [None]:
# Embed every abstract once; the vectors are reused below for outlier
# reduction and for the 2-D document plot.
embeddings = embedding_model.encode(abstracts, show_progress_bar=True)

Batches:   0%|          | 0/412 [00:00<?, ?it/s]

In [None]:
# Reassign outlier documents (topic -1) using the pre-calculated embeddings.
new_topics = topic_model.reduce_outliers(
    abstracts, topics, strategy="embeddings", embeddings=embeddings
)

# update_topics() rebuilds the topic representations from the new assignment.
# The custom models are passed again so the rebuilt representations keep the
# same vectorizer, c-TF-IDF and KeyBERT-inspired settings used for the fit
# instead of the library defaults.
topic_model.update_topics(
    abstracts,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model,
)

In [None]:
# Render BERTopic's topic-level visualization for the fitted model.
topic_model.visualize_topics()

In [None]:
# Render BERTopic's hierarchy visualization of the fitted topics.
topic_model.visualize_hierarchy()

In [None]:
# Separate 2-D projection used only for plotting the documents.
# A fixed random_state keeps the layout identical between runs.
umap_model_plot = UMAP(
    n_neighbors=30,
    n_components=2,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
reduced_embeddings = umap_model_plot.fit_transform(embeddings)

In [None]:
# Plot the abstracts on the pre-computed 2-D projection; `sample=0.1` and
# `hide_annotations=True` are passed to keep the figure light.
topic_model.visualize_documents(
    abstracts,
    reduced_embeddings=reduced_embeddings,
    hide_annotations=True,
    sample=0.1,
)