Load the concept and document data.

In [None]:
from pathlib import Path

from tqdm.auto import tqdm

from src.concept import Concept
from src.document import Document

In [None]:
# Root of the processed dataset, relative to the notebook's working directory.
data_dir = Path("../data/processed")

# Load every document JSON file. The paths are sorted because glob yields them
# in filesystem-dependent order; sorting keeps the order of `documents` (and of
# anything derived from it) stable across machines and runs.
document_dir = data_dir / "documents"
document_files = sorted(document_dir.glob("*.json"))
documents = [Document.load(file) for file in tqdm(document_files)]

# Load every concept JSON file in the same deterministic way.
concept_dir = data_dir / "concepts"
concept_files = sorted(concept_dir.glob("*.json"))
concepts = [Concept.load(file) for file in tqdm(concept_files)]

Create a mapping from concept ID to preferred label.

In [None]:
# Lookup table from each loaded concept's `id` to its `preferred_label`, used
# below to present concepts by label rather than by id.
concept_id_to_label = {concept.id: concept.preferred_label for concept in concepts}

Count the instances of each concept found in the document data: which concepts are most and least common?

In [None]:
from collections import Counter

# Tally every concept occurrence across all documents, keyed by the concept's
# preferred label. A concept id missing from `concept_id_to_label` raises
# KeyError.
all_found_concepts = Counter(
    concept_id_to_label[concept_id]
    for document in documents
    for concept_id in document.concepts
)
all_found_concepts

Create a dataframe with the document-wise counts of each concept.

In [None]:
import pandas as pd

In [None]:
# Per-document tallies: document id -> {concept label: occurrences in that
# document}.
document_concept_counts = {
    document.id: dict(
        Counter(concept_id_to_label[concept_id] for concept_id in document.concepts)
    )
    for document in documents
}

# The dict-of-dicts constructor puts documents on the columns, so transpose to
# get one row per document and one column per concept label. Concepts absent
# from a document come out as NaN and are filled with 0 before casting to int.
df = pd.DataFrame(document_concept_counts).transpose().fillna(0).astype(int)
df

Show the co-occurrence matrix of the concepts in documents.

In [None]:
# Concept-by-concept matrix obtained by multiplying the transposed count table
# with itself: entry (i, j) is the sum over the rows of `df` of
# count_i * count_j.
concept_interactions = df.T @ df
concept_interactions

Normalise the concept interactions and then plot them as a heatmap.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Normalise the co-occurrence matrix symmetrically: entry (i, j) is divided by
# sqrt(C[i, i] * C[j, j]), so the diagonal becomes 1 and the matrix stays
# symmetric. Dividing the DataFrame by the 1-D vector of diagonal roots alone
# would broadcast across columns only, scaling column j but never row i.
concept_norms = np.sqrt(concept_interactions.values.diagonal())
normalised_concept_interactions = concept_interactions / np.outer(
    concept_norms, concept_norms
)

plt.figure(figsize=(12, 12))

sns.heatmap(normalised_concept_interactions, cmap="viridis")
plt.show()

Find the sentences which contain concepts, embed them, and plot a 2D UMAP projection of the embeddings.


In [None]:
import pandas as pd
import plotly.express as px
import torch
from transformers import AutoModel, AutoTokenizer
from umap import UMAP

In [None]:
# Collect the text of every sentence that fully contains at least one concept
# span. A set is used so a sentence matched by several concept spans (or
# repeated across documents) is stored only once.
sentences = set()
for document in tqdm(documents):
    for concept_span in document.concept_spans:
        for sentence_span in document.sentence_spans:
            starts_before_sentence = (
                concept_span.start_index < sentence_span.start_index
            )
            ends_after_sentence = concept_span.end_index > sentence_span.end_index
            # Skip sentences that do not enclose the whole concept span.
            if starts_before_sentence or ends_after_sentence:
                continue
            sentence_text = document.text[
                sentence_span.start_index : sentence_span.end_index
            ]
            sentences.add(sentence_text)

In [None]:
# Convert the set to a list so it can be sliced and indexed positionally.
# NOTE(review): set iteration order is arbitrary, so which sentences fall into
# any fixed-size prefix of this list may differ between runs — confirm that is
# acceptable for the downstream sample.
sentences = list(sentences)
len(sentences)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

model.eval()

# Embed at most the first 5000 sentences, in small batches rather than one
# forward pass per sentence.
batch_size = 32
sentences_to_embed = sentences[:5000]

sentence_embeddings = []
with torch.no_grad():
    for batch_start in tqdm(range(0, len(sentences_to_embed), batch_size)):
        batch = sentences_to_embed[batch_start : batch_start + batch_size]
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)

        # The sentence-transformers checkpoint is meant to be used with
        # attention-masked mean pooling over the token embeddings followed by
        # L2 normalisation (see its model card); `pooler_output` is not the
        # sentence embedding the model was trained to produce.
        token_embeddings = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
        summed = (token_embeddings * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        pooled = summed / mask.sum(dim=1).clamp(min=1e-9)
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)

        # Keep one (1, hidden_size) tensor per sentence, as downstream code
        # expects a list of per-sentence tensors.
        sentence_embeddings.extend(pooled.split(1))

In [None]:
# Join the per-sentence (1, hidden_size) tensors along the batch axis into a
# single (n_sentences, hidden_size) array. Concatenating, rather than stacking
# and squeezing, keeps the result 2-D even when there is only one sentence.
embeddings = torch.cat(sentence_embeddings, dim=0).numpy()

In [None]:
# Reduce the sentence embeddings to two components with UMAP so they can be
# drawn on a scatter plot.
# NOTE(review): no random_state is passed, so the layout presumably varies
# between runs — confirm whether reproducibility matters here.
umap = UMAP(n_components=2)
embeddings_2d = umap.fit_transform(embeddings)

In [None]:
# Build the plotting table: one row per embedded sentence, holding its 2-D
# coordinates and its text. The [:5000] slice mirrors the one applied when the
# sentences were embedded, so coordinates and text stay aligned.
plotted_sentences = sentences[:5000]
x_coords = embeddings_2d[:, 0]
y_coords = embeddings_2d[:, 1]
df = pd.DataFrame({"x": x_coords, "y": y_coords, "sentence": plotted_sentences})

# Show only the sentence text on hover; the raw coordinates are hidden.
hover_config = {"x": False, "y": False, "sentence": True}
fig = px.scatter(df, x="x", y="y", hover_data=hover_config)
fig.show()