In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import json
import pandas as pd

# Comparing word embeddings
This approach uses embeddings to measure the similarity between the ground-truth abstract (from the JSON) and the parsed abstract.

The embedding workflow and the pre-trained model both come from the Hugging Face `transformers` library.

### Step one: create word embeddings for the abstracts

In [4]:
# Step one: load the pre-trained checkpoint used to create the embeddings.
# The tokenizer and the encoder are both resolved from the same checkpoint name.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)

In [5]:
#gets the embedding for each text.
def get_embedding(text):
    """Return a pooled embedding for ``text`` as a NumPy array.

    The text is tokenized (truncated to at most 512 tokens), passed through
    the module-level ``model``, and the final-layer token vectors are averaged
    into one vector per input text.

    The average is weighted by the tokenizer's attention mask, so padding
    positions do not contribute. For a single string no padding is added and
    the result equals a plain mean over the token axis; for a list of texts
    of different lengths this keeps the shorter texts' embeddings from being
    skewed by pad-token vectors.

    Args:
        text: Input forwarded unchanged to ``tokenizer``; a single string in
            this notebook (a list of strings should also work, since
            ``padding=True`` is set).

    Returns:
        The pooled embedding(s) converted with ``.numpy()`` after
        ``squeeze()`` has dropped all size-1 dimensions.
    """
    # first, tokenize
    tokens = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # then get embeddings
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**tokens)
        hidden = outputs.last_hidden_state
        # Broadcast the attention mask over the hidden dimension so that
        # padded positions are zeroed out before summing.
        mask = tokens["attention_mask"].unsqueeze(-1).type_as(hidden)
        token_sum = (hidden * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-masked row.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = token_sum / token_count
    return embeddings.squeeze().numpy()

In [6]:
# Abstracts under comparison -- fill in with abstracts.
# NOTE(review): the two sample strings below appear to be the same text.
ground_text = "This study presents an analytical approach to sector rotation, leveraging both factor models and fundamental metrics. We initiate with a systematic classification of sectors, followed by an empirical investigation into their returns. Through factor analysis, the paper underscores the significance of momentum and short-term reversion in dictating sectoral shifts. A subsequent in-depth fundamental analysis evaluates metrics such as PE, PB, EV-to-EBITDA, Dividend Yield, among others. Our primary contribution lies in developing a predictive framework based on these fundamental indicators. The constructed models, post rigorous training, exhibit noteworthy predictive capabilities. The findings furnish a nuanced understanding of sector rotation strategies, with implications for asset management and portfolio construction in the financial domain."
parsed_text = "This study presents an analytical approach to sector rotation, leveraging both factor models and fundamental metrics. We initiate with a systematic classification of sectors, followed by an empirical investigation into their returns. Through factor analysis, the paper underscores the significance of momentum and short-term reversion in dictating sectoral shifts. A subsequent in-depth fundamental analysis evaluates metrics such as PE, PB, EV-to-EBITDA, Dividend Yield, among others. Our primary contribution lies in developing a predictive framework based on these fundamental indicators. The constructed models, post rigorous training, exhibit noteworthy predictive capabilities. The findings furnish a nuanced understanding of sector rotation strategies, with implications for asset management and portfolio construction in the financial domain."

# Embed both abstracts with the same helper, ground truth first.
ground_embedding, parsed_embedding = (
    get_embedding(abstract) for abstract in (ground_text, parsed_text)
)

We now have the embeddings needed for the comparison.

Note: Here, we've sourced the embeddings using the all-MiniLM-L6-v2 model from Hugging Face. In practice, it may make sense to use a more specialized model, such as one trained specifically on academic language.

### Step two: analyze similarities
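Here we use cosine similarity to compare the two embeddings. SciPy's `cosine` function returns the cosine *distance*, so the similarity is computed as `1 - distance`; values close to 1 mean the two texts are nearly identical in embedding space.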

In [7]:
# SciPy's `cosine` gives the cosine *distance* between the two vectors;
# subtracting it from 1 turns it into a cosine similarity.
cosine_distance = cosine(ground_embedding, parsed_embedding)
similarity = 1 - cosine_distance
similarity

np.float32(1.0)