In [233]:
import faiss
from sentence_transformers import SentenceTransformer

import numpy as np
import pandas as pd

import os
from dotenv import load_dotenv

import json

In [122]:
# load environment variables via python-dotenv (PROJ_HOME is read with os.getenv in a later cell)
load_dotenv()

True

In [None]:
# load the 'all-mpnet-base-v2' sentence-transformers model on CPU; used for both titles and queries
encoder = SentenceTransformer('all-mpnet-base-v2', device='cpu')

In [267]:
# import article titles from metadata

proj_home = os.getenv("PROJ_HOME")
if not proj_home:
    # fail with a clear message instead of a TypeError from os.path.join(None, ...)
    raise EnvironmentError("PROJ_HOME is not set; define it in the environment or in the .env file")
metadata_filepath = os.path.join(proj_home, "data/data/subject_matter/advanced_metadata.json")

# JSON is UTF-8 by specification; do not depend on the platform's default encoding
with open(metadata_filepath, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# de-duplicate while keeping first-seen order: set() ordering of strings varies between
# interpreter runs, which would make the row ids (and hence FAISS ids) non-reproducible
article_titles = list(dict.fromkeys(item["title"] for item in metadata))
article_titles_df = pd.DataFrame(data=article_titles, columns=['title'])

In [241]:
# get title embeddings
# (one embedding per entry of article_titles, in the same order; the FAISS ids
# returned by searches are later matched against article_titles_df row labels)

title_embeddings = encoder.encode(article_titles, device='cpu')

In [263]:
# build FAISS index

# read the dimensionality from the embeddings rather than hard-coding 768,
# so the index still matches if the encoder model is changed
dim = title_embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(title_embeddings)

In [275]:
# get article matches for a given subject

def get_article_matches(subject, k=5):
    """Return the titles of the articles whose title embeddings are closest to `subject`.

    Parameters
    ----------
    subject : str
        Query text; it is embedded with the notebook-level `encoder`.
    k : int, default 5
        Number of nearest neighbours requested from the notebook-level FAISS `index`.

    Returns
    -------
    list of str
        Titles looked up in `article_titles_df`, in the order FAISS returns the
        neighbours (increasing L2 distance). The list is shorter than `k` when
        the index holds fewer than `k` vectors.
    """
    search_embedding = encoder.encode([subject], device='cpu')
    # distances are not part of the result, so they are not post-processed
    _, ids = index.search(search_embedding, k=k)
    # FAISS pads the id row with -1 when fewer than k neighbours exist; drop those
    hit_ids = ids[0][ids[0] >= 0]
    # ids are row positions of the vectors added to the index, i.e. rows of article_titles_df
    return article_titles_df['title'].iloc[hit_ids].tolist()

In [288]:
# example query: titles nearest to "science" (default k=5)
get_article_matches("science")

['science', 'scientific theory', 'zoology', 'Big Science', 'physical science']