Fetch journal categories and sub-categories from [Google Scholar Top Venues](https://scholar.google.com/citations?view_op=top_venues) and save them as JSON

In [1]:
import json
from pathlib import Path

from scholarly import scholarly

google_scholar_dir = Path("data/google_scholar/")
# parents=True: also create the intermediate "data" directory, so the cell
# works on a fresh checkout where it does not exist yet.
google_scholar_dir.mkdir(parents=True, exist_ok=True)

# journal_categories will fetch online data so it takes some time;
# read it once and reuse the mapping for both lists below.
journal_categories = scholarly.journal_categories

# Top-level category names, in the order the mapping yields them.
categories = list(journal_categories.keys())

# De-duplicate the sub-category names and sort them. The iteration order of a
# set of strings can differ between interpreter sessions, so sorting keeps
# sub_categories.json stable across re-runs; anything indexed by position
# against this list (e.g. saved embeddings) stays aligned with it.
# None keys are skipped so only real sub-category names are kept.
sub_categories = sorted(
    {
        sub_cat
        for cat_dict in journal_categories.values()
        for sub_cat in cat_dict.keys()
        if sub_cat is not None
    }
)

cat_path = google_scholar_dir / "categories.json"
sub_cat_path = google_scholar_dir / "sub_categories.json"

# Persist both name lists as pretty-printed JSON arrays.
with cat_path.open("w") as cat_f:
    json.dump(categories, cat_f, indent=2)
with sub_cat_path.open("w") as sub_cat_f:
    json.dump(sub_categories, sub_cat_f, indent=2)


Generate sentence embeddings for each category and sub-category

In [14]:
import torch

from sentence_transformers import SentenceTransformer
from sentence_transformers.util import semantic_search

# Reload the name lists from the JSON files so the embeddings are computed
# from exactly what is stored on disk.
categories = json.loads(cat_path.read_text())
sub_categories = json.loads(sub_cat_path.read_text())

st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode both name lists with the same model and options.
cat_embeddings, sub_cat_embeddings = (
    st_model.encode(names, convert_to_tensor=True)
    for names in (categories, sub_categories)
)

# Only the sub-category embeddings are written to disk; the category
# embeddings are kept in memory only.
sub_cat_embeddings_path = google_scholar_dir / "sub_cat_embeddings.pt"
torch.save(sub_cat_embeddings, sub_cat_embeddings_path)


Test query search

In [12]:
# Sample query used to sanity-check the category / sub-category embeddings.
query = "robotic in Healthcare"
query_embedding = st_model.encode([query], convert_to_tensor=True)

# A single query is passed in, so only the first result list is kept.
cat_matches = semantic_search(query_embedding, cat_embeddings, top_k=2)[0]
sub_cat_matches = semantic_search(query_embedding, sub_cat_embeddings, top_k=4)[0]

# Print "<name> <score>" for the category hits first, then the sub-category
# hits; corpus_id is the position of the hit in the matching name list.
for labels, hits in ((categories, cat_matches), (sub_categories, sub_cat_matches)):
    for hit in hits:
        print(labels[hit["corpus_id"]], hit["score"])


Health & Medical Sciences 0.29512783885002136
Physics & Mathematics 0.1700638234615326
Robotics 0.7121439576148987
Biomedical Technology 0.4736882150173187
Medical Informatics 0.4277925491333008
Orthopedic Medicine & Surgery 0.4193120300769806
