<div style="text-align:center;">
  <img src="images/molssi_main_horizontal.png" style="display: block; margin: 0 auto; max-height:200px;">
</div>

# Building a Database with SQLModel

<strong>Author(s):</strong> Jessica A. Nash, The Molecular Sciences Software Institute

<div class="alert alert-block alert-info"> 
<h2>Overview</h2>

<strong>Questions:</strong>


<strong>Objectives:</strong>

</div>



In [None]:
import os

from typing import Optional, List

from sqlmodel import Field, SQLModel, Session, Relationship, create_engine

# Start from a clean slate: delete the database file left over from a previous run, if any.
try:
    os.remove("sqlmodel_database.db")
except FileNotFoundError:
    # Nothing to clean up - this is the first run.
    pass


# Define the association (link) tables first - we will use these in relationships for our main tables.
# The main tables pass these classes to `Relationship(link_model=...)`, so it is easiest if the
# classes already exist when the main tables are defined. Otherwise we would need something like
# ForwardRef to defer the reference (an approach that did not work when tried here).
class ArticleKeyword(SQLModel, table=True):
    """Link (association) table for the many-to-many relationship between articles and keywords.

    Each row pairs one article with one keyword. Both foreign keys are part of
    the primary key, so a given (article, keyword) pair can be stored only once.
    """

    # This lets us run the Jupyter notebook cell multiple times without error
    __table_args__ = {"extend_existing": True}

    article_doi: str = Field(foreign_key="article.doi", primary_key=True)
    # Keyword.id is an integer primary key, so the column referencing it must be an int too.
    keyword_id: int = Field(foreign_key="keyword.id", primary_key=True)

class ArticleAuthor(SQLModel, table=True):
    """Link (association) table for the many-to-many relationship between articles and authors.

    Each row pairs one article with one author. Both foreign keys are part of
    the primary key, so a given (article, author) pair can be stored only once.
    """

    # Allows this cell to be re-run in the notebook without a "table already defined" error
    __table_args__ = {"extend_existing": True}

    article_doi: str = Field(foreign_key="article.doi", primary_key=True)
    # Author.id is an integer primary key, so the column referencing it must be an int too.
    author_id: int = Field(foreign_key="author.id", primary_key=True)

class Article(SQLModel, table=True):
    """A paper, identified by its DOI, together with its keywords and authors."""

    # Allows this cell to be re-run in the notebook without a "table already defined" error
    __table_args__ = {"extend_existing": True}

    # Columns
    doi: str = Field(primary_key=True)
    title: str
    publication_year: int
    abstract: Optional[str] = None

    # Many-to-many relationships, each routed through its link table
    keywords: List["Keyword"] = Relationship(
        back_populates="articles",
        link_model=ArticleKeyword,
    )
    authors: List["Author"] = Relationship(
        back_populates="articles",
        link_model=ArticleAuthor,
    )

class Author(SQLModel, table=True):
    """An author of one or more articles."""

    # Allows this cell to be re-run in the notebook without a "table already defined" error
    __table_args__ = {"extend_existing": True}

    # default=None makes `id` optional when creating an Author in Python;
    # the database assigns the integer primary key when the row is inserted.
    id: Optional[int] = Field(default=None, primary_key=True)
    first_name: str
    last_name: str
    affiliation: Optional[str] = Field(default=None)

    # Many-to-many link to Article through the ArticleAuthor table
    articles: List["Article"] = Relationship(back_populates="authors", link_model=ArticleAuthor)


class Keyword(SQLModel, table=True):
    """A keyword that can be attached to any number of articles."""

    # Allows this cell to be re-run in the notebook without a "table already defined" error
    __table_args__ = {"extend_existing": True}

    # default=None makes `id` optional when creating a Keyword in Python;
    # the database assigns the integer primary key when the row is inserted.
    id: Optional[int] = Field(default=None, primary_key=True)
    # Each keyword string may be stored only once; the index speeds up lookups by keyword text.
    keyword: str = Field(unique=True, index=True)

    # Many-to-many link to Article through the ArticleKeyword table
    articles: List["Article"] = Relationship(back_populates="keywords", link_model=ArticleKeyword)



In [None]:
# Location of the SQLite database file (a path relative to the working directory)
sqlite_file_name = "sqlmodel_database.db"
sqlite_url = "sqlite:///" + sqlite_file_name

# The engine is the connection point to the database used by every Session below
engine = create_engine(sqlite_url)

In [None]:
# Create the tables registered on SQLModel's metadata (the table=True classes defined above)
# in the database that `engine` points to.
SQLModel.metadata.create_all(engine)

In [None]:
import requests

import datetime

# Fetch the most recent theoretical chemistry paper on ChemRxiv
paper = requests.get(
    "https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?categoryIds=605c72ef153207001f6470ce&limit=1",
    timeout=30,  # fail instead of hanging indefinitely if the server does not respond
)
# Stop here on an HTTP error rather than trying to parse an error page as JSON
paper.raise_for_status()
print(paper.json()["itemHits"])

In [None]:
# Parse the response body once and keep a handle on the first (and only) hit's item record
first_item = paper.json()["itemHits"][0]["item"]

# Collect just the fields that map onto our Article model
recent_paper = {
    "doi": first_item["doi"],
    "title": first_item["title"],
    # get the current year - making some assumptions here :)
    "publication_year": datetime.datetime.now().year,
    "abstract": first_item["abstract"],
    "keywords": first_item["keywords"],
    "authors": first_item["authors"],
}

print(recent_paper)

In [None]:
# NOTE: this cell replaces the raw API values stored in recent_paper with model objects,
# so re-run the previous cell before running this one a second time.

# Build one Keyword object per distinct keyword. Keywords are lower-cased so that
# differently-capitalized spellings map to the same row, and de-duplicated (order preserved)
# because Keyword.keyword has a unique constraint.
keyword_objs = []
for normalized_keyword in dict.fromkeys(keyword.lower() for keyword in recent_paper["keywords"]):
    keyword_objs.append(Keyword(keyword=normalized_keyword))

# Build one Author object per listed author.
author_objs = []
for author in recent_paper["authors"]:
    # An author may have no institution listed; store no affiliation instead of failing.
    affiliation = author["institutions"][0]["name"] if author["institutions"] else None
    author_obj = Author(
        first_name=author["firstName"],
        last_name=author["lastName"],
        affiliation=affiliation,
    )
    author_objs.append(author_obj)

# Swap the raw API data for the model objects so the dict can be passed straight to Article(...)
recent_paper["keywords"] = keyword_objs
recent_paper["authors"] = author_objs

In [None]:
# Display the keyword entries currently stored on the paper dictionary
recent_paper["keywords"]

In [None]:
# Build the Article from the prepared dictionary; its "keywords" and "authors"
# entries become the article's relationships.
article = Article(**recent_paper)

# Add the article and write it to the database
with Session(engine) as session:
    session.add(article)
    session.commit()

In [None]:
# TODO: Show how to query the database here.

Let's pull 50 more papers from ChemRxiv and add them to our database.

In [None]:
import requests
import datetime
from sqlmodel import select, Session

# Get the most recent theoretical chemistry papers on ChemRxiv.
# NOTE(review): skip=1 presumably skips the newest paper, which was already added above - confirm.
papers = requests.get(
    "https://chemrxiv.org/engage/chemrxiv/public-api/v1/items?categoryIds=605c72ef153207001f6470ce&limit=50&skip=1",
    timeout=30,  # fail instead of hanging indefinitely if the server does not respond
)
# Stop here on an HTTP error rather than trying to parse an error page as JSON
papers.raise_for_status()


def _get_or_create_keyword(session, normalized_keyword):
    """Return the Keyword row for ``normalized_keyword``, adding a new one to ``session`` if needed."""
    existing_keyword = session.exec(
        select(Keyword).where(Keyword.keyword == normalized_keyword)
    ).first()
    if existing_keyword:
        return existing_keyword

    keyword_obj = Keyword(keyword=normalized_keyword)
    session.add(keyword_obj)
    return keyword_obj


def _get_or_create_author(session, first_name, last_name, affiliation):
    """Return the Author row matching name and affiliation, adding a new one to ``session`` if needed."""
    existing_author = session.exec(
        select(Author).where(
            Author.first_name == first_name,
            Author.last_name == last_name,
            Author.affiliation == affiliation,
        )
    ).first()
    if existing_author:
        return existing_author

    author_obj = Author(first_name=first_name, last_name=last_name, affiliation=affiliation)
    session.add(author_obj)
    return author_obj


# Use the current year for every paper - making some assumptions here :)
current_year = datetime.datetime.now().year

# `hit` (not `paper`) so the Response object named `paper` from the earlier cell is not overwritten
for hit in papers.json()["itemHits"]:
    item = hit["item"]
    recent_paper = {
        "doi": item["doi"],
        "title": item["title"],
        "publication_year": current_year,
        "abstract": item["abstract"],
    }

    # Lower-case the keywords and drop repeats (order preserved), so the same Keyword
    # is never linked to one article twice.
    normalized_keywords = dict.fromkeys(keyword.lower() for keyword in item["keywords"])

    # Identify each author by (first name, last name, affiliation); an author with no
    # institution listed gets no affiliation. Repeats are dropped for the same reason as above.
    author_keys = dict.fromkeys(
        (
            author["firstName"],
            author["lastName"],
            author["institutions"][0]["name"] if author["institutions"] else None,
        )
        for author in item["authors"]
    )

    try:
        # One session (and therefore one transaction) per paper: if anything fails, leaving the
        # `with` block closes the session without committing, so no keywords or authors are
        # stored for an article that was never inserted.
        with Session(engine) as session:
            recent_paper["keywords"] = [
                _get_or_create_keyword(session, normalized_keyword)
                for normalized_keyword in normalized_keywords
            ]
            recent_paper["authors"] = [
                _get_or_create_author(session, *author_key)
                for author_key in author_keys
            ]

            # Add the article
            article = Article(**recent_paper)
            session.add(article)
            session.commit()
    except Exception as e:
        # Best effort: report the failure (e.g. an article that is already in the database)
        # and carry on with the next paper.
        print(f"Adding article {recent_paper['doi']} failed with error: {e}")


In [None]:
# Keywords were lower-cased before being stored, so search with a lower-case term
keyword_to_search = "machine learning"

with Session(engine) as session:
    # Look up the Keyword row whose text matches the search term exactly
    statement = select(Keyword).where(Keyword.keyword == keyword_to_search)
    keyword = session.exec(statement).first()

    if keyword is None:
        print(f"No articles found for keyword: {keyword_to_search}")
    else:
        print(f"Found keyword: {keyword.keyword}")
        # Follow the relationship attribute to reach the linked articles
        for article in keyword.articles:
            print(f"DOI: {article.doi}, Title: {article.title}, Year: {article.publication_year}")
