In [1]:
import pandas as pd
import os
from tqdm import tqdm
import yaml
from sqlalchemy import create_engine
from sqlalchemy.sql import text
import csv
import time
import numpy as np


# Project settings: this notebook reads config["paths"] and config["database"] from it.
# NOTE(review): FullLoader can build more than plain data types. That is fine for
# trusted local files; consider yaml.safe_load if these files could come from elsewhere.
with open("/home/ubuntu/work/therapeutic_accelerator/config/main.yaml") as f:
    config = yaml.load(f, yaml.FullLoader)

# Secrets (database password, API key). The path is relative, so it resolves
# against the notebook's current working directory.
with open("../config/keys.yaml") as f:
    keys = yaml.load(f, yaml.FullLoader)

# <root>/<mount> -- presumably where the storage bucket is mounted; confirm in main.yaml.
bucket_path = os.path.join(*[config['paths'][part] for part in ('root', 'mount')])

# Generic element type used in the annotations of the batching helper.
from typing import TypeVar
T = TypeVar('T')

In [2]:
## Create the SQL database connection
# URL format: 'postgresql://<username>:<password>@<host>:<port>/postgres'
engine = create_engine(f'postgresql://postgres:{keys["postgres"]}@{config["database"]["host"]}:5432/postgres')

# Join every row of `attributes` to its abstract (rows without one are kept, with NULLs).
# USING merges the join key into a single output column. The identifier is quoted
# because Postgres folds unquoted names to lower case, and pandas' to_sql creates
# mixed-case columns as quoted identifiers.
# NOTE(review): assumes `attributes` spells the key exactly "corpusId" as well -- confirm.
sql = '''
    SELECT * FROM attributes LEFT JOIN abstracts USING ("corpusId");
'''

# Open a connection, run the query and materialise the rows *before* the
# connection is released, so the fetch never depends on driver-side buffering.
with engine.connect() as conn:
    query = conn.execute(text(sql))

    # Keep the SQL column names instead of relying on pandas to infer them.
    df = pd.DataFrame(query.fetchall(), columns=list(query.keys()))

In [46]:
from semanticscholar import SemanticScholar

# Semantic Scholar client, authenticated with the 'semantic' entry of the loaded keys
sch = SemanticScholar(api_key=keys["semantic"])

In [6]:
def batched(items: list[T], batch_size: int) -> list[list[T]]:
    """Split a list into consecutive batches for batched API calls.

    Args:
        items (list[T]): Items to split; the order is preserved.
        batch_size (int): Maximum number of items per batch. Must be positive.

    Returns:
        list[list[T]]: Slices of ``items``, each holding ``batch_size`` items
            except possibly the last, which holds the remainder. An empty
            input yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    return [items[start:start + batch_size] for start in range(0, len(items), batch_size)]

In [140]:
# Takes about 30 minutes to run all 700K ids
# NOTE(review): `ids` is not defined in any cell shown above; it must already
# exist in the kernel. Define it explicitly before this cell so the notebook
# survives Restart & Run All.
# newline='' is what the csv module expects for files it writes; utf-8 keeps
# non-ASCII abstract text independent of the machine's locale.
with open('papers_abstracts.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['paperId', 'corpusId', 'abstract']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # ids = [f'PMID:{pmid}' for pmid in pmids]
    # fields = 'corpusId,abstract,pmid'
    batch_size = 500

    # The `tqdm` wrapper displays a progress bar for the loop
    for ids_batch in tqdm(batched(ids, batch_size=batch_size), desc='batch'):
        # Fetch this batch of papers from Semantic Scholar
        papers = sch.get_papers(ids_batch, fields=fieldnames)

        # Nothing came back for this batch: skip it, otherwise the frame would
        # have no 'corpusId' column and set_index would raise KeyError.
        if not papers:
            continue

        # Convert to a dataframe for DB uploading
        results = pd.DataFrame([dict(paper) for paper in papers])

        # Corpus id becomes the index, so it is written as the table's first column
        results = results.set_index('corpusId')

        # Append to the `abstracts` table in our postgres DB (created on first use).
        # engine.begin() commits when the block exits, making the commit explicit.
        with engine.begin() as conn:
            results.to_sql('abstracts', con=conn, if_exists='append', index=True, chunksize=500)

        # Append the batch to the CSV backup in case the DB upload goes wrong.
        # Columns are put back in `fieldnames` order so the rows line up with
        # the header written above (the index would otherwise come first).
        results.reset_index().reindex(columns=fieldnames).to_csv(csvfile, header=False, index=False)

        # sleep to avoid hitting request limits of 5000 requests per five minutes
        # time.sleep(5)

batch:   0%|          | 0/1460 [00:00<?, ?it/s]

batch: 100%|██████████| 1460/1460 [32:03<00:00,  1.32s/it]


# Test that it uploaded

Check that the `abstracts` table was created and populated by reading it back from the database.

In [141]:
# Read the whole `abstracts` table back to confirm the upload landed.
sql = '''
    SELECT * FROM abstracts;
'''

with engine.connect() as conn:
    query = conn.execute(text(sql))

    # Materialise the rows while the connection is still open, and carry the
    # SQL column names over to the dataframe.
    df = pd.DataFrame(query.fetchall(), columns=list(query.keys()))

# (rows, columns) of the uploaded table
df.shape

(729817, 4)

In [144]:
# Back up the abstracts table as CSV.
# index=False: the dataframe's positional index is not part of the table, so
# writing it would only add a spurious unnamed column to the backup.
df.to_csv("/home/ubuntu/work/therapeutic_accelerator/db_work/papers_abstracts_backup.csv", index=False)

Check connection between attributes and abstracts