# Setup

## Packages

In [1]:
# Standard library
import json
import multiprocessing
import os
import threading
import time
import urllib
import urllib.request

# unzip files
import gzip
import shutil

# Third-party
import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

# for using configuration files like yamls. This is to help keep our keys safe
import yaml # for configuration files 
import hydra
from omegaconf import DictConfig, OmegaConf

# @hydra.main(config_path="../conf", config_name="main", version_base=None)

In [4]:
# Load the project settings and the API keys from YAML files stored next to the notebook
# directory, so that secrets are not written into the notebook itself.
# NOTE(review): yaml.load with FullLoader can construct more than plain data types; if
# these files only hold simple mappings, yaml.safe_load would be the safer call — confirm.
with open("../config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

with open("../config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

# Directories used by the rest of the notebook, all derived from the configured root.
root_path = config['paths']['root']
mount_path = os.path.join(root_path, "mount-folder")
working_path = os.path.join(root_path, 'therapeutic_accelerator/db_work')

In [3]:
from semanticscholar import SemanticScholar
# Semantic Scholar client authenticated with the key read from keys.yaml.
# NOTE(review): the raw requests call later in this notebook reads keys['x-api-key']
# rather than keys['s2_api_key'] — confirm that both entries exist in keys.yaml.
sch = SemanticScholar(api_key=keys['s2_api_key'])

## Functions

In [None]:
# download paper
def get_paper(url, file_path):
    """Download ``url`` to ``file_path`` unless that file already exists.

    The data is first written to ``file_path + ".part"`` and only renamed to
    ``file_path`` once the transfer has finished. An interrupted download
    therefore never leaves a truncated file that a later run would skip as
    "already downloaded".

    Parameters
    ----------
    url : str
        Location of the file to fetch.
    file_path : str
        Destination path on disk.

    Returns
    -------
    None
        Failures are reported with ``print`` instead of being raised, so a
        batch of downloads keeps going when one file fails.
    """
    if os.path.isfile(file_path):
        print(f"{file_path}, File Exists")
        return

    partial_path = f"{file_path}.part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        # The rename happens only after a complete transfer.
        os.replace(partial_path, file_path)
    except Exception as err:
        print(f"Error for {file_path}: {err}")
        if os.path.isfile(partial_path):
            os.remove(partial_path)
    finally:
        urllib.request.urlcleanup()
    

def unzip_file(zip_file_path, ext_file_path):
    """Decompress the gzip file ``zip_file_path`` into ``ext_file_path``.

    Relative paths are resolved against the current working directory, so make
    sure it is set to where you want the files.

    Nothing is done when ``ext_file_path`` already exists. The output is
    written to ``ext_file_path + ".part"`` and renamed only after the whole
    archive has been decompressed, so a failed extraction does not leave a
    truncated file that later runs would treat as finished.

    Parameters
    ----------
    zip_file_path : str
        Path of the ``.gz`` file to read.
    ext_file_path : str
        Path of the decompressed file to create.

    Returns
    -------
    None
        Failures are reported with ``print`` instead of being raised.
    """
    if os.path.isfile(ext_file_path):
        print(f"{ext_file_path}, File Exists")
        return

    partial_path = f"{ext_file_path}.part"
    try:
        with gzip.open(zip_file_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, ext_file_path)
    except Exception as err:
        print(f"Was not able to extract file {ext_file_path}: {err}")
        if os.path.isfile(partial_path):
            os.remove(partial_path)

In [None]:
# Multithreading
def download(link, filelocation, timeout=60):
    """Stream the content at ``link`` into the file ``filelocation``.

    Parameters
    ----------
    link : str
        URL to fetch.
    filelocation : str
        Path of the file to write (opened in binary write mode).
    timeout : float, optional
        Seconds to wait for the server before giving up; without a timeout a
        stalled connection would block the calling thread indefinitely.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. This is checked before
        the file is opened, so an error page is never saved as the download.
    """
    # Using the response as a context manager releases the connection when done.
    with requests.get(link, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(filelocation, 'wb') as f:
            for chunk in r.iter_content(1024):
                if chunk:
                    f.write(chunk)

def createNewDownloadThread(link, filelocation):
    """Start a background thread that runs ``download(link, filelocation)``.

    Parameters
    ----------
    link : str
        URL to fetch.
    filelocation : str
        Path of the file to write.

    Returns
    -------
    threading.Thread
        The started thread, so callers can ``join()`` it to wait for the
        download to finish. Callers that ignore the return value behave as
        before.
    """
    download_thread = threading.Thread(target=download, args=(link, filelocation))
    download_thread.start()
    return download_thread

# for i in range(0,5):
#     file = "C:\\test" + str(i) + ".png"
#     print file
#     createNewDownloadThread("http://stackoverflow.com/users/flair/2374517.png", file)

## Get papers and latest releases

In [None]:
# Get info about the papers dataset
# NOTE(review): this reads keys['x-api-key'] while the SemanticScholar client cell reads
# keys['s2_api_key'] — confirm that keys.yaml defines both entries.
papers_response = requests.get(config['semantic_scholar']['papers'],
                               headers={'x-api-key': keys['x-api-key']},
                               timeout=60)
# Fail here on an HTTP error instead of later with a KeyError on papers['files'].
papers_response.raise_for_status()
papers = papers_response.json()

In [None]:
def _get_json(url, timeout=60):
    """Return the decoded JSON body of a GET to ``url``; raise on an HTTP error status."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.json()


# Get info about the latest release
latest_release = _get_json(config['semantic_scholar']['latest'])

# Get info about past releases
dataset_ids = _get_json(config['semantic_scholar']['release'])
# The first entry of the release list is requested over https rather than plain http.
earliest_release = _get_json(f"https://api.semanticscholar.org/datasets/v1/release/{dataset_ids[0]}")

## Download Files
Create the file paths for the zipped and extracted files.

In [None]:
# Switches for the expensive cells below; both steps are skipped by default.
# Set a flag to True to run the download or the extraction step.
download_flag = extract_flag = False

In [None]:
# create base file names
file_names = [f"papers-part{n}.jsonl.gz" for n in range(len(papers['files']))]

# create zipped file paths
paper_zip = [os.path.join(mount_path, "zipped", f) for f in file_names]

# create extracted file paths
# splitext drops exactly the trailing ".gz". str.strip would instead remove any of the
# given characters from both ends of the name, which only worked here by coincidence.
paper_fn = [os.path.join(mount_path, "extracted", os.path.splitext(f)[0]) for f in file_names]

In [None]:
# Takes ~30 minutes
if download_flag:
    download_jobs = list(zip(papers['files'], paper_zip))

    # downloads the files directly into the google cloud bucket
    # The workers run get_paper (url -> zipped path); unzip_file does not belong in this step.
    # The pool is not named `pool` because that name is used for the DB engine further down.
    with multiprocessing.Pool() as download_pool:
        download_pool.starmap(get_paper, download_jobs, chunksize=10)

    # Sequential second pass for any file that is still missing after the parallel run.
    for url, file_path in tqdm(download_jobs, desc="retrying missing files"):
        if not os.path.isfile(file_path):
            get_paper(url, file_path)

In [None]:
## Multiprocessing to extract multiple files at once
# Takes ~FOREVER minutes
if extract_flag:
    # The worker pool is not named `pool`: that global is used further down for the
    # database engine, and rebinding it here would break those cells if this one is
    # run after the connection cell.
    with multiprocessing.Pool() as extract_pool:
        extract_pool.starmap(unzip_file, zip(paper_zip, paper_fn), chunksize=3)

In [None]:
# Takes ~ FOREVER mins
# Sequential alternative to the multiprocessing cell: extract one archive at a time.
if extract_flag:
    for zip_path, ext_path in tqdm(zip(paper_zip, paper_fn), total=len(paper_zip)):
        # Skip archives whose extracted file is already present.
        if os.path.isfile(ext_path):
            continue
        unzip_file(zip_path, ext_path)

## Parse JSON file to upload into database

In [5]:
import glob
# Every extracted .jsonl file, in sorted order: glob returns matches in arbitrary
# filesystem order, which would make the upload order differ between runs.
attribute_files = sorted(glob.glob(os.path.join(mount_path, 'extracted', '*?.jsonl')))
attribute_files

['/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part0.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part1.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part10.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part11.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part12.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part2.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part20.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part21.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part22.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part3.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part4.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part5.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part6.jsonl',
 '/home/nick_lee_berkeley_edu/mo

In [23]:
def json_to_df(j):
    """Parse one JSON document and wrap it in a single-row DataFrame for upload.

    Parameters
    ----------
    j : str
        JSON text, e.g. one line of a ``.jsonl`` file.

    Returns
    -------
    pandas.DataFrame
        A frame built from a one-element list holding the parsed value.
    """
    parsed_rows = [json.loads(j)]
    frame = pd.DataFrame(parsed_rows)
    return frame

## Upload Data to Postgres DB

### Create connection to database

In [10]:
# create connection and add "pool" object to environment. Used for connecting to DB
# NOTE(review): exec() runs the whole script inside the notebook namespace, so every name
# it defines becomes a global here. Presumably that is also where MetaData, Table, Column
# and sqlalchemy come from, since no cell imports them — confirm. An explicit import of the
# module would make those dependencies visible.
with open(os.path.join(working_path, 'sql_db_connection.py')) as connection_script:
    # The with-block closes the script file; the bare open(...).read() left it open.
    exec(connection_script.read())
pool

Engine(postgresql+pg8000://)

In [None]:
# Schema of the table that receives the Semantic Scholar paper attributes.
# Column types follow the values visible in the sample rows displayed later in this
# notebook: titles are text, publication venue ids look like UUID strings, publication
# types are arrays of labels, publication dates are 'YYYY-MM-DD' strings and years are
# integers (preprocess_df casts `year` to a nullable integer before upload).
meta = MetaData()

articles = Table(
    'article_attributes', meta,
    Column('corpusid', Integer, primary_key = True),
    # NOTE(review): the sample rows show a JSON object here; JSONB (as used for
    # `journal`) may be the better type — confirm how the driver binds the value.
    Column('externalids', String),
    Column('url', String),
    Column('title', String),                    # was Integer; titles are text
    Column('authors', ARRAY(JSONB)),
    Column('venue', String),
    Column('publicationvenueid', String),       # was Integer; values look like UUIDs
    Column('year', Integer),                    # was String; values are integers
    Column('referencecount', Integer),
    Column('citationcount', Integer),
    Column('influentialcitationcount', Integer),
    Column('isopenaccess', Boolean),
    Column('s2fieldsofstudy', ARRAY(JSONB)),
    Column('publicationtypes', ARRAY(String)),  # was Integer; e.g. {JournalArticle,CaseReport}
    Column('publicationdate', String),          # was Integer; e.g. 2019-07-01 or missing
    Column('journal', JSONB),
    Column('updated', String)
)

# create table in database
# NOTE(review): if article_attributes already exists with the old column types, this
# call is expected to leave it untouched — confirm whether the table must be migrated.
meta.create_all(pool)

In [7]:
def df_to_db(df):
    """Append the rows of ``df`` to the ``article_attributes`` table.

    A connection is taken from the global ``pool`` for the duration of the
    write; the DataFrame index is not stored.
    """
    with pool.connect() as connection:
        df.to_sql(
            name='article_attributes',
            con=connection,
            if_exists='append',
            index=False,
        )

In [1]:
import orjson # for faster reading of json
import jsonlines # for opening jsonl files

In [9]:
def preprocess_df(df):
    """Cast the ``year`` column to pandas' nullable ``Int64`` dtype.

    The frame is modified in place and the same object is returned, so missing
    years become ``<NA>`` instead of turning the column into floats.
    """
    nullable_year = df.year.astype("Int64")
    df["year"] = nullable_year
    return df

In [None]:
# Read each extracted .jsonl file and append its records to the database.
# Records are buffered and written in batches: building one DataFrame and opening one
# connection per paper makes the upload dominated by per-row overhead.
UPLOAD_BATCH_SIZE = 10_000

for jl in tqdm(attribute_files):
    with jsonlines.open(jl) as f:
        batch = []
        for line in tqdm(f.iter()):
            batch.append(line)
            if len(batch) >= UPLOAD_BATCH_SIZE:
                df_to_db(preprocess_df(pd.DataFrame(batch)))
                batch = []
        # Write whatever is left once the file is exhausted.
        if batch:
            df_to_db(preprocess_df(pd.DataFrame(batch)))

In [11]:
# test connection 
# connect to connection pool
with pool.connect() as db_conn:
    # fetch the first 10 rows of article_attributes to confirm the upload is readable
    results = db_conn.execute(sqlalchemy.text("SELECT * FROM article_attributes LIMIT 10")).fetchall()
    
    # # show results
    # for row in results:
    #     print(row)

In [13]:
# Display the fetched sample rows as a DataFrame for a quick visual check.
pd.DataFrame(results)

Unnamed: 0,corpusid,externalids,url,title,authors,venue,publicationvenueid,year,referencecount,citationcount,influentialcitationcount,isopenaccess,s2fieldsofstudy,publicationtypes,publicationdate,journal,updated
0,208034569,"{""ACL"": null, ""DBLP"": ""conf/igarss/Zakharova19...",https://www.semanticscholar.org/paper/bd4f0fb0...,Changes of Scattering Mechanisms in Boreal For...,"{""{\""authorId\"": \""144748888\"", \""name\"": \""L....",IEEE International Geoscience and Remote Sensi...,a47b9394-c5c7-4bc8-b8fc-b08f96954278,2019,9,0,0,False,"{""{\""category\"": \""Environmental Science\"", \""...",{JournalArticle},2019-07-01,"{""name"": ""IGARSS 2019 - 2019 IEEE Internationa...",2023-01-21T03:56:34.542Z
1,30128537,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/55bd2815...,Dermacase. Milia.,"{""{\""authorId\"": \""2070509491\"", \""name\"": \""T...",Canadian family physician Medecin de famille c...,,1998,0,0,0,False,,"{JournalArticle,CaseReport}",,"{""name"": ""Canadian family physician Medecin de...",2023-01-21T03:56:31.171Z
2,245268758,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/e0548825...,The Rights of the Elderly,"{""{\""authorId\"": \""2145489026\"", \""name\"": \""H...",China’s Path of Human Rights Development,,2021,0,0,0,False,,,2021-08-05,"{""name"": ""China\u2019s Path of Human Rights De...",2023-01-21T03:56:33.199Z
3,34122848,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/6a47d6ea...,[Efficacy of human milk].,"{""{\""authorId\"": \""2115070676\"", \""name\"": \""E...",Münchener medizinische Wochenschrift,25310608-131d-49b5-93f8-7e0174df9c1b,1950,0,0,0,False,,{JournalArticle},1950-06-23,"{""name"": ""Munchener medizinische Wochenschrift...",2023-01-21T03:56:33.201Z
4,29204384,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/83ec4fdf...,[COMPARATIVE CLINICAL EVALUATION OF VARIOUS AN...,"{""{\""authorId\"": \""102877137\"", \""name\"": \""N....",Vestnik otorinolaringologii,ab42ceb6-7f06-468e-82cf-72fedf491358,1965,0,0,0,False,"{""{\""category\"": \""Medicine\"", \""source\"": \""s...",{JournalArticle},,"{""name"": ""Vestnik otorinolaringologii"", ""pages...",2023-01-21T03:57:18.513Z
5,9562,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/1e1ca5b5...,[Changes in the myocardium following multiple ...,"{""{\""authorId\"": \""13058405\"", \""name\"": \""V. ...",Kardiologiia,2033ee0d-05f4-4055-b85d-ea080e0b37e6,1968,0,0,0,False,"{""{\""category\"": \""Medicine\"", \""source\"": \""s...",{JournalArticle},,"{""name"": ""Kardiologiia"", ""pages"": ""\n ...",2023-01-21T03:57:19.030Z
6,109830986,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/8e2e6a69...,Capacitance Voltage Characteristics Of Stearic...,"{""{\""authorId\"": \""2068848556\"", \""name\"": \""S...",1991 Annual Report. Conference on Electrical I...,,1991,8,0,0,False,"{""{\""category\"": \""Physics\"", \""source\"": \""s2...",{Conference},1991-10-20,"{""name"": ""1991 Annual Report. Conference on El...",2023-01-21T03:57:19.048Z
7,238781607,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/69cbf6dd...,O perfil do profissional fisioterapeuta atuant...,"{""{\""authorId\"": \""74717662\"", \""name\"": \""Jos...","Research, Society and Development",9a981482-f104-474d-8fcd-f5d55cf4aace,2021,16,1,0,False,,,2021-08-02,"{""name"": ""Research, Society and Development"", ...",2023-01-21T03:57:19.106Z
8,226239949,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/104e74b9...,Letter to the editor following “Gastroesophage...,"{""{\""authorId\"": \""14041115\"", \""name\"": \""A. ...",Obesity Surgery,40bf9142-00ba-45cf-b914-6575c40bb357,2020,7,0,0,False,"{""{\""category\"": \""Medicine\"", \""source\"": \""s...","{LettersAndComments,Review}",2020-11-03,"{""name"": ""Obesity Surgery"", ""pages"": ""1846 - 1...",2023-01-21T03:57:19.642Z
9,245947163,"{""ACL"": null, ""DBLP"": null, ""ArXiv"": null, ""MA...",https://www.semanticscholar.org/paper/d791f96a...,Verbal Representation of Emotions in Komi Comm...,"{""{\""authorId\"": \""120660117\"", \""name\"": \""N....",Human. Culture. Education,9d5b8a82-7d39-415e-84ed-5db8e10251ea,2021,0,0,0,False,,,,"{""name"": ""Human Culture Education"", ""pages"": n...",2023-01-21T03:56:34.498Z


In [None]:
import os
import json
from zipfile import ZipFile

import pandas as pd
import numpy as np

In [None]:
# Full path of every file found anywhere below mount_path (deepest folders first).
z_files = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(mount_path, topdown=False)
    for filename in filenames
]

In [None]:
import gzip
import shutil
# Decompress the first file found by the directory walk and keep its bytes in memory.
# NOTE(review): z_files holds every file under mount_path, so z_files[0] is presumably
# not guaranteed to be a .gz archive — confirm. f.read() also loads the whole
# decompressed content at once.
with gzip.open(z_files[0], 'rb') as f:
    file_content = f.read()

In [None]:
# get embeddings
from typing import Dict, List
import json

import requests


# Endpoint that receives the embedding requests, and the default batch size per request.
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Yield consecutive slices of ``lst`` holding at most ``chunk_size`` items each."""
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start : start + chunk_size] for start in starts)


# Two example papers used to try out embed(); each entry is a dict with a
# "paper_id", a "title" and an "abstract".
SAMPLE_PAPERS = [
    {
        "paper_id": "A",
        "title": "Angiotensin-converting enzyme 2 is a functional receptor for the SARS coronavirus",
        "abstract": "Spike (S) proteins of coronaviruses ...",
    },
    {
        "paper_id": "B",
        "title": "Hospital outbreak of Middle East respiratory syndrome coronavirus",
        "abstract": "Between April 1 and May 23, 2013, a total of 23 cases of MERS-CoV ...",
    },
]


def embed(papers, timeout=60):
    """Request an embedding for every paper and return them keyed by paper id.

    Parameters
    ----------
    papers : list of dict
        Papers to embed; they are sent to ``URL`` as JSON in batches produced
        by ``chunks``.
    timeout : float, optional
        Seconds to wait for each request. Without a timeout a stalled
        connection would block the notebook indefinitely.

    Returns
    -------
    Dict[str, List[float]]
        Maps each ``paper_id`` in the responses to its ``embedding``.

    Raises
    ------
    RuntimeError
        If any request returns a status code other than 200.
    """
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # Allow Python requests to convert the data above to JSON
        response = requests.post(URL, json=chunk, timeout=timeout)

        if response.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

# Demo run: embed the sample papers and print the result.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so this guard
# does not stop the request from being sent when the cell is run — confirm this is intended.
if __name__ == "__main__":
    all_embeddings = embed(SAMPLE_PAPERS)

    # Prints { 'A': [4.089589595794678, ...], 'B': [-0.15814849734306335, ...] }
    print(all_embeddings)