# Setup

## Packages

In [39]:
import pandas as pd
import numpy as np
import yaml
from tqdm import tqdm
import time

# for using configuration files like yamls. This is to help key our keys safe
import hydra
from omegaconf import DictConfig, OmegaConf

import multiprocessing
import requests

import urllib
import os
import json


# unzip files
import gzip
import shutil

# @hydra.main(config_path="../conf", config_name="main", version_base=None)

In [None]:
from semanticscholar import SemanticScholar
sch = SemanticScholar(api_key=keys['s2_api_key'])

In [9]:
with open("../config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

print(config)

with open("../config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

root_path = "/home/nick_lee_berkeley_edu/"
mount_path = os.path.join(root_path, "mount-folder")

{'project_id': 'w210-capstone-ta', 'instance': {'name': 'w210-therapeutic-accelerator', 'region': 'us-central1'}, 'database': {'user': 'postgres', 'password': 'Q]-/difiAH57&pt.', 'name': 'articles'}, 'semantic_scholar': {'latest': 'http://api.semanticscholar.org/datasets/v1/release/latest', 'release': 'http://api.semanticscholar.org/datasets/v1/release', 'papers': 'http://api.semanticscholar.org/datasets/v1/release/latest/dataset/papers'}, 'paths': {'root': '/home/nick_lee_berkeley_edu/', 'mount': 'mount-folder'}}


# Functions

In [42]:
# download paper
def get_paper(url, file_path): 
    """ url to the file and the file_name to download it as """
    if os.path.isfile(file_path) == False:
            try: 
                urllib.request.urlretrieve(url, file_path)
                urllib.request.urlcleanup()
            except: 
                print(f"Error for {file_path}")
        else: 
            print(f"{i}, File Exists")
    

def unzip_file(zip_file_path, ext_file_path): 
    """ make sure the current working directory is set to where you want the files """
    if os.path.isfile(ext_file_path) == False:
        try: 
            with gzip.open(zip_file_path, 'rb') as f_in:
                with open(ext_file_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        except: 
            print(f"Was not able to extract file {ext_file_path}")
    else:
        print(f"{i}, File Exists")

In [None]:
# Multithreading
def download(link, filelocation):
    r = requests.get(link, stream=True)
    with open(filelocation, 'wb') as f:
        for chunk in r.iter_content(1024):
            if chunk:
                f.write(chunk)

def createNewDownloadThread(link, filelocation):
    download_thread = threading.Thread(target=download, args=(link,filelocation))
    download_thread.start()

# for i in range(0,5):
#     file = "C:\\test" + str(i) + ".png"
#     print file
#     createNewDownloadThread("http://stackoverflow.com/users/flair/2374517.png", file)

# Get Papers and latests releases

In [4]:
# Get info about the papers dataset
papers = requests.get(config['semantic_scholar']['papers'],
                      headers={'x-api-key':keys['x-api-key']}).json()

In [1]:
# Get info about the latest release
latest_release = requests.get(config['semantic_scholar']['latest']).json()

# Get info about past releases
dataset_ids = requests.get(config['semantic_scholar']['release']).json()
earliest_release = requests.get(f"http://api.semanticscholar.org/datasets/v1/release/{dataset_ids[0]}").json()

# Download Files
Create the file paths for the zipped and extracted

In [32]:
download_flag = True
extract_flag = False

In [28]:
# create base file names
file_names = [f"papers-part{n}.jsonl.gz" for n in range(len(papers['files']))]

# create zipped file paths
paper_zip = [os.path.join(mount_path, "zipped", f) for f in file_names]

# create extracted file paths
paper_fn = [os.path.join(mount_path, "extracted", f.strip("\.gz")) for f in file_names]

In [36]:
# Takes ~30 minutes
if download_flag == True: 
    with multiprocessing.Pool() as pool:
        pool.starmap(unzip_file, zip(papers['files'], paper_zip), chunksize=10)
    # downloads the files directly into the google cloud bucket
    for i, (url, file_path) in tqdm(enumerate(zip(papers['files'], paper_zip))): 
        if os.path.isfile(file_path) == False:
            try: 
                get_paper(url, file_path)
            except: 
                print(f"Error for {file_path}")
        else: 
            print(f"{i}, File Exists")

0it [00:00, ?it/s]

0, File Exists
1, File Exists
2, File Exists
3, File Exists
4, File Exists
5, File Exists
6, File Exists
7, File Exists
8, File Exists
9, File Exists
10, File Exists
11, File Exists
12, File Exists
13, File Exists
14, File Exists
15, File Exists
16, File Exists
17, File Exists
18, File Exists
19, File Exists
20, File Exists
21, File Exists
22, File Exists
23, File Exists
24, File Exists
25, File Exists
26, File Exists


28it [00:00, 93.47it/s]

Error for /home/nick_lee_berkeley_edu/mount-folder/zipped/papers-part27.jsonl.gz
Error for /home/nick_lee_berkeley_edu/mount-folder/zipped/papers-part28.jsonl.gz


30it [00:00, 32.07it/s]

Error for /home/nick_lee_berkeley_edu/mount-folder/zipped/papers-part29.jsonl.gz





In [None]:
## Multiprocessing to extract multiple files at once
# Takes ~FOREVER minutes
with multiprocessing.Pool() as pool:
    pool.starmap(unzip_file, zip(paper_zip, paper_fn), chunksize=3)

In [None]:
# Takes ~ FOREVER mins
if extract_flag == True: 
    for tqdm(file_path) in paper_zip:
        if os.path.isfile(file_path) == False:
            unzip_file(file_path)
        else: 
            print(f"{i}, File Exists")

## Parse JSON file to upload into database

In [8]:
# takes a long time to load
with open('papers-part0.jsonl', 'r') as json_file:
    json_list = list(json_file)

In [10]:
# turn into json object
test = json.loads(json_list[0])
test2 = json.loads(json_list[1])

test.keys()

dict_keys(['corpusid', 'externalids', 'url', 'title', 'authors', 'venue', 'publicationvenueid', 'year', 'referencecount', 'citationcount', 'influentialcitationcount', 'isopenaccess', 's2fieldsofstudy', 'publicationtypes', 'publicationdate', 'journal', 'updated'])

In [11]:
pd.DataFrame([test2])

Unnamed: 0,corpusid,externalids,url,title,authors,venue,publicationvenueid,year,referencecount,citationcount,influentialcitationcount,isopenaccess,s2fieldsofstudy,publicationtypes,publicationdate,journal,updated
0,158371356,"{'ACL': None, 'DBLP': None, 'ArXiv': None, 'MA...",https://www.semanticscholar.org/paper/8100364c...,Education and the Non-financial Employment Com...,"[{'authorId': '121045716', 'name': 'Simona Dem...",,,2018,21,2,0,False,"[{'category': 'Economics', 'source': 's2-fos-m...",,2018-11-01,"{'name': 'Social Indicators Research', 'pages'...",2022-12-24T20:13:29.662Z


## Upload Data to Postgres DB

In [10]:
# connection libraries
from google.cloud.sql.connector import Connector
import sqlalchemy
import pg8000

In [None]:
INSTANCE_CONNECTION_NAME = f"{config['project_id']}:{config['instance']['region']}:{config['instance']['name']}" # i.e demo-project:us-central1:demo-instance
INSTANCE_CONNECTION_NAME

In [47]:
# Add files to database
# initialize Connector object
connector = Connector()

# function to return the database connection object
# @hydra.main(config_path="../conf", config_name="main", version_base=None)
def getconn():
    INSTANCE_CONNECTION_NAME = f"{config['project_id']}:{config['instance']['region']}:{config['instance']['name']}" # i.e demo-project:us-central1:demo-instance
    DB_USER = config['database']['user']
    DB_PASS = config['database']['password']
    DB_NAME = config['database']['name']
    
    conn = connector.connect(
        INSTANCE_CONNECTION_NAME,
        "pg8000",
        user=DB_USER,
        password=DB_PASS,
        db=DB_NAME
    )
    return conn

In [52]:
# create connection pool with 'creator' argument to our connection object function
pool = sqlalchemy.create_engine(
    "postgresql+pg8000://",
    creator=getconn(),
)

ClientResponseError: 404, message='Not Found', url=URL('https://sqladmin.googleapis.com/sql/v1beta4/projects/w210-capstone-ta/instances/w210-therapeutic-accelerator/connectSettings')

['w210-capstone-ta:us-central1:w210-therapeutic-accelerator']: An error occurred while performing refresh. Scheduling another refresh attempt immediately
Traceback (most recent call last):
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/instance.py", line 388, in _refresh_task
    refresh_data = await refresh_task
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/instance.py", line 312, in _perform_refresh
    metadata = await metadata_task
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/refresh_utils.py", line 101, in _get_metadata
    resp = await client_session.get(url, headers=headers, raise_for_status=True)
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/aiohttp/client.py", line 640, in _request
    resp.raise_for_status()
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/

In [None]:
# for context managment
# build connection
def getconn() -> pg8000.connections.Connection:
    with Connector() as connector:
        conn = connector.connect(
            "project:region:instance",
            "pymysql",
            user="my-user",
            password="my-password",
            db="my-db-name"
        )
    return conn

# create connection pool
pool = sqlalchemy.create_engine(
    "mysql+pymysql://",
    creator=getconn,
)

# insert statement
insert_stmt = sqlalchemy.text(
    "INSERT INTO my_table (id, title) VALUES (:id, :title)",
)

# interact with Cloud SQL database using connection pool
with pool.connect() as db_conn:
    # insert into database
    db_conn.execute(insert_stmt, parameters={"id": "book1", "title": "Book One"})

    # commit transaction (SQLAlchemy v2.X.X is commit as you go)
    db_conn.commit()

    # query database
    result = db_conn.execute(sqlalchemy.text("SELECT * from my_table")).fetchall()

    # Do something with the results
    for row in result:
        print(row)

In [None]:
# import os

# from google.cloud.sql.connector import Connector, IPTypes
# import pg8000

# import sqlalchemy


# def connect_with_connector() -> sqlalchemy.engine.base.Engine:
#     """
#     Initializes a connection pool for a Cloud SQL instance of Postgres.

#     Uses the Cloud SQL Python Connector package.
#     """
#     # Note: Saving credentials in environment variables is convenient, but not
#     # secure - consider a more secure solution such as
#     # Cloud Secret Manager (https://cloud.google.com/secret-manager) to help
#     # keep secrets safe.

#     instance_connection_name = os.environ["INSTANCE_CONNECTION_NAME"]  # e.g. 'project:region:instance'
#     db_user = os.environ["DB_USER"]  # e.g. 'my-db-user'
#     db_pass = os.environ["DB_PASS"]  # e.g. 'my-db-password'
#     db_name = os.environ["DB_NAME"]  # e.g. 'my-database'

#     ip_type = IPTypes.PRIVATE if os.environ.get("PRIVATE_IP") else IPTypes.PUBLIC

#     # initialize Cloud SQL Python Connector object
#     connector = Connector()

#     def getconn() -> pg8000.dbapi.Connection:
#         conn: pg8000.dbapi.Connection = connector.connect(
#             instance_connection_name,
#             "pg8000",
#             user=db_user,
#             password=db_pass,
#             db=db_name,
#             ip_type=ip_type,
#         )
#         return conn

#     # The Cloud SQL Python Connector can be used with SQLAlchemy
#     # using the 'creator' argument to 'create_engine'
#     pool = sqlalchemy.create_engine(
#         "postgresql+pg8000://",
#         creator=getconn,
#         # ...
#     )
#     return pool

In [50]:
from sqlalchemy.orm import Session
from sqlalchemy import select

with pool.connect() as db_conn:
    results = db_conn.execute(sqlalchemy.text("SELECT * FROM semantic")).fetchall()

ClientResponseError: 404, message='Not Found', url=URL('https://sqladmin.googleapis.com/sql/v1beta4/projects/w210-capstone-ta/instances/w210-therapeutic-accelerator/connectSettings')

In [38]:
pool

Engine(postgresql+pg8000://)

In [28]:
# test connection 
# connect to connection pool
with pool.connect() as db_conn:
    # create ratings table in our sandwiches database
    results = db_conn.execute(sqlalchemy.text("SELECT * FROM semantic")).fetchall()
    
    # show results
    for row in results:
        print(row)

usage: ipykernel_launcher.py [--help] [--hydra-help] [--version]
                             [--cfg {job,hydra,all}] [--resolve]
                             [--package PACKAGE] [--run] [--multirun]
                             [--shell-completion] [--config-path CONFIG_PATH]
                             [--config-name CONFIG_NAME]
                             [--config-dir CONFIG_DIR]
                             [--experimental-rerun EXPERIMENTAL_RERUN]
                             [--info [{all,config,defaults,defaults-tree,plugins,searchpath}]]
                             [overrides ...]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

In [55]:
# connect to connection pool
with pool.connect() as db_conn:
  # create ratings table in our sandwiches database
  db_conn.execute(
    sqlalchemy.text(
      "CREATE TABLE IF NOT EXISTS semantic "
      "( id SERIAL NOT NULL, "
      " paper_id VARCHAR(255) NOT NULL, "
      " title VARCHAR(255) NOT NULL, "
      " reference_count integer, "
      " citation_count integer, "
      " embeddings double precision[],"
      "PRIMARY KEY (id));"
    )
  )

  # commit transaction (SQLAlchemy v2.X.X is commit as you go)
  db_conn.commit()

  # insert data into our ratings table
  insert_stmt = sqlalchemy.text(
      "INSERT INTO semantic (paper_id, title, reference_count) VALUES (:paper_id, :title, :reference_count)",
  )

  # insert entries into table
  db_conn.execute(insert_stmt, parameters={"paper_id": "649def34f8be52c8b66281af98ae884c09aef38b", "title": "Construction of the Literature Graph in Semantic Scholar", "reference_count": 27})
  db_conn.execute(insert_stmt, parameters={"paper_id": "f712fab0d58ae6492e3cdfc1933dae103ec12d5d", "title": "Reinfection and low cross-immunity as drivers of epidemic resurgence under high seroprevalence: a model-based approach with application to Amazonas, Brazil", "reference_count": 13})

  # commit transactions
  db_conn.commit()

  # query and fetch ratings table
  results = db_conn.execute(sqlalchemy.text("SELECT * FROM semantic")).fetchall()

  # show results
  for row in results:
    print(row)

# cleanup connector object
connector.close()

ClientResponseError: 404, message='Not Found', url=URL('https://sqladmin.googleapis.com/sql/v1beta4/projects/w210-capstone-ta/instances/w210-therapeutic-accelerator/connectSettings')

['w210-capstone-ta:us-central1:w210-therapeutic-accelerator']: An error occurred while performing refresh. Scheduling another refresh attempt immediately
Traceback (most recent call last):
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/instance.py", line 388, in _refresh_task
    refresh_data = await refresh_task
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/instance.py", line 312, in _perform_refresh
    metadata = await metadata_task
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/google/cloud/sql/connector/refresh_utils.py", line 101, in _get_metadata
    resp = await client_session.get(url, headers=headers, raise_for_status=True)
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/aiohttp/client.py", line 640, in _request
    resp.raise_for_status()
  File "/home/nick_lee_berkeley_edu/anaconda3/lib/python3.10/site-packages/

In [None]:
# filename = 'papers-part0.jsonl'
# import jsonlines

# with jsonlines.open('your-filename.jsonl') as f:

#     for line in f.iter():

#         print line['doi'] # or whatever else you'd like to do

In [3]:
import os
import json
from zipfile import ZipFile

import pandas as pd
import numpy as np

In [9]:
mount_path

'/home/nick_lee_berkeley_edu/mount-folder'

In [15]:
z_files = []
for root, dirs, files in os.walk(mount_path, topdown=False):
    for name in files:
        z_files.append(os.path.join(root, name))

In [16]:
z_files

['/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_04b16e60-4286-40b0-a62f-e8837f4b2aa0.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_100832cd-a187-47dd-b091-29981f6b612c.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_196048cb-e25f-49ff-b821-5e08014f9c2b.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_1d70c526-9586-42f5-a525-5c98c50f0da4.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_3749204d-196a-4f70-8425-636c169e5d14.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_3e5fe054-6c1f-47e9-bbb2-80fef44856c1.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_42f02e7a-6814-43e7-a667-e958bc8daeab.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_49b9cbc6-07bb-4bae-9d68-b1958d5d7fec.gz',
 '/home/nick_lee_berkeley_edu/mount-folder/20230526_112152_00030_xnreq_522379af-d3ac-4b56-a378-c

In [None]:
import gzip
import shutil
with gzip.open(z_files[0], 'rb') as f:
    file_content = f.read()

In [None]:
file_content

In [None]:
# get embeddings
from typing import Dict, List
import json

import requests


URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Splits a longer list to respect batch size"""
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]


SAMPLE_PAPERS = [
    {
        "paper_id": "A",
        "title": "Angiotensin-converting enzyme 2 is a functional receptor for the SARS coronavirus",
        "abstract": "Spike (S) proteins of coronaviruses ...",
    },
    {
        "paper_id": "B",
        "title": "Hospital outbreak of Middle East respiratory syndrome coronavirus",
        "abstract": "Between April 1 and May 23, 2013, a total of 23 cases of MERS-CoV ...",
    },
]


def embed(papers):
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # Allow Python requests to convert the data above to JSON
        response = requests.post(URL, json=chunk)

        if response.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

if __name__ == "__main__":
    all_embeddings = embed(SAMPLE_PAPERS)

    # Prints { 'A': [4.089589595794678, ...], 'B': [-0.15814849734306335, ...] }
    print(all_embeddings)