# Setup

## Packages

In [16]:
import pandas as pd
import numpy as np

from tqdm import tqdm
import time

# for using configuration files like yamls. This is to help key our keys safe
import yaml # for configuration files 
import hydra
from omegaconf import DictConfig, OmegaConf

import multiprocessing
import requests

import urllib
import os
import json


# unzip files
import gzip
import shutil

# @hydra.main(config_path="../conf", config_name="main", version_base=None)

In [2]:
with open("../config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

with open("../config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

root_path = "/home/nick_lee_berkeley_edu/"
mount_path = os.path.join(root_path, "mount-folder")

In [None]:
from semanticscholar import SemanticScholar
sch = SemanticScholar(api_key=keys['s2_api_key'])

# Functions

In [None]:
# download paper
def get_paper(url, file_path): 
    """ url to the file and the file_name to download it as """
    if os.path.isfile(file_path) == False:
            try: 
                urllib.request.urlretrieve(url, file_path)
                urllib.request.urlcleanup()
            except: 
                print(f"Error for {file_path}")
        else: 
            print(f"{i}, File Exists")
    

def unzip_file(zip_file_path, ext_file_path): 
    """ make sure the current working directory is set to where you want the files """
    if os.path.isfile(ext_file_path) == False:
        try: 
            with gzip.open(zip_file_path, 'rb') as f_in:
                with open(ext_file_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
        except: 
            print(f"Was not able to extract file {ext_file_path}")
    else:
        print(f"{i}, File Exists")

In [None]:
# Multithreading
def download(link, filelocation):
    r = requests.get(link, stream=True)
    with open(filelocation, 'wb') as f:
        for chunk in r.iter_content(1024):
            if chunk:
                f.write(chunk)

def createNewDownloadThread(link, filelocation):
    download_thread = threading.Thread(target=download, args=(link,filelocation))
    download_thread.start()

# for i in range(0,5):
#     file = "C:\\test" + str(i) + ".png"
#     print file
#     createNewDownloadThread("http://stackoverflow.com/users/flair/2374517.png", file)

## Get Papers and latests releases

In [None]:
# Get info about the papers dataset
papers = requests.get(config['semantic_scholar']['papers'],
                      headers={'x-api-key':keys['x-api-key']}).json()

In [None]:
# Get info about the latest release
latest_release = requests.get(config['semantic_scholar']['latest']).json()

# Get info about past releases
dataset_ids = requests.get(config['semantic_scholar']['release']).json()
earliest_release = requests.get(f"http://api.semanticscholar.org/datasets/v1/release/{dataset_ids[0]}").json()

## Download Files
Create the file paths for the zipped and extracted

In [None]:
download_flag = False
extract_flag = False

In [None]:
# create base file names
file_names = [f"papers-part{n}.jsonl.gz" for n in range(len(papers['files']))]

# create zipped file paths
paper_zip = [os.path.join(mount_path, "zipped", f) for f in file_names]

# create extracted file paths
paper_fn = [os.path.join(mount_path, "extracted", f.strip("\.gz")) for f in file_names]

In [None]:
# Takes ~30 minutes
if download_flag == True: 
    with multiprocessing.Pool() as pool:
        pool.starmap(unzip_file, zip(papers['files'], paper_zip), chunksize=10)
    # downloads the files directly into the google cloud bucket
    for i, (url, file_path) in tqdm(enumerate(zip(papers['files'], paper_zip))): 
        if os.path.isfile(file_path) == False:
            try: 
                get_paper(url, file_path)
            except: 
                print(f"Error for {file_path}")
        else: 
            print(f"{i}, File Exists")

In [None]:
## Multiprocessing to extract multiple files at once
# Takes ~FOREVER minutes
if extract_flag == True: 
    with multiprocessing.Pool() as pool:
        pool.starmap(unzip_file, zip(paper_zip, paper_fn), chunksize=3)

In [None]:
# Takes ~ FOREVER mins
if extract_flag == True: 
    for tqdm(file_path) in paper_zip:
        if os.path.isfile(file_path) == False:
            unzip_file(file_path)
        else: 
            print(f"{i}, File Exists")

## Parse JSON file to upload into database

In [5]:
import glob

In [20]:
attribute_files = glob.glob("".join([mount_path, '/extracted/*?.jsonl']))
attribute_files

['/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part0.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part1.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part10.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part11.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part12.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part2.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part20.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part21.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part22.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part3.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part4.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part5.jsonl',
 '/home/nick_lee_berkeley_edu/mount-folder/extracted/papers-part6.jsonl',
 '/home/nick_lee_berkeley_edu/mo

In [40]:
len(json_list)

6971179

In [23]:
def json_to_df(j): 
    """ Create dataframe to upload into database """
    return pd.DataFrame([json.loads(j)])

## Upload Data to Postgres DB

### Create connection to database

In [5]:
from sqlalchemy.orm import Session
from sqlalchemy import select, Table, Column, Integer, String, Boolean, MetaData
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.types import ARRAY

In [3]:
# connection libraries
from google.cloud.sql.connector import Connector, IPTypes
import pg8000
import sqlalchemy

# connect to goolge cloud postgres db
def connect_with_connector() -> sqlalchemy.engine.base.Engine:
    """
    Initializes a connection pool for a Cloud SQL instance of Postgres.

    Uses the Cloud SQL Python Connector package.
    """
    # Note: Saving credentials in environment variables is convenient, but not
    # secure - consider a more secure solution such as
    # Cloud Secret Manager (https://cloud.google.com/secret-manager) to help
    # keep secrets safe.

    instance_connection_name = config['database']['connection'] # i.e demo-project:us-central1:demo-instance
    db_user = config['database']['user']
    db_pass = config['database']['password']
    db_name = config['database']['name']

    ip_type = IPTypes.PRIVATE # if os.environ.get("PRIVATE_IP") else IPTypes.PUBLIC

    # initialize Cloud SQL Python Connector object
    connector = Connector()

    def getconn() -> pg8000.dbapi.Connection:
        conn: pg8000.dbapi.Connection = connector.connect(
            instance_connection_name,
            "pg8000",
            db=db_name,
            user=db_user,
            password = db_pass,
            # enable_iam_auth=True,
            ip_type=ip_type
        )
        return conn

    # The Cloud SQL Python Connector can be used with SQLAlchemy
    # using the 'creator' argument to 'create_engine'
    pool = sqlalchemy.create_engine(
        "postgresql+pg8000://",
        creator=getconn,
        # [START_EXCLUDE]
        # Pool size is the maximum number of permanent connections to keep.
        pool_size=5,
        # Temporarily exceeds the set pool_size if no connections are available.
        max_overflow=2,
        # The total number of concurrent connections for your application will be
        # a total of pool_size and max_overflow.
        # 'pool_timeout' is the maximum number of seconds to wait when retrieving a
        # new connection from the pool. After the specified amount of time, an
        # exception will be thrown.
        pool_timeout=30,  # 30 seconds
        # 'pool_recycle' is the maximum number of seconds a connection can persist.
        # Connections that live longer than the specified amount of time will be
        # re-established
        pool_recycle=1800,  # 30 minutes
        # [END_EXCLUDE]
    )
    return pool

pool = connect_with_connector()
pool.connect()

<sqlalchemy.engine.base.Connection at 0x7f9e2cba2260>

In [None]:
meta = MetaData()

articles = Table(
    'article_attributes', meta,
    Column('corpusid', Integer, primary_key = True),
    Column('externalids', String),
    Column('url', String),
    Column('title', Integer),
    Column('authors', ARRAY(JSONB)),
    Column('venue', String),
    Column('publicationvenueid', Integer),
    Column('year', String),
    Column('referencecount', Integer),
    Column('citationcount', Integer),
    Column('influentialcitationcount', Integer),
    Column('isopenaccess', Boolean),
    Column('s2fieldsofstudy', ARRAY(JSONB)),
    Column('publicationtypes', Integer),
    Column('publicationdate', Integer),
    Column('journal', JSONB),
    Column('updated', String)    
)

# create table in database
meta.create_all(pool)

In [7]:
def df_to_db(df): 
    with pool.connect() as db_conn:
        df.to_sql('article_attributes', con = db_conn, if_exists='append', index = False)

In [8]:
# import orjson # for faster reading of json
import jsonlines # for opening jsonl files

In [9]:
def preprocess_df(df): 
    df.year = df.year.astype("Int64")
    return df

In [None]:
# test connection 
# connect to connection pool
with pool.connect() as db_conn:
    # create ratings table in our sandwiches database
    results = db_conn.execute(sqlalchemy.text("SELECT * FROM article_attributes")).fetchall()
    
    # # show results
    # for row in results:
    #     print(row)

In [None]:
import os
import json
from zipfile import ZipFile

import pandas as pd
import numpy as np

In [None]:
z_files = []
for root, dirs, files in os.walk(mount_path, topdown=False):
    for name in files:
        z_files.append(os.path.join(root, name))

In [None]:
z_files

In [None]:
import gzip
import shutil
with gzip.open(z_files[0], 'rb') as f:
    file_content = f.read()

In [None]:
file_content

In [None]:
# get embeddings
from typing import Dict, List
import json

import requests


URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Splits a longer list to respect batch size"""
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]


SAMPLE_PAPERS = [
    {
        "paper_id": "A",
        "title": "Angiotensin-converting enzyme 2 is a functional receptor for the SARS coronavirus",
        "abstract": "Spike (S) proteins of coronaviruses ...",
    },
    {
        "paper_id": "B",
        "title": "Hospital outbreak of Middle East respiratory syndrome coronavirus",
        "abstract": "Between April 1 and May 23, 2013, a total of 23 cases of MERS-CoV ...",
    },
]


def embed(papers):
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # Allow Python requests to convert the data above to JSON
        response = requests.post(URL, json=chunk)

        if response.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

if __name__ == "__main__":
    all_embeddings = embed(SAMPLE_PAPERS)

    # Prints { 'A': [4.089589595794678, ...], 'B': [-0.15814849734306335, ...] }
    print(all_embeddings)