# Setup

In [1]:
import os
import pandas as pd
import numpy as np
import json

from tqdm.auto import tqdm
import time

In [3]:
import yaml
with open("../config/main.yaml", "r") as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

with open("../config/keys.yaml", "r") as f:
    keys = yaml.load(f, Loader=yaml.FullLoader)

root_path = config['paths']['root']
mount_path = os.path.join(root_path, config['paths']['papers'])
working_path = os.path.join(root_path, config['paths']['wkdir'])

# Get Full Text

In [4]:
import requests
# full text
# follow s3 link
full_text = requests.get("http://api.semanticscholar.org/datasets/v1/release/latest/dataset/s2orc",
                      headers={'x-api-key':keys['x-api-key']}).json()

{'name': 's2orc',
 'description': 'Full-body paper text parsed from open-access PDFs. Identifies structural elements such as paragraphs, sections, and bibliography entries.\n5M records in 30 4GB files.',
 'README': 'Semantic Scholar Academic Graph Datasets\n\nThe "s2orc" dataset contains parsed full-body text from selected papers.\n\nA subset of this data was previously released (in a different format) as S2ORC https://github.com/allenai/s2orc\n\nThe body text is parsed from PDF documents using Grobid, documented at https://grobid.readthedocs.io.\nIts output is converted from XML into a single string with a set of annotation spans.\n\nSCHEMA\n - externalIds: IDs of this paper in different catalogs\n - content:\n   - source:\n\t   - pdfUrls: URLs to the PDF\n\t   - oaInfo: license/url/status information from Unpaywall\n   - text: Full body text as a single string\n   - annotations: Annotated spans of the full body text\n\n\nLICENSE\nThis collection is licensed under ODC-BY. (https://ope

In [80]:
# Attributes
# follow s3 link
attributes = requests.get("http://api.semanticscholar.org/datasets/v1/release/latest/dataset/papers",
                      headers={'x-api-key':keys['x-api-key']}).json()

In [8]:
# download text
r = requests.get(full_text['files'][0], allow_redirects=True)

In [84]:
with open('facebook.ico', 'wb')

<Response [200]>

# Parse Text

In [3]:
# Get all file paths. It is in attributes but it is the full papers
import glob
attribute_files = glob.glob("".join([mount_path, '/*?[!.gz|!.zip]']))
print(len(attribute_files))
attribute_files

29


['/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_0cadbe3e-d976-4ddd-a419-5338f4bdb9b9',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_1cf93f6a-2a54-4761-a6e2-be64df72abfd',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_277c5193-816a-41df-914a-63ce8739d3db',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_2867ec04-ebb0-4c0a-9cca-bab125db6b3a',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_3090aa34-c242-40a3-af8f-ddd0a091fd06',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_34b5fd04-aadf-4596-9d88-04fb2446f3ee',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_3dcdac3f-ae5e-40c5-b3aa-839df4c9fcab',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_3e82aa4a-c1aa-4a75-8ff8-8187d98a6091',
 '/home/nick_lee_berkeley_edu/mount-attributes/20230609_070343_00047_bkw4d_4917e161-6ddb

In [15]:
with open(attribute_files[0], 'r') as f: 
    f.seek(0)
    data = f.read(2000 - 0)

In [13]:
import jsonlines
temp_df = pd.DataFrame()
with jsonlines.open(attribute_files[0]) as f:
    count = 0
    for line in tqdm(f.iter()): 
        if count == 100: 
            break
        else: 
            temp_df = pd.concat([temp_df,pd.DataFrame([line])]) # reads json, converts to dataframe, preprocess functions and appends results to database
            count += 1

0it [00:00, ?it/s]

In [21]:
temp_df.reset_index(inplace=True, drop=True)

In [69]:
tdf_2 = temp_df.query('isopenaccess == True')

In [74]:
tdf_2.url.str.rsplit('/')

67    [https:, , www.semanticscholar.org, paper, 5f7...
93    [https:, , www.semanticscholar.org, paper, 2f6...
Name: url, dtype: object

In [44]:
papers = json.loads(temp_df.to_json(orient = 'records'))

In [48]:
# get embeddings
from typing import Dict, List
import json
import requests

URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Splits a longer list to respect batch size"""
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]


# SAMPLE_PAPERS = [
#     {
#         "paper_id": "A",
#         "title": "Angiotensin-converting enzyme 2 is a functional receptor for the SARS coronavirus",
#         "abstract": "Spike (S) proteins of coronaviruses ...",
#     },
#     {
#         "paper_id": "B",
#         "title": "Hospital outbreak of Middle East respiratory syndrome coronavirus",
#         "abstract": "Between April 1 and May 23, 2013, a total of 23 cases of MERS-CoV ...",
#     },
# ]


def embed(papers):
    embeddings_by_paper_id: Dict[str, List[float]] = {}

    for chunk in chunks(papers):
        # Allow Python requests to convert the data above to JSON
        # print(chunk)
        response = requests.post(URL, json=chunk)

        if response.status_code != 200:
            raise RuntimeError("Sorry, something went wrong, please try later!")

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = paper["embedding"]

    return embeddings_by_paper_id

# if __name__ == "__main__":
#     all_embeddings = embed(SAMPLE_PAPERS)

#     # Prints { 'A': [4.089589595794678, ...], 'B': [-0.15814849734306335, ...] }
#     print(all_embeddings)

In [55]:
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16

def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Splits a longer list to respect batch size"""
    for i in range(0, len(lst), chunk_size):
        yield lst[i : i + chunk_size]

In [59]:
papers[0]

{'corpusid': 209821356,
 'externalids': {'ACL': None,
  'DBLP': None,
  'ArXiv': None,
  'MAG': '885611991',
  'CorpusId': '209821356',
  'PubMed': None,
  'DOI': None,
  'PubMedCentral': None},
 'url': 'https://www.semanticscholar.org/paper/a12266fce7d3b22b5a4c9cac9c7a3675f9cc1f46',
 'title': '星形细胞瘤热休克蛋白70（HSP70）表达的研究',
 'authors': [{'authorId': '66573337', 'name': '王建中'},
  {'authorId': '80068024', 'name': '蔡晓东'},
  {'authorId': '66486956', 'name': '丁建军'},
  {'authorId': '81021948', 'name': '李维平'},
  {'authorId': '1474054841', 'name': '付友增'},
  {'authorId': '1475846842', 'name': '陈耕野'},
  {'authorId': '1996397447', 'name': '肖邦良'},
  {'authorId': '66480329', 'name': '毛伯镛'},
  {'authorId': '69408990', 'name': '罗宏'},
  {'authorId': '83269768', 'name': '张文燕'}],
 'venue': '',
 'publicationvenueid': None,
 'year': 2002,
 'referencecount': 0,
 'citationcount': 0,
 'influentialcitationcount': 0,
 'isopenaccess': False,
 's2fieldsofstudy': None,
 'publicationtypes': None,
 'publicationdate': 

In [63]:
response = requests.post(URL, json=papers[0:10])
response

<Response [502]>

In [60]:
for chunk in chunks(papers[0]):
    # print(chunk)
    response = requests.post(URL, json=chunk)

TypeError: unhashable type: 'slice'

In [49]:
all_embeddings = embed(papers)

RuntimeError: Sorry, something went wrong, please try later!

# Upload Text

Create connection and add "pool" object to environment. Used for connecting to DB

In [None]:
# 
exec(open(os.path.join(working_path, 'sql_db_connection.py')).read())
pool

## Create Table

In [None]:
meta = MetaData()

articles = Table(
    'article_text', meta,
    Column('corpusid', Integer, primary_key = True),
    Column('externalids', String),
    Column('url', String),
    Column('title', Integer),
    Column('authors', ARRAY(JSONB)),
    Column('venue', String),
    Column('publicationvenueid', Integer),
    Column('year', String),
    Column('referencecount', Integer),
    Column('citationcount', Integer),
    Column('influentialcitationcount', Integer),
    Column('isopenaccess', Boolean),
    Column('s2fieldsofstudy', ARRAY(JSONB)),
    Column('publicationtypes', Integer),
    Column('publicationdate', Integer),
    Column('journal', JSONB),
    Column('updated', String)    
)

# create table in database
meta.create_all(pool)

## Start Upload