In [4]:
import requests
import json
import os
import time
import boto3
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
import local.config as conf

In [9]:
class semanticScholar():
    """Fetch dataset metadata from the Semantic Scholar datasets API and save it locally.

    Typical use is ``semanticScholar().generate_results()``, which queries every
    dataset of one release and writes the download links plus a description
    file below ``conf.links.local_root``.
    """

    # Root of the release endpoints; release id and dataset name are appended to it.
    BASE_URL = "https://api.semanticscholar.org/datasets/v1/release/"

    # Seconds to wait for the API before giving up. Without a timeout,
    # requests would block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 60

    def __init__(self, release_id=None):
        """Store the API key header and resolve which release to query.

        Args:
            release_id: Release identifier to use. When ``None``, the list of
                releases is requested and its maximum is used.
        """
        self.headers = {"X-API-KEY": conf.semanticScholar.api_key}
        self.release_id = release_id

        if release_id is None:
            print("no release id provided, using latest")
            # self.release_id is still None here, so this call hits the
            # release listing endpoint.
            # NOTE(review): assumes the listing is a collection of release ids
            # whose maximum is the most recent one - confirm against the API.
            self.release_id = max(self.query_dataset_api())

    @sleep_and_retry
    @limits(calls=10, period=32)
    def query_dataset_api(self, dataset=None):
        """Query the datasets API and return the decoded JSON body.

        The endpoint depends on the current state:

        * ``self.release_id`` is ``None``: list the releases.
        * ``dataset`` is ``None``: describe the release ``self.release_id``.
        * otherwise: describe ``dataset`` within that release.

        Calls are throttled by the ``ratelimit`` decorators above
        (``calls=10, period=32``).

        Args:
            dataset: Name of a dataset inside the release, or ``None``.

        Returns:
            The parsed JSON response.

        Raises:
            requests.HTTPError: If the API answers with a 4xx/5xx status.
            requests.Timeout: If no answer arrives within ``REQUEST_TIMEOUT``.
        """
        if self.release_id is None:
            url = self.BASE_URL
        elif dataset is None:
            url = f"{self.BASE_URL}{self.release_id}"
        else:
            url = f"{self.BASE_URL}{self.release_id}/dataset/{dataset}"

        response = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on an error status instead of handing an error payload
        # to callers that expect release/dataset data.
        response.raise_for_status()

        return response.json()

    def interate_api(self):
        """Query every dataset of the release and keep the responses.

        Sets ``self.timestr`` (timestamp of this run, used in the output path)
        and ``self.results`` (one API response per dataset, in listing order).
        """
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

        datasets = self.query_dataset_api()

        self.results = [
            self.query_dataset_api(dataset=entry["name"])
            for entry in datasets["datasets"]
        ]

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers keep working.
    iterate_api = interate_api

    def write_results(self):
        """Write the collected results below ``conf.links.local_root``.

        For each result a ``links/<name>.txt`` file with one entry of
        ``files`` per line is written, plus a single ``description.json``
        holding name, description and README of every dataset.

        Requires ``interate_api`` to have run first, because it reads
        ``self.results`` and ``self.timestr``.
        """
        root = conf.links.local_root
        path = f"{root}/s2/release_id={self.release_id}/time={self.timestr}"
        links_dir = f"{path}/links/"

        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(links_dir, exist_ok=True)

        description = []

        for r in self.results:
            filename = r["name"]
            description.append({
                "dataset": r["name"],
                "description": r["description"],
                "README": r["README"],
            })

            with open(f"{links_dir}{filename}.txt", "w", encoding="utf-8") as f:
                f.write('\n'.join(r["files"]))

        with open(f"{path}/description.json", "w", encoding="utf-8") as f:
            json.dump(description, f, indent=2)

    def generate_results(self):
        """Run the full pipeline: query all datasets, then write them to disk."""
        self.interate_api()
        self.write_results()

    def download_datasets(self):
        """Placeholder for downloading the dataset files; currently does nothing."""
        # TODO: not implemented yet.
        return None
        
        

In [8]:
# No release_id is passed, so the constructor resolves the latest release itself;
# generate_results() then queries every dataset and writes the results to disk.
test = semanticScholar()
test.generate_results()

no release id provided, using latest


In [None]:
#class openAlex(self):
    #todo can probably just sync

# For every top-level prefix below "data/" in the "openalex" bucket, find the
# lexicographically largest sub-prefix and report it as the newest one.
client = boto3.client('s3')


def _list_common_prefixes(s3_client, prefix):
    """Return all immediate sub-prefixes of ``prefix`` in the "openalex" bucket."""
    # A single list_objects_v2 call returns at most one page of results; the
    # paginator follows continuation tokens so the last (largest) prefixes are
    # not silently cut off.
    paginator = s3_client.get_paginator("list_objects_v2")
    pages = paginator.paginate(Bucket="openalex", Prefix=prefix, Delimiter="/")
    # "CommonPrefixes" is absent from a page that has no sub-prefixes, so
    # fall back to an empty list instead of raising KeyError.
    return [
        entry["Prefix"]
        for page in pages
        for entry in page.get("CommonPrefixes", [])
    ]


datasets = _list_common_prefixes(client, "data/")

latestResults = []
for d in datasets:
    dates = _list_common_prefixes(client, d)

    latest = {
        "dataset": d,
        # default=None keeps a dataset without sub-prefixes in the report
        # instead of failing with ValueError on an empty sequence.
        "newest": max(dates, default=None),
    }
    latestResults.append(latest)

test = {
    "name": "whatever",
    "results": latestResults,
}
print(test)