In [99]:
import requests
import json
import os
import time
import boto3
from ratelimit import limits, RateLimitException, sleep_and_retry
from backoff import on_exception, expo
import local.config as conf

In [52]:
class semanticScholar():
    
    def __init__(self, release_id=None):
        
        api_key = conf.semanticScholar.api_key
        
        self.headers = headers= {"X-API-KEY": api_key}
        self.release_id = release_id
        
        if release_id is None:
            print("no release id provided, using latest")
            self.release_id = max(self.query_dataset_api())
        
    @sleep_and_retry
    @limits(calls=10, period=32)
    def query_dataset_api(self, dataset=None):
    
        if self.release_id is None:
            url = "https://api.semanticscholar.org/datasets/v1/release/"
        elif dataset is None:
            url = f"https://api.semanticscholar.org/datasets/v1/release/{self.release_id}"
        else:
            url = f"https://api.semanticscholar.org/datasets/v1/release/{self.release_id}/dataset/{dataset}"
    
        response = requests.get(f"{url}", headers=self.headers).json()

        return response
    
    def interate_api(self):
        
        results = []
        self.timestr = time.strftime("%Y%m%d-%H%M%S")
        
        datasets = self.query_dataset_api()
        
        for r in datasets["datasets"]:
            result = self.query_dataset_api(dataset=r["name"])
            results.append(result)
        
        self.results = results
    
    def write_results(self):
        
        description = []

        if not os.path.exists(f"./local/semantic_scholar/release_id={self.release_id}/time={self.timestr}/links/"):
            os.makedirs(f"./local/semantic_scholar/release_id={self.release_id}/time={self.timestr}/links/")
            
        for r in self.results:
            filename = r["name"]
            meta = { "dataset" : r["name"],
                    "description" : r["description"],
                    "README": r["README"]
                          }
            description.append(meta)
    
            with open(f"./local/semantic_scholar/release_id={self.release_id}/time={self.timestr}/links/{filename}.txt", "w") as f:
                f.write('\n'.join(r["files"]))

        with open(f"./local/semantic_scholar/release_id={self.release_id}/time={self.timestr}/description.json", "w") as f:
            f.write(json.dumps(description, indent=2))
            
    def generate_results(self):
        
        self.interate_api()
        self.write_results()
    
    def download_datasets(self):
        ### TODO

In [53]:
test = semanticScholar()
test.generate_results()

no release id provided, using latest


In [100]:
#class openAlex(self):
    #todo can probably just sync

client = boto3.client('s3')

results = client.list_objects_v2(Bucket="openalex", Prefix= "data/", Delimiter="/")
latestResults = []

datasets = []
for r in results["CommonPrefixes"]:
    datasets.append(r["Prefix"])

for d in datasets:
    results = client.list_objects_v2(Bucket="openalex", Prefix= d, Delimiter="/")
    
    dates = []
    for r in results["CommonPrefixes"]:
        dates.append(r["Prefix"])
    
    latest = {"dataset": d,
              "newest": max(dates)
             }
    latestResults.append(latest)
    
print(latestResults)

[{'dataset': 'data/authors/', 'newest': 'data/authors/updated_date=2022-06-07/'}, {'dataset': 'data/concepts/', 'newest': 'data/concepts/updated_date=2022-05-31/'}, {'dataset': 'data/institutions/', 'newest': 'data/institutions/updated_date=2022-06-01/'}, {'dataset': 'data/merged_ids/', 'newest': 'data/merged_ids/works/'}, {'dataset': 'data/venues/', 'newest': 'data/venues/updated_date=2022-06-03/'}, {'dataset': 'data/works/', 'newest': 'data/works/updated_date=2022-05-28/'}]


In [97]:
for r in results:
    print(r)

ResponseMetadata
IsTruncated
Contents
Name
Prefix
Delimiter
MaxKeys
CommonPrefixes
EncodingType
KeyCount


In [101]:
for r in results["Contents"]:
    print(r)

{'Key': 'data/works/manifest', 'LastModified': datetime.datetime(2022, 6, 14, 17, 1, 37, tzinfo=tzutc()), 'ETag': '"2a8f1d9a81611ff6ad5bf84909b297ce"', 'Size': 18348, 'StorageClass': 'STANDARD'}


In [106]:
results["KeyCount"]

33