In [1]:
import boto3
from botocore import UNSIGNED
from botocore.client import Config

import json
import gzip
import shutil
import os
import time
from datetime import datetime
from tqdm import tqdm

In [2]:
# Anonymous (unsigned) client: the bucket is read without AWS credentials.
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED), region_name='us-east-1')

# A single list call returns at most 1000 keys, so walk every result page
# instead of silently truncating the listing.
paginator = s3.get_paginator('list_objects_v2')
entries = []
for page in paginator.paginate(Bucket='openalex', Prefix='data/works/updated_date'):
    # A page carries no 'Contents' key when nothing matches the prefix.
    entries.extend(page.get('Contents', []))

# get all object filenames
data_objects = [entry['Key'] for entry in entries]

In [3]:
# Download object file from s3 and decompress it locally
def download_object(s3, obj, bucket='openalex', raw_dir='datalake/raw'):
    """Download a gzip-compressed S3 object and extract it next to the archive.

    Parameters
    ----------
    s3 : object
        Client exposing ``download_file(Bucket=..., Key=..., Filename=...)``
        (a boto3 S3 client in this notebook).
    obj : str
        Object key. It must contain at least one ``/``: the last segment is the
        archive file name and the segment before it supplies the output prefix.
    bucket : str, optional
        Bucket to download from. Defaults to ``'openalex'``.
    raw_dir : str, optional
        Local directory for the archive and the extracted file; created when
        missing. Defaults to ``'datalake/raw'``.

    Returns
    -------
    str
        Path of the extracted ``.json`` file.

    Raises
    ------
    IndexError
        If ``obj`` contains no ``/``.
    OSError
        If the downloaded file cannot be decompressed or written. Any error
        raised by ``s3.download_file`` is propagated unchanged.
    """
    os.makedirs(raw_dir, exist_ok=True)
    file_name = obj.split('/')[-1]
    raw_file_path = os.path.join(raw_dir, file_name)

    # Prefix the output with the value after '=' in the parent key segment
    # (e.g. `updated_date=2022-01-01` -> `2022-01-01`) so that identically
    # named archives from different segments do not overwrite each other.
    partition = obj.split('/')[-2].split('=')[-1]
    save_file_name = partition + file_name.split('.gz')[0] + '.json'
    save_file_path = os.path.join(raw_dir, save_file_name)

    try:
        # download gzip file at raw_file_path
        s3.download_file(Bucket=bucket, Key=obj, Filename=raw_file_path)
        # stream-decompress so the whole payload is never held in memory
        with gzip.open(raw_file_path, 'rb') as f_in, open(save_file_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
    except BaseException:
        # Do not leave a truncated extract behind for a later step to pick up.
        if os.path.exists(save_file_path):
            os.remove(save_file_path)
        raise
    finally:
        # The archive is only a temporary artifact: delete it on success AND
        # on failure so aborted runs do not accumulate .gz files.
        if os.path.exists(raw_file_path):
            os.remove(raw_file_path)
    return save_file_path

# Get filtered contents
def get_object_contents(save_file_name, min_year=2010):
    """Read a JSON-lines file and keep the recent computer-science works.

    Each non-blank line is parsed as one JSON object. A fixed set of unneeded
    columns is dropped from every record. A record is kept when its
    ``publication_year`` is at least ``min_year`` and one of its ``concepts``
    is computer science, matched by display name or by concept id.

    Parameters
    ----------
    save_file_name : str
        Path of the JSON-lines file to read.
    min_year : int, optional
        Smallest ``publication_year`` to keep. Defaults to 2010.

    Returns
    -------
    tuple[list[dict], int, int]
        ``(kept_records, total_rows_parsed, rows_kept)``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # columns which are not required
    delcolList = ['doi',
                  'title',
                  'ids',
                  'open_access',
                  'biblio',
                  'is_paratext',
                  'mesh',
                  'alternate_host_venues',
                  'ngrams_url',
                  'abstract_inverted_index',
                  'created_date']
    cs_name = 'computer science'
    # Compared in lower case: OpenAlex ids are written with an upper-case
    # prefix (".../C41008148"), so an exact match on this literal never fires.
    cs_id = 'https://openalex.org/c41008148'

    object_content = []
    total = 0
    filtered = 0
    # Explicit encoding so parsing does not depend on the platform default.
    with open(save_file_name, encoding='utf-8') as f:
        for row in f:
            if not row.strip():
                # tolerate blank or trailing empty lines
                continue
            total += 1
            # parse json
            r_work = json.loads(row)
            # remove extra columns
            for col in delcolList:
                r_work.pop(col, None)
            # apply filter on publication_year; a missing key counts as None
            year = r_work.get('publication_year')
            if year is None or year < min_year:
                continue
            # apply filter on Computer science concepts
            for concept in r_work.get('concepts') or []:
                name = (concept.get('display_name') or '').lower()
                concept_id = (concept.get('id') or '').lower()
                if name == cs_name or concept_id == cs_id:
                    object_content.append(r_work)
                    filtered += 1
                    break
    return object_content, total, filtered

# write file to local system
def write_file(obj, jsonContent, obj_file_name, raw_dir='datalake/raw'):
    """Dump filtered records as JSON and delete the intermediate extract.

    The output goes to ``<raw_dir>/<parent key segment>/<file name>.json``,
    where both path parts come from the last two ``/``-separated segments of
    ``obj``.

    Parameters
    ----------
    obj : str
        Object key; must contain at least one ``/``.
    jsonContent : object
        JSON-serialisable payload (a list of dicts in this notebook).
    obj_file_name : str
        Path of the intermediate file to delete after a successful write.
    raw_dir : str, optional
        Root output directory. Defaults to ``'datalake/raw'``.

    Returns
    -------
    str
        Path of the JSON file that was written.

    Raises
    ------
    IndexError
        If ``obj`` contains no ``/``.
    """
    segments = obj.split('/')
    d = segments[-2]
    name = segments[-1]
    # Swap only the trailing extension. A blanket replace('gz', 'json') would
    # also rewrite any "gz" inside the stem (gzip_part.gz -> jsonip_part.json).
    if name.endswith('.gz'):
        f = name[:-len('.gz')] + '.json'
    else:
        f = name + '.json'
    os.makedirs(os.path.join(raw_dir, d), exist_ok=True)
    filepath = os.path.join(raw_dir, d, f)
    with open(filepath, 'w') as outfile:
        json.dump(jsonContent, outfile)
    # Remove the intermediate file only after the output is fully written.
    os.remove(obj_file_name)
    return filepath

In [1]:
# Sequentially download, filter and persist every listed object, keeping a
# plain-text run log next to the notebook.
logger = 'data_download.log'


def _append_log(message):
    """Append `message` to the file named by the module-level `logger`."""
    with open(logger, 'a') as log:
        log.write(message)


now = datetime.now()
cur_time = now.strftime("%Y-%m-%d %H:%M:%S")
_append_log(f'start_time:{cur_time}\n{"=="*30}\n')

start_time = time.time()
# Index of the first object to process; the loop below skips everything
# before it (presumably used to resume an interrupted run).
start_from = 0
for obj in tqdm(data_objects[start_from:]):
    obj_file_name = download_object(s3, obj)
    object_content, total, filtered = get_object_contents(obj_file_name)

    if not object_content:
        # Nothing survived the filter: record it and drop the extracted file.
        _append_log(f'{obj} file not written, total rows:{total}, rows after filter: {filtered}\n')
        os.remove(obj_file_name)
        continue

    # write_file also deletes the extracted intermediate file.
    outfilepath = write_file(obj, object_content, obj_file_name)
    _append_log(f'{obj} file written at: {outfilepath}, total rows:{total}, rows after filter: {filtered}\n')
end_time = time.time()
print(f'Log file written at: {os.path.join(os.getcwd(),logger)}')

now = datetime.now()
cur_time = now.strftime("%Y-%m-%d %H:%M:%S")
_append_log(f'{"=="*30}\nend_time:{cur_time}\nTotal time taken:{(end_time-start_time)/60} minutes\n')