In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared project modules importable by adding ../01_modules and/or
# ./01_modules (whichever exist, relative to this script/notebook) to sys.path.
# When run as a script `__file__` is defined; inside a notebook kernel it is
# not, so the current working directory is used as the base instead.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for _modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Skip paths already registered so that re-running this cell does not
    # keep growing sys.path with duplicate entries.
    if os.path.exists(_modules_path) and _modules_path not in sys.path:
        sys.path.append(_modules_path)


# Jupyter executes a local module's code only on its first import after
# kernel start; importing it again later does not pick up edits made to the
# module file in the meantime.
# As a workaround, each local module is imported and then explicitly
# re-executed with importlib.reload, in this order: utils first, then config.
utils = importlib.import_module('utils')
_ = importlib.reload(utils)
config = importlib.import_module('config')
_ = importlib.reload(config)


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
import awswrangler as wr
import pandas as pd

# Read every CSV object under the merged-ids "works" prefix into one frame.
# Note: needs bigger memory instance
merged_ids_works_path = "s3://sagemaker-research-methodology-extraction/01_data/01_raw/openalex/data/merged_ids/works/"
df = wr.s3.read_csv(path=merged_ids_works_path)
df.head(5)

Unnamed: 0,merge_date,id,merge_into_id
0,2022-07-15,W3102065232,W2202590217
1,2022-07-18,W3164133144,W4285719527
2,2022-07-18,W3166959377,W4285719527
3,2022-07-27,W3159934294,W21805
4,2022-07-27,W3121891118,W24982


In [3]:
def _strip_work_prefix(ids: pd.Series) -> pd.Series:
    """Drop the leading 'W' from each id string.

    Values that do not start with 'W' (including empty strings and missing
    values) are replaced by the sentinel text 'ERROR: <value>' instead of
    raising, so they stay visible in the result.

    Parameters
    ----------
    ids : pd.Series
        Series of id strings, e.g. 'W3102065232'.

    Returns
    -------
    pd.Series
        New series with the same index; the input series is not modified.
    """
    has_prefix = ids.str.startswith('W', na=False)
    stripped = ids.str[1:]
    invalid = ~has_prefix
    if invalid.any():
        # Only format the offending rows; the common case has none.
        stripped.loc[invalid] = ids[invalid].map('ERROR: {}'.format)
    return stripped


# `assign` returns a new frame (df itself is left untouched) and evaluates
# its keyword arguments in order, so each *_starts_with column is computed
# from the already-stripped id column, as before.
# NOTE(review): because of that ordering, *_starts_with holds the first
# character AFTER the 'W' was removed (a digit, or 'E' for ERROR rows) --
# confirm this is the intended diagnostic rather than the original prefix.
df_w_removed = df.assign(
    id=lambda frame: _strip_work_prefix(frame['id']),
    id_starts_with=lambda frame: frame['id'].str[0],
    merge_into_id=lambda frame: _strip_work_prefix(frame['merge_into_id']),
    merge_into_id_starts_with=lambda frame: frame['merge_into_id'].str[0],
)
display(df_w_removed.head())

Unnamed: 0,merge_date,id,merge_into_id,id_starts_with,merge_into_id_starts_with
0,2022-07-15,3102065232,2202590217,3,2
1,2022-07-18,3164133144,4285719527,3,4
2,2022-07-18,3166959377,4285719527,3,4
3,2022-07-27,3159934294,21805,3,2
4,2022-07-27,3121891118,24982,3,2


In [4]:
# Level 1 of the merge chain: just the (id -> merge_into_id) mapping.
# Select the two columns first and cast in one step, instead of deep-copying
# the whole frame (including the unused helper columns) and then slicing it.
# Both ids are cast to pandas' nullable 'Int64' dtype; a leftover
# 'ERROR: ...' sentinel string from the prefix-stripping step cannot be
# converted and makes this cast raise.
df_l1 = df_w_removed[['id', 'merge_into_id']].astype(
    {'id': 'Int64', 'merge_into_id': 'Int64'}
)
df_l1.dtypes

id               Int64
merge_into_id    Int64
dtype: object

In [5]:
# Level 2: left self-join of the id map on merge_into_id == id, i.e. look up
# whether the merge target of each row was itself merged into something else.
# Columns coming from the right-hand side get the '_2' suffix.
df_l2 = df_l1.merge(
    df_l1,
    how='left',
    left_on='merge_into_id',
    right_on='id',
    suffixes=('', '_2'),
)

# Rows with a non-null id_2 are the ones that found a second hop.
df_l2_hits = df_l2.dropna(subset=['id_2'])
display(df_l2_hits.head())
display(df_l2_hits.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2
104,2324837190,6741955,6741955,4239741905
1278,3212072798,83386254,83386254,3213645724
2156,3137017720,135121875,135121875,3137616390
2348,2314344449,147670514,147670514,2411777905
4204,2326594437,491721583,491721583,1021879549


(75619, 4)

In [6]:
# Level 3: repeat the lookup one hop further, joining the second-hop target
# (merge_into_id_2) back onto the id map. Right-hand columns get '_3'.
df_l3 = df_l2.merge(
    df_l1,
    how='left',
    left_on='merge_into_id_2',
    right_on='id',
    suffixes=('', '_3'),
)

# Rows with a non-null id_3 are the ones that found a third hop.
df_l3_hits = df_l3.dropna(subset=['id_3'])
display(df_l3_hits.head())
display(df_l3_hits.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3
1009315,2182411518,2040168896,2040168896,41163576,41163576,4285719527


(1, 6)

In [7]:
# Level 4: one more hop, joining the third-hop target (merge_into_id_3) back
# onto the id map. Right-hand columns get '_4'.
df_l4 = df_l3.merge(
    df_l1,
    how='left',
    left_on='merge_into_id_3',
    right_on='id',
    suffixes=('', '_4'),
)

# Rows with a non-null id_4 would be chains with a fourth hop; an empty
# result here means no third-hop target appears as an id in the map.
df_l4_hits = df_l4.dropna(subset=['id_4'])
display(df_l4_hits.head())
display(df_l4_hits.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4


(0, 8)

In [8]:
# Resolve the final merge target per row: start from the deepest hop and,
# wherever it is null, fall back to the next shallower hop, ending at the
# direct merge_into_id.
final_target = df_l4['merge_into_id_4']
for fallback_column in ('merge_into_id_3', 'merge_into_id_2', 'merge_into_id'):
    final_target = final_target.combine_first(df_l4[fallback_column])
df_l4['merge_into_id_final'] = final_target

In [10]:
# Persist the resolved merge chains as a parquet dataset and register it as
# table `base_openalex_merged_ids` in the `02_stg` catalog database.
# With dataset=True the writer's default mode is "append", which would add a
# second full copy of the rows every time this cell is re-run. The frame is
# always rebuilt from the complete raw dump, so replace the table contents
# instead to keep the cell idempotent.
wr.s3.to_parquet(
    df=df_l4,
    path='s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_merged_ids',
    dataset=True,
    mode='overwrite',
    database='02_stg',
    table='base_openalex_merged_ids'
)

{'paths': ['s3://sagemaker-research-methodology-extraction/01_data/02_stg/base_openalex_merged_ids/0814a36c0e834eb6a550ff5a068158f7.snappy.parquet'],
 'partitions_values': {}}