In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the project's shared modules importable. They are looked up in
# ../01_modules and ./01_modules relative to this script (the variable names
# suggest the former is the development layout and the latter the deployed one).
def add_to_sys_path(path):
    """Append ``path`` to ``sys.path`` if it exists and is not already listed.

    Notebook cells are routinely re-executed in a live kernel; without the
    membership check every re-run would add the same directory once more.

    Args:
        path: Directory that should become importable.

    Returns:
        True if the directory was appended, False if it does not exist or
        was already on ``sys.path``.
    """
    if os.path.exists(path) and path not in sys.path:
        sys.path.append(path)
        return True
    return False


# A script executed from a file knows its own location. In a notebook kernel
# __file__ is not defined, so the current working directory is used instead.
if '__file__' in globals():
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
add_to_sys_path(modules_path_in_dev)
add_to_sys_path(modules_path_in_prod)


# Jupyter only executes a local module the first time it is imported
# after kernel start. Re-running an import cell afterwards does not
# change anything, even if the module file has been edited since.
# As a workaround, each module is imported and then explicitly
# re-executed with importlib.reload.
# The reload result is assigned to `_` - presumably so the returned
# module object does not show up as cell output (TODO confirm).
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [16]:
utils.pd_set_options()

# Scratch list of column names kept at hand while writing queries
# (presumably columns of openalex_works_reduced - TODO confirm).
_ = """
id_doi_long
id_doi_short
display_name
title
publication_year
item_type
id_pmid_long
id_pmid_short
id_mag
primary_topic_long_id
primary_topic_short_id
primary_topic_display_name
primary_topic_score
primary_topic_subfield_long_id
primary_topic_subfield_display_name
primary_topic_field_long_id
primary_topic_field_display_name
primary_topic_domain_long_id
primary_topic_domain_display_name
primary_topic_subfield_short_id
primary_topic_field_short_id
primary_topic_domain_short_id
"""

# Sample of rows whose field has a long-form id but no short-form id.
missing_field_short_id_sql = """
SELECT
    id_openalex_short,
    id_doi_short,
    title,
    primary_topic_short_id,
    primary_topic_display_name,
    primary_topic_subfield_long_id,
    primary_topic_subfield_short_id,
    primary_topic_subfield_display_name,
    primary_topic_field_long_id,
    primary_topic_field_short_id,
    primary_topic_field_display_name,
    primary_topic_domain_short_id,
    primary_topic_domain_display_name
 FROM 
     openalex_works_reduced
 WHERE
     primary_topic_field_short_id IS NULL AND
     primary_topic_field_long_id IS NOT NULL
 LIMIT 5
 """
wr.athena.read_sql_query(sql=missing_field_short_id_sql, database='01_raw')

Unnamed: 0,id_openalex_short,id_doi_short,title,primary_topic_short_id,primary_topic_display_name,primary_topic_subfield_short_id,primary_topic_subfield_display_name,primary_topic_field_short_id,primary_topic_field_display_name,primary_topic_domain_short_id,primary_topic_domain_display_name
0,1648924356,,Evidence-Based Counterterrorism or Flying Blind? How to Understand and Achieve What Works,T11430,Disaster Response and Public Health Preparedness,,Emergency Medical Services,,Health Professions,,Health Sciences
1,1544139659,10.4237/sbqp.09.113,Recomendações Ergonômicas de Banheiros para Crianças com Deficiência Física,T12363,Distance Education in Research and Legislation,,Education,,Social Sciences,,Social Sciences
2,1591148876,,Limitations of the disk-and-washer structure,T11367,Accelerator Technology and Superconducting Cavities,,Aerospace Engineering,,Engineering,,Physical Sciences
3,2417250767,,MaRIE Photoinjector Technology Maturation Requirements,T11663,Recombinant Protein Production in Mammalian and Insect Cells,,Molecular Biology,,"Biochemistry, Genetics and Molecular Biology",,Life Sciences
4,2508405722,,PSYCHOLOGICAL ASPECTS OF ADDING MOBILE LEARNING TO TRADITIONAL METHOD OF TEACHING IN HIGHER EDUCATION,T13978,Impact of Distance Education on Learning and Achievement,,Education,,Social Sciences,,Social Sciences


In [19]:
# Sample of rows whose domain has a long-form id but no short-form id.
missing_domain_short_id_sql = """
SELECT
    id_openalex_short,
    id_doi_short,
    title,
    primary_topic_short_id,
    primary_topic_display_name,
    primary_topic_subfield_long_id,
    primary_topic_subfield_short_id,
    primary_topic_subfield_display_name,
    primary_topic_field_long_id,
    primary_topic_field_short_id,
    primary_topic_field_display_name,
    primary_topic_domain_long_id,
    primary_topic_domain_short_id,
    primary_topic_domain_display_name
 FROM 
     openalex_works_reduced
 WHERE
     primary_topic_domain_short_id IS NULL AND
     primary_topic_domain_long_id IS NOT NULL
 LIMIT 10
 """
wr.athena.read_sql_query(sql=missing_domain_short_id_sql, database='01_raw')

Unnamed: 0,id_openalex_short,id_doi_short,title,primary_topic_short_id,primary_topic_display_name,primary_topic_subfield_long_id,primary_topic_subfield_short_id,primary_topic_subfield_display_name,primary_topic_field_long_id,primary_topic_field_short_id,primary_topic_field_display_name,primary_topic_domain_long_id,primary_topic_domain_short_id,primary_topic_domain_display_name
0,1650023746,10.4237/sbqp.09.102,Possibilidades de aplicação de ferramentas de análise da ventilação natural durante a concepção de projetos arquitetônicos,T10121,Building Energy Efficiency and Thermal Comfort Optimization,2215,,Building and Construction,22,,Engineering,3,,Physical Sciences
1,2635976061,,Prediction of population performance based on reliability,T10968,Skew Distributions and Applications in Statistics,2613,,Statistics and Probability,26,,Mathematics,3,,Physical Sciences
2,2730528915,,Santé et travail bien fait : que peut-on apprendre d'autres métiers ?,T14186,Healthcare Policy Reforms and Inequalities in France,3600,,General Health Professions,36,,Health Professions,4,,Health Sciences
3,3178722630,,Persian Sentence-level Sentiment Polarity Classification,T10664,Sentiment Analysis and Opinion Mining,1702,,Artificial Intelligence,17,,Computer Science,3,,Physical Sciences
4,4288090456,,On the high temperature crack propagation in the nickel-based superalloy AD730TM,T13129,Materials Science and Technology,2500,,General Materials Science,25,,Materials Science,3,,Physical Sciences
5,4288104205,,A la rencontre du terahertz,T13807,Digital Communication and Information Studies,1705,,Computer Networks and Communications,17,,Computer Science,3,,Physical Sciences
6,4300432811,,Louis Darquier avant Darquier de Pellepoix. Les enseignements biographiques d'un manuscrit inédit de 1931,T13641,History of Science and Technology,1207,,History and Philosophy of Science,12,,Arts and Humanities,2,,Social Sciences
7,4366974590,,Un regard sur le travail,T11475,Territorial Governance and Environmental Participation,3312,,Sociology and Political Science,33,,Social Sciences,2,,Social Sciences
8,4387659385,,Why is agriculture heterogenous? A neoclassical view,T11743,Critique of Political Economy and Capitalist Development,3312,,Sociology and Political Science,33,,Social Sciences,2,,Social Sciences
9,1506636268,10.4237/sbqp.09.024,Avaliação das Habitações sob enfoque da cultura Guarani,T11858,Urban Geography and Social Development in Brazil,3322,,Urban Studies,33,,Social Sciences,2,,Social Sciences


In [25]:
utils.pd_set_options()

# Preview of the id consolidation on a 5000-row sample: every *_id column
# takes the short-form id and falls back to the long-form id when the short
# one is NULL. Only rows with a field name but no short field id are shown.
coalesced_ids_preview_sql = """
WITH
raw_openalex_works_reduced AS 
(
    SELECT * FROM "01_raw"."openalex_works_reduced" LIMIT 5000
),
base_openalex_works_reduced AS 
(
    SELECT
        id_openalex_short AS id_openalex,
        id_doi_short AS id_doi,
        title,
        COALESCE(primary_topic_short_id, primary_topic_long_id) AS primary_topic_id,
        primary_topic_display_name,
        COALESCE(primary_topic_subfield_short_id, primary_topic_subfield_long_id) AS primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_field_short_id,
        COALESCE(primary_topic_field_short_id, primary_topic_field_long_id) AS primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_domain_short_id,
        COALESCE(primary_topic_domain_short_id, primary_topic_domain_long_id) AS primary_topic_domain_id,
        primary_topic_domain_display_name
    FROM
        raw_openalex_works_reduced
)
SELECT * FROM base_openalex_works_reduced WHERE primary_topic_field_short_id IS NULL AND primary_topic_field_display_name IS NOT NULL
"""
wr.athena.read_sql_query(sql=coalesced_ids_preview_sql, database='01_raw')

Unnamed: 0,id_openalex,id_doi,title,primary_topic_id,primary_topic_display_name,primary_topic_subfield_id,primary_topic_subfield_display_name,primary_topic_field_short_id,primary_topic_field_id,primary_topic_field_display_name,primary_topic_domain_short_id,primary_topic_domain_id,primary_topic_domain_display_name
0,2621689297,,Detection of abnormal aircraft control surface positions using a robust parametric test,T10876,Process Fault Detection and Diagnosis in Industries,2207,Control and Systems Engineering,,22,Engineering,,3,Physical Sciences
1,1650023746,10.4237/sbqp.09.102,Possibilidades de aplicação de ferramentas de análise da ventilação natural durante a concepção de projetos arquitetônicos,T10121,Building Energy Efficiency and Thermal Comfort Optimization,2215,Building and Construction,,22,Engineering,,3,Physical Sciences
2,2635976061,,Prediction of population performance based on reliability,T10968,Skew Distributions and Applications in Statistics,2613,Statistics and Probability,,26,Mathematics,,3,Physical Sciences
3,2730528915,,Santé et travail bien fait : que peut-on apprendre d'autres métiers ?,T14186,Healthcare Policy Reforms and Inequalities in France,3600,General Health Professions,,36,Health Professions,,4,Health Sciences
4,3178722630,,Persian Sentence-level Sentiment Polarity Classification,T10664,Sentiment Analysis and Opinion Mining,1702,Artificial Intelligence,,17,Computer Science,,3,Physical Sciences
5,4288090456,,On the high temperature crack propagation in the nickel-based superalloy AD730TM,T13129,Materials Science and Technology,2500,General Materials Science,,25,Materials Science,,3,Physical Sciences
6,4288104205,,A la rencontre du terahertz,T13807,Digital Communication and Information Studies,1705,Computer Networks and Communications,,17,Computer Science,,3,Physical Sciences
7,4300432811,,Louis Darquier avant Darquier de Pellepoix. Les enseignements biographiques d'un manuscrit inédit de 1931,T13641,History of Science and Technology,1207,History and Philosophy of Science,,12,Arts and Humanities,,2,Social Sciences
8,4366974590,,Un regard sur le travail,T11475,Territorial Governance and Environmental Participation,3312,Sociology and Political Science,,33,Social Sciences,,2,Social Sciences
9,4387659385,,Why is agriculture heterogenous? A neoclassical view,T11743,Critique of Political Economy and Capitalist Development,3312,Sociology and Political Science,,33,Social Sciences,,2,Social Sciences


In [27]:
utils.pd_set_options()

# Work counts per topic hierarchy level. Several groupings are prepared as
# CTEs; the final SELECT currently reads only grouped_by_domain_and_field,
# ordered by domain id and field id. The other CTEs are defined but not
# referenced by the final SELECT.
topic_counts_sql = """
WITH
base_openalex_works_reduced_ AS (
SELECT * FROM "02_stg"."base_openalex_works_reduced"
),
grouped_by_domain AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     GROUP BY
        primary_topic_domain_id,
        primary_topic_domain_display_name
),
grouped_by_domain_and_field AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         TRUE --primary_topic_domain_id = '3' -- "Physical Sciences"
     GROUP BY
         primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name
),
grouped_by_field AS (
    SELECT
        primary_topic_field_id,
        primary_topic_field_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         primary_topic_domain_id = '3' -- "Physical Sciences"
     GROUP BY
        primary_topic_field_id,
        primary_topic_field_display_name
),
grouped_by_subfield_and_field AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_id,
        primary_topic_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         -- primary_topic_domain_id != '3' AND -- 'Physical Sciences'
         primary_topic_field_id  = '17' OR  -- 'Physical Sciences'/'Computer Science'
         --(
         --    primary_topic_subfield_id = '1404' AND -- 'Social Sciences' / 'Business, Management and Accounting' / 'Management Information Systems'
         --    primary_topic_id IN (
         --        'T11572', -- Information Technology Governance and Strategy
         --        'T11734', --	Decision Support System Applications
         --        'T11891', --	Big Data and Business Intelligence
         --    )
         --	
         --)
         
         primary_topic_subfield_id IN (
            '2206', -- 'Physical Sciences' / 'Engineering' / 'Computational Mechanics'
            '2207', -- 'Physical Sciences' / 'Engineering' / 'Control and Systems Engineering'
            '2208', -- 'Physical Sciences' / 'Engineering' / 'Electrical and Electronic Engineering'
            '2302', -- 'Physical Sciences' / 'Environmental Science' / 'Ecological Modeling'
            '2605', -- 'Physical Sciences' / 'Mathematics' / 'Computational Mathematics'
            '2614', -- 'Physical Sciences' / 'Mathematics' / 'Theoretical Computer Science'

            '1404', -- 'Social Sciences' / 'Business, Management and Accounting' / 'Management Information Systems'
            '1802', -- 'Social Sciences' / 'Decision Sciences' / 'Information Systems and Management'
            '2718', -- 'Health Sciences' / 'Medicine' / 'Health Informatics'
            '3605'--, -- 'Health Sciences' / 'Health Professions' / 'Health Information Management'
         )
         
     GROUP BY
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_id,
        primary_topic_display_name
)
SELECT 
    *
FROM
    grouped_by_domain_and_field
WHERE
    TRUE --c > 10000
ORDER BY
    --c DESC,
    primary_topic_domain_id,
    primary_topic_field_id--, 
    --primary_topic_subfield_id,
    --primary_topic_id
 """
wr.athena.read_sql_query(sql=topic_counts_sql, database='02_stg')

Unnamed: 0,primary_topic_domain_id,primary_topic_domain_display_name,primary_topic_field_id,primary_topic_field_display_name,c
0,1.0,Life Sciences,11.0,Agricultural and Biological Sciences,11026695
1,1.0,Life Sciences,13.0,"Biochemistry, Genetics and Molecular Biology",12085303
2,1.0,Life Sciences,24.0,Immunology and Microbiology,2103246
3,1.0,Life Sciences,28.0,Neuroscience,3290073
4,1.0,Life Sciences,30.0,"Pharmacology, Toxicology and Pharmaceutics",871353
5,2.0,Social Sciences,12.0,Arts and Humanities,14171951
6,2.0,Social Sciences,14.0,"Business, Management and Accounting",6672736
7,2.0,Social Sciences,18.0,Decision Sciences,2081881
8,2.0,Social Sciences,20.0,"Economics, Econometrics and Finance",6106529
9,2.0,Social Sciences,32.0,Psychology,6093849


In [None]:
utils.pd_set_options()

# NOTE(review): this cell has no execution count and its query text appears
# to be identical to the query in the preceding cell - confirm whether it is
# still needed.
# Work counts per topic hierarchy level. The final SELECT reads only
# grouped_by_domain_and_field, ordered by domain id and field id; the other
# CTEs are defined but not referenced by it.
topic_counts_draft_sql = """
WITH
base_openalex_works_reduced_ AS (
SELECT * FROM "02_stg"."base_openalex_works_reduced"
),
grouped_by_domain AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     GROUP BY
        primary_topic_domain_id,
        primary_topic_domain_display_name
),
grouped_by_domain_and_field AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         TRUE --primary_topic_domain_id = '3' -- "Physical Sciences"
     GROUP BY
         primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name
),
grouped_by_field AS (
    SELECT
        primary_topic_field_id,
        primary_topic_field_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         primary_topic_domain_id = '3' -- "Physical Sciences"
     GROUP BY
        primary_topic_field_id,
        primary_topic_field_display_name
),
grouped_by_subfield_and_field AS (
    SELECT
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_id,
        primary_topic_display_name,
        COUNT(*) AS c
     FROM
        base_openalex_works_reduced_
     WHERE
         -- primary_topic_domain_id != '3' AND -- 'Physical Sciences'
         primary_topic_field_id  = '17' OR  -- 'Physical Sciences'/'Computer Science'
         --(
         --    primary_topic_subfield_id = '1404' AND -- 'Social Sciences' / 'Business, Management and Accounting' / 'Management Information Systems'
         --    primary_topic_id IN (
         --        'T11572', -- Information Technology Governance and Strategy
         --        'T11734', --	Decision Support System Applications
         --        'T11891', --	Big Data and Business Intelligence
         --    )
         --	
         --)
         
         primary_topic_subfield_id IN (
            '2206', -- 'Physical Sciences' / 'Engineering' / 'Computational Mechanics'
            '2207', -- 'Physical Sciences' / 'Engineering' / 'Control and Systems Engineering'
            '2208', -- 'Physical Sciences' / 'Engineering' / 'Electrical and Electronic Engineering'
            '2302', -- 'Physical Sciences' / 'Environmental Science' / 'Ecological Modeling'
            '2605', -- 'Physical Sciences' / 'Mathematics' / 'Computational Mathematics'
            '2614', -- 'Physical Sciences' / 'Mathematics' / 'Theoretical Computer Science'

            '1404', -- 'Social Sciences' / 'Business, Management and Accounting' / 'Management Information Systems'
            '1802', -- 'Social Sciences' / 'Decision Sciences' / 'Information Systems and Management'
            '2718', -- 'Health Sciences' / 'Medicine' / 'Health Informatics'
            '3605'--, -- 'Health Sciences' / 'Health Professions' / 'Health Information Management'
         )
         
     GROUP BY
        primary_topic_domain_id,
        primary_topic_domain_display_name,
        primary_topic_field_id,
        primary_topic_field_display_name,
        primary_topic_subfield_id,
        primary_topic_subfield_display_name,
        primary_topic_id,
        primary_topic_display_name
)
SELECT 
    *
FROM
    grouped_by_domain_and_field
WHERE
    TRUE --c > 10000
ORDER BY
    --c DESC,
    primary_topic_domain_id,
    primary_topic_field_id--, 
    --primary_topic_subfield_id,
    --primary_topic_id
 """
wr.athena.read_sql_query(sql=topic_counts_draft_sql, database='02_stg')

In [2]:
import pandas as pd
import awswrangler as wr

# S3 prefix holding the OpenAlex merged-ids CSV files for works.
MERGED_WORK_IDS_S3_PATH = "s3://sagemaker-research-methodology-extraction/01_data/01_raw/openalex/data/merged_ids/works/"

# Load everything under the prefix into one frame and preview the first rows.
df = wr.s3.read_csv(path=MERGED_WORK_IDS_S3_PATH)
df.head(5)

Unnamed: 0,merge_date,id,merge_into_id
0,2022-07-15,W3102065232,W2202590217
1,2022-07-18,W3164133144,W4285719527
2,2022-07-18,W3166959377,W4285719527
3,2022-07-27,W3159934294,W21805
4,2022-07-27,W3121891118,W24982


In [3]:
# Number of rows and columns of the merged-ids frame loaded from S3.
df.shape

(12746997, 3)

In [4]:
def strip_work_prefix(ids):
    """Remove the leading 'W' from a Series of OpenAlex work ids.

    Uses vectorized string operations instead of a per-row Python lambda.
    Missing values and empty strings no longer raise; like every other value
    that does not start with 'W', they are replaced by 'ERROR: <value>'.

    Args:
        ids: pandas Series of id strings such as 'W3102065232'.

    Returns:
        Series with the same index: the id without its first character when
        it starts with 'W', otherwise the marker string 'ERROR: <value>'.
    """
    has_prefix = ids.str.startswith('W', na=False)
    return ids.str[1:].where(has_prefix, 'ERROR: ' + ids.astype(str))


# Work on a copy so the frame as loaded from S3 stays untouched.
df_w_removed = df.copy()
df_w_removed['id'] = strip_work_prefix(df_w_removed['id'])
# First character of the stripped id, used to inspect the value distribution.
df_w_removed['id_starts_with'] = df_w_removed['id'].str[0]
df_w_removed['merge_into_id'] = strip_work_prefix(df_w_removed['merge_into_id'])
df_w_removed['merge_into_id_starts_with'] = df_w_removed['merge_into_id'].str[0]
display(df_w_removed.head())

Unnamed: 0,merge_date,id,merge_into_id,id_starts_with,merge_into_id_starts_with
0,2022-07-15,3102065232,2202590217,3,2
1,2022-07-18,3164133144,4285719527,3,4
2,2022-07-18,3166959377,4285719527,3,4
3,2022-07-27,3159934294,21805,3,2
4,2022-07-27,3121891118,24982,3,2


In [5]:
# How many ids begin with each leading character.
df_w_removed.groupby('id_starts_with').size()

id_starts_with
1    1009250
2    7751135
3    2174166
4    1440313
5      56075
6      64663
7      55789
8      82140
9     113466
dtype: int64

In [6]:
# How many merge targets begin with each leading character.
df_w_removed.groupby('merge_into_id_starts_with').size()

merge_into_id_starts_with
1      107624
2      464882
3      424372
4    11743764
5        1204
6        1475
7        1071
8        1274
9        1331
dtype: int64

In [15]:
# Level 1 of the merge chain: keep only the two id columns and cast both
# to the nullable integer dtype.
df_l1 = df_w_removed[['id', 'merge_into_id']].astype(
    {'id': 'Int64', 'merge_into_id': 'Int64'}
)
df_l1.dtypes

id               Int64
merge_into_id    Int64
dtype: object

In [19]:
# Level 2: look every merge target up as a source id again, to find targets
# that were themselves merged into yet another id.
df_l2 = df_l1.merge(
    df_l1,
    how='left',
    left_on='merge_into_id',
    right_on='id',
    suffixes=['', '_2'],
)
second_hop_rows = df_l2[df_l2['id_2'].notna()]
display(second_hop_rows.head())
display(second_hop_rows.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2
104,2324837190,6741955,6741955,4239741905
1278,3212072798,83386254,83386254,3213645724
2156,3137017720,135121875,135121875,3137616390
2348,2314344449,147670514,147670514,2411777905
4204,2326594437,491721583,491721583,1021879549


(75619, 4)

In [20]:
# Level 3: follow the chain one hop further, starting from the level-2 targets.
df_l3 = df_l2.merge(
    df_l1,
    how='left',
    left_on='merge_into_id_2',
    right_on='id',
    suffixes=['', '_3'],
)
third_hop_rows = df_l3[df_l3['id_3'].notna()]
display(third_hop_rows.head())
display(third_hop_rows.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3
1009315,2182411518,2040168896,2040168896,41163576,41163576,4285719527


(1, 6)

In [21]:
# Level 4: one more hop from the level-3 targets; the rows displayed below
# are those where such a fourth hop was found.
df_l4 = df_l3.merge(
    df_l1,
    how='left',
    left_on='merge_into_id_3',
    right_on='id',
    suffixes=['', '_4'],
)
fourth_hop_rows = df_l4[df_l4['id_4'].notna()]
display(fourth_hop_rows.head())
display(fourth_hop_rows.shape)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4


(0, 8)

In [25]:
# Final merge target per row: take the deepest hop that has a value,
# falling back level by level to the direct merge target.
final_target = df_l4['merge_into_id_4']
for shallower_hop in ('merge_into_id_3', 'merge_into_id_2', 'merge_into_id'):
    final_target = final_target.combine_first(df_l4[shallower_hop])
df_l4['merge_into_id_final'] = final_target

In [26]:
# Preview the first rows, including the newly added merge_into_id_final column.
df_l4.head(5)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4,merge_into_id_final
0,3102065232,2202590217,,,,,,,2202590217
1,3164133144,4285719527,,,,,,,4285719527
2,3166959377,4285719527,,,,,,,4285719527
3,3159934294,21805,,,,,,,21805
4,3121891118,24982,,,,,,,24982


In [27]:
# Rows that have a second hop: check that the final target is the level-2 target.
df_l4.loc[df_l4['merge_into_id_2'].notna()].head(5)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4,merge_into_id_final
104,2324837190,6741955,6741955,4239741905,,,,,4239741905
1278,3212072798,83386254,83386254,3213645724,,,,,3213645724
2156,3137017720,135121875,135121875,3137616390,,,,,3137616390
2348,2314344449,147670514,147670514,2411777905,,,,,2411777905
4204,2326594437,491721583,491721583,1021879549,,,,,1021879549


In [28]:
# Rows that have a third hop: check that the final target is the level-3 target.
df_l4.loc[df_l4['merge_into_id_3'].notna()].head(5)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4,merge_into_id_final
1009315,2182411518,2040168896,2040168896,41163576,41163576,4285719527,,,4285719527


In [29]:
# Rows that have a fourth hop, if any.
df_l4.loc[df_l4['merge_into_id_4'].notna()].head(5)

Unnamed: 0,id,merge_into_id,id_2,merge_into_id_2,id_3,merge_into_id_3,id_4,merge_into_id_4,merge_into_id_final
