In [1]:
from pandas import read_pickle, DataFrame, concat
from tqdm import tqdm
from thefuzz import fuzz

from requests import post as post_request

from glob import glob

from time import sleep

from dotenv import load_dotenv
from os import getenv

load_dotenv()


API_KEY = getenv('API_KEY')

In [2]:
GENERAL_FIELDS = [
    'paperId',
    'corpusId', # Su tipado es inconsistente, podría traer errores.
    'url',
    'title',
    'venue',
    'publicationVenue',
    'year',
    'authors',
    'externalIds',
    'abstract',
    'referenceCount',
    'citationCount',
    'influentialCitationCount',
    'isOpenAccess',
    'openAccessPdf',
    'fieldsOfStudy',
    's2FieldsOfStudy',
    'publicationTypes',
    'publicationDate',
    'journal',
]

AUTHORS_FIELDS = [
    'authorId',
    'externalIds', # No siempre esta presente, podría traer errores.
    'url',
    'name',
    'aliases',
    'affiliations',
    'paperCount',
    'citationCount'
]

fields = GENERAL_FIELDS + ['tldr'] + [f'authors.{i}' for i in AUTHORS_FIELDS]
fields += [f'{j}.{i}' for j in ['references', 'citations']
           for i in GENERAL_FIELDS]

QUERY_FIELDS = ','.join(fields)

In [3]:
def chunk_list(input_list, chunk_size):
    """Divide una lista en sub-listas de tamaño chunk_size.

    Args:
        input_list (list): Lista original que se va a dividir.
        chunk_size (int): Tamaño deseado para las sub-listas.

    Returns:
        list of lists: Lista de sub-listas, donde cada sub-lista tiene un tamaño de hasta chunk_size.
    """
    # Usar una comprensión de lista para generar las sub-listas
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

In [4]:
df = read_pickle('../data/api_request_results/retrieved_data.zip').reset_index(drop=True)


In [5]:
df['simi_score'] = [fuzz.partial_ratio(row.title.strip().lower(), row.db_title.strip().lower()) for row in df.itertuples()]
df.sort_values(['db_id','simi_score'], inplace=True)
df.drop_duplicates(['db_id'], keep='last', inplace=True)
df.reset_index(drop=True, inplace=True)

In [6]:
# Realmente queremos quedarnos unicamente con los papers del dataset?
df_filter = df.query('simi_score>85') # hay falsos positivos, pero no muchos

# O unicamente con los papers que son de acceso abierto (pdf descargable)
df_filter = df_filter[df_filter.isOpenAccess].reset_index(drop=True)

del df
# usar todo => más datos!

In [7]:
dfs = []
for row in tqdm(df_filter.itertuples()):
    
    refs_df = DataFrame(row.references)
    refs_df['db_id'] = row.db_id
    refs_df['con_type'] = 'reference'
    
    cits_df = DataFrame(row.citations)
    cits_df['db_id'] = row.db_id
    cits_df['con_type'] = 'citation'
    
    con_df = concat([refs_df, cits_df], ignore_index=True)
    
    dfs.append(con_df)
    
cons_df = concat(dfs, ignore_index=True)    
cons_df.drop_duplicates('paperId', inplace=True)
cons_df['isOpenAccess'] = cons_df.isOpenAccess.fillna(False)

cons_df = cons_df[cons_df.isOpenAccess]
cons_df = cons_df.sample(len(cons_df)).reset_index(drop=True)

1396it [00:02, 601.19it/s]
  cons_df['isOpenAccess'] = cons_df.isOpenAccess.fillna(False)


In [8]:
print("Original len: ", len(cons_df))

already_cons = read_pickle('../data/api_request_results/retrieved_data_connections.zip')

df = cons_df[~cons_df['paperId'].isin(already_cons['paperId'])]
df = df.sample(len(df), ignore_index=True)
print("Actual len: ", len(df))

del already_cons

sub_lists = chunk_list(df['paperId'].to_list(), chunk_size=10)

for chunk in tqdm(sub_lists):

    res = post_request(
        'https://api.semanticscholar.org/graph/v1/paper/batch',
        params={'fields': QUERY_FIELDS},
        json={"ids": chunk}
        )
    
    if res.status_code == 200:
        data = res.json()

        df_res = DataFrame.from_dict(data)

        df_res.to_pickle(f'../data/api_request_results/connections/chunk_{df_res.paperId.iloc[0]}.zip',
                        compression={
                            'method': 'zip',
                            'compresslevel': 9  # Nivel máximo de compresión para ZIP
                        }
                        )
            
        sleep(2)
    elif res['status_code'] == 429:
        print(
            'Too many requests. Waiting 180 seconds')
        sleep(180)
    else:
        print('Error:', res)
        break

Original len:  62054
Actual len:  62024


  0%|          | 7/6203 [01:45<27:10:51, 15.79s/it]

In [None]:
retrieved_data = concat(
                    [read_pickle(f) for f in tqdm(glob('../data/api_request_results/connections/*'))]
                    )


retrieved_data.to_pickle('../data/api_request_results/retrieved_data_connections.zip', 
                            compression= {
                            'method': 'zip',
                            'compresslevel': 9  # Nivel máximo de compresión para ZIP
                            }
                    )

100%|██████████| 3/3 [00:00<00:00, 37.22it/s]
