In [None]:
%run ../env_variables.py

In [None]:
import logging
import json
import pandas as pd

import helpers.queries as q
from helpers import handler as h

In [None]:
from elsapy.elsclient import ElsClient
from elsapy.elssearch import ElsSearch

In [None]:
## Load configuration
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None

# Read the Scopus settings from the JSON file named by the helper module.
# The context manager closes the handle even when json.load raises
# (e.g. on malformed JSON), which the manual open()/close() pair did not.
with open(h.scopus_config_file) as con_file:
    config = json.load(con_file)

In [None]:
## Set up the API client
# One shared client object, built from the 'apikey' entry of the loaded config.
client = ElsClient(api_key=config['apikey'])

In [None]:
# Result fields kept (in this order) when raw records are turned into a table.
selected_columns = [
    'dc:identifier',
    'dc:title',
    'dc:creator',
    'prism:publicationName',
    'prism:coverDate',
    'prism:aggregationType',
    'subtypeDescription',
    'prism:doi',
    'eid',
]

In [None]:
def convert_results_to_dataframe(results: list,
                                 selected_columns=None
                                 ) -> pd.DataFrame:
    """Convert raw result records into a deduplicated DataFrame.

    Parameters
    ----------
    results : list
        Records in any form accepted by ``pd.DataFrame.from_records``
        (the notebook passes the list of entries returned by a search).
    selected_columns : list of str, optional
        Columns to keep, in this order. When omitted, the module-level
        ``selected_columns`` list is looked up at call time, so re-running
        the cell that defines it takes effect without redefining this
        function.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``'dc:identifier'`` (first occurrence wins),
        restricted to ``selected_columns`` and re-indexed from 0. Columns
        absent from every record are present but filled with missing values.
    """
    if selected_columns is None:
        # The parameter shadows the module-level name, hence the explicit
        # namespace lookup.
        selected_columns = globals()['selected_columns']

    results_df = (
        pd.DataFrame.from_records(results)
        # reindex (unlike [...] selection) tolerates columns missing from
        # the records, so sparse or empty result sets no longer raise KeyError.
        .reindex(columns=selected_columns)
        # Drop rows carrying none of the selected fields -- e.g. a
        # placeholder entry returned for an empty result set
        # (NOTE(review): confirm the placeholder shape against the API).
        .dropna(how='all')
        .drop_duplicates(subset=['dc:identifier'])
        .reset_index(drop=True)
    )
    logging.info('Number of deduplicated results: %d', len(results_df))
    return results_df

In [None]:
def retrieve_results(query: str) -> pd.DataFrame:
    """Run ``query`` as a 'scopus' search and return the hits as a table.

    The search is executed with the module-level ``client`` and
    ``get_all=True``; the resulting entries are logged by count and passed
    through ``convert_results_to_dataframe``.
    """
    scopus_search = ElsSearch(query, 'scopus')
    scopus_search.execute(client, get_all=True)

    entries = scopus_search.results
    logging.info(f'{len(entries)} results retrieved from Scopus API.')

    return convert_results_to_dataframe(entries)

In [None]:
# Fetch the records matching q.mergoni_scopus_query and write them to CSV
# without the DataFrame index.
mergoni_scopus_df = retrieve_results(query=q.mergoni_scopus_query)
mergoni_scopus_df.to_csv(path_or_buf='data/mergoni_scopus.csv', index=False)

In [None]:
# Fetch the records matching q.mergoni_scopus_step_1_query and write them to
# CSV without the DataFrame index.
mergoni_scopus_step_1_df = retrieve_results(query=q.mergoni_scopus_step_1_query)
mergoni_scopus_step_1_df.to_csv(
    path_or_buf='data/mergoni_scopus_step_1.csv',
    index=False,
)

In [None]:
# Run each query in q.mergoni_scopus_step_2_queries__countries and stack the
# per-query tables into a single frame.
mergoni_scopus_step_2_df__countries = pd.concat(
    objs=list(map(retrieve_results, q.mergoni_scopus_step_2_queries__countries))
)

In [None]:
def prepare_concatenated_df(
        df: pd.DataFrame,
        max_date: str = '2022-02-01'
        ) -> pd.DataFrame:
    """Deduplicate a concatenated result table and apply the date cut-off.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns ``'dc:identifier'`` and
        ``'prism:coverDate'``. It is not modified.
    max_date : str, optional
        Exclusive upper bound: only rows whose ``'prism:coverDate'`` compares
        strictly less than this value are kept. NOTE(review): the comparison
        is a plain ``<`` on the column, so it presumably relies on dates
        being ISO-formatted strings -- confirm.

    Returns
    -------
    pd.DataFrame
        A new frame with one row per ``'dc:identifier'`` (first occurrence
        kept), filtered by ``max_date`` and indexed 0..n-1.
    """
    # No inplace operations: the caller's frame stays untouched, so the
    # calling cell can be re-run safely.
    deduplicated = df.drop_duplicates(subset=['dc:identifier'])
    before_cutoff = deduplicated[deduplicated['prism:coverDate'] < max_date]
    # Reset the index last so the returned frame has no gaps left by the
    # date filter.
    return before_cutoff.reset_index(drop=True)

In [None]:
# Deduplicate and date-filter the stacked country results; the name is
# rebound to the cleaned frame.
mergoni_scopus_step_2_df__countries = prepare_concatenated_df(
    df=mergoni_scopus_step_2_df__countries
)

In [None]:
# Write the cleaned country results to CSV without the DataFrame index.
mergoni_scopus_step_2_df__countries.to_csv(
    path_or_buf='data/mergoni_scopus_step_2_df__countries.csv',
    index=False,
)

In [None]:
# Run each query in q.mergoni_scopus_step_2_queries__demonyms and stack the
# per-query tables into a single frame.
mergoni_scopus_step_2_df__demonyms = pd.concat(
    objs=list(map(retrieve_results, q.mergoni_scopus_step_2_queries__demonyms))
)

In [None]:
# Deduplicate and date-filter the stacked demonym results; the name is
# rebound to the cleaned frame.
mergoni_scopus_step_2_df__demonyms = prepare_concatenated_df(
    df=mergoni_scopus_step_2_df__demonyms
)

In [None]:
# Write the cleaned demonym results to CSV without the DataFrame index.
mergoni_scopus_step_2_df__demonyms.to_csv(
    path_or_buf='data/mergoni_scopus_step_2_df__demonyms.csv',
    index=False,
)

In [None]:
# Stack the country and demonym tables (countries first) into one frame.
mergoni_scopus_step_2_df = pd.concat(
    objs=[mergoni_scopus_step_2_df__countries, mergoni_scopus_step_2_df__demonyms]
)

In [None]:
# Remove records found by both query sets and re-apply the date cut-off.
mergoni_scopus_step_2_df = prepare_concatenated_df(
    df=mergoni_scopus_step_2_df
)

In [None]:
# Write the combined step-2 results to CSV without the DataFrame index.
mergoni_scopus_step_2_df.to_csv(
    path_or_buf='data/mergoni_scopus_step_2_df.csv',
    index=False,
)