In [None]:
# Start from a clean slate: delete the log file and the JSON dump that live
# one directory above the notebook's working directory.
# NOTE(review): the logging cell further down opens 'main.log' relative to the
# working directory, not '../main.log' — confirm both refer to the intended file.
%rm ../main.log
%rm ../dump.json

Define environment variables and handler

In [None]:
# Execute the script one directory up inside this notebook session.
# Its contents are not visible here; presumably it defines the environment
# variables and handler mentioned in the heading above — verify.
%run ../env_variables.py

Import required libraries

In [None]:
import logging

from utils import handler as h
import utils.queries as q
import utils.scopus_functions as f
import utils.examples as ex
import utils.query_helpers as qh

In [None]:
# Set up logging
#
# The 'main' logger gets two handlers:
#   * a file handler writing every record (DEBUG and above) to 'main.log'
#   * a console handler that only surfaces ERROR and above
#
# Loggers are process-wide singletons keyed by name, so simply re-running this
# cell would attach a second pair of handlers and every record would be
# emitted twice. Handlers attached by a previous run of this cell are
# therefore detached (and closed) before new ones are added.

# create logger with 'main'
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

# names used to recognise the handlers owned by this cell; handlers attached
# to the same logger by other code are left untouched
_CELL_HANDLER_NAMES = ('main_notebook_file', 'main_notebook_console')
for _stale_handler in [hdlr for hdlr in logger.handlers
                       if hdlr.get_name() in _CELL_HANDLER_NAMES]:
    logger.removeHandler(_stale_handler)
    _stale_handler.close()  # releases the open file descriptor of the old FileHandler

# create file handler which logs even debug messages
fh = logging.FileHandler('main.log')
fh.set_name(_CELL_HANDLER_NAMES[0])
fh.setLevel(logging.DEBUG)
# create console handler with a higher log level
ch = logging.StreamHandler()
ch.set_name(_CELL_HANDLER_NAMES[1])
ch.setLevel(logging.ERROR)
# create formatter and add it to the handlers
# (alternative layout keyed on the logger name instead of module/function:)
# formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = logging.Formatter('%(asctime)s - [%(module)s|%(funcName)s] - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

In [None]:
import pandas as pd

# Raise pandas' display limits so long cell values and wide frames are shown
# in full when a DataFrame is rendered in the notebook.
pd.set_option('display.max_colwidth', 1_000)
pd.set_option('display.max_columns', 1_000)

In [None]:
# Run configuration.
# `max_date` is forwarded as the `max_date` argument of every retrieval call
# below (presumably an upper bound on the record date, ISO formatted — confirm
# against utils.scopus_functions).
# The commented-out names are only needed by the disabled CSV export at the
# end of the notebook.
# author = 'mergoni'
max_date = '2021-02-01'
# file_name_prefix = author + '_scopus_'

## Retrieve original results from Scopus API

In [None]:
# Query taken from the project's examples module; it is the baseline that each
# bias helper below is applied to.
example_query = ex.mergoni_2021_scopus_query

In [None]:
# Display the query for reference.
example_query

In [None]:
# Baseline: results of the unmodified example query, limited by `max_date`.
original_df = f.retrieve_results_from_list_of_queries(
    max_date=max_date,
    list_of_queries=[example_query],
)

In [None]:
# Drop the column of localization in title, abstract or keywords
# because this data has not yet been retrieved.
#
# The frame is rebound rather than mutated in place, and a missing column is
# tolerated, so that re-running this cell is harmless (with `inplace=True` a
# second run raised KeyError because the column was already gone).
original_df = original_df.drop(
    columns=['localization_in_title_abstract_or_key'],
    errors='ignore')

In [None]:
# Report how many records the baseline query returned
# (message spelling fixed: "Lenght" -> "Length").
print(f'Length of original df: {len(original_df)}')

## Retrieve results after applying language-bias-helper

In [None]:
# Same retrieval as the baseline, but with the example query first passed
# through the language-bias tool.
lang_bias_helper_df = f.retrieve_results_from_list_of_queries(
    max_date=max_date,
    list_of_queries=[qh.language_bias_tool(example_query)],
)

In [None]:
# Drop the column of localization in title, abstract or keywords
# because this data has not yet been retrieved.
#
# The frame is rebound rather than mutated in place, and a missing column is
# tolerated, so that re-running this cell is harmless (with `inplace=True` a
# second run raised KeyError because the column was already gone).
lang_bias_helper_df = lang_bias_helper_df.drop(
    columns=['localization_in_title_abstract_or_key'],
    errors='ignore')

In [None]:
# Compare record counts: baseline vs. language-bias-helper query
# (message spelling fixed: "Lenght" -> "Length").
print(f'Length of original df: {len(original_df)}')
print(f'Length of language-bias-helper df: {len(lang_bias_helper_df)}')

## Results after applying publication-bias-helper

In [None]:
# Same retrieval as the baseline, but with the example query first passed
# through the publication-bias tool.
pub_bias_helper_df = f.retrieve_results_from_list_of_queries(
    max_date=max_date,
    list_of_queries=[qh.publication_bias_tool(example_query)],
)

In [None]:
# Compare record counts: baseline vs. publication-bias-helper query
# (message spelling fixed: "Lenght" -> "Length").
print(f'Length of original df: {len(original_df)}')
print(f'Length of publication-bias-helper df: {len(pub_bias_helper_df)}')

In [None]:
# Number of records per distinct value of the 'prism:aggregationType' column.
pub_bias_helper_df['prism:aggregationType'].value_counts()

## Analyze results with the localization-bias-helper

In [None]:
# Results for the predefined localization-bias-helper queries (WEIRD set).
local_bias_helper_df__weird = f.retrieve_results_from_list_of_queries(
    max_date=max_date,
    list_of_queries=q.scopus_local_bias_helper_queries_weird,
)

In [None]:
# Results for the predefined localization-bias-helper queries (non-WEIRD set).
local_bias_helper_df__non_weird = f.retrieve_results_from_list_of_queries(
    max_date=max_date,
    list_of_queries=q.scopus_local_bias_helper_queries_non_weird,
)

In [None]:
# Rows of the WEIRD result set whose 'dc:identifier' also occurs in the
# non-WEIRD result set, i.e. records returned by both groups of queries.
_also_in_non_weird = local_bias_helper_df__weird['dc:identifier'].isin(
    local_bias_helper_df__non_weird['dc:identifier'])
local_bias_helper_df_intersection = local_bias_helper_df__weird[_also_in_non_weird]

In [None]:
# Show the titles of the records found in both result sets so they can be
# inspected by eye.
local_bias_helper_df_intersection['dc:title']

In [None]:
# Remove false positive from WEIRD table
# NOTE(review): the row is identified by the hard-coded index label 14, which
# presumably depends on the order in which the API returned the records —
# confirm it still points at the intended record whenever the queries or
# `max_date` change (matching on 'dc:identifier' would be more robust).
# The index is renumbered afterwards so it is contiguous again.
local_bias_helper_df__weird_curated = local_bias_helper_df__weird.drop(
    index=14).reset_index(drop=True)

In [None]:
# Stack the curated WEIRD rows on top of the non-WEIRD rows, then discard
# rows that are exact duplicates of an earlier row.
_weird_then_non_weird = pd.concat(
    [local_bias_helper_df__weird_curated, local_bias_helper_df__non_weird])
local_bias_helper_df = _weird_then_non_weird.drop_duplicates()

In [None]:
# Attach the localization flags to the baseline records.
#
# The left join keeps every baseline record and matches on all baseline
# columns; baseline records that no localization query returned end up with
# missing values in the columns contributed by `local_bias_helper_df`.
#
# Only those contributed columns are filled with False. A blanket
# `.fillna(False)` over the whole frame would also overwrite genuine missing
# values in the baseline columns with False.
_localization_columns = [
    column for column in local_bias_helper_df.columns
    if column not in original_df.columns]
original_df_with_localization = pd.merge(
    original_df,
    local_bias_helper_df,
    how='left',
    on=original_df.columns.to_list()
    ).fillna({column: False for column in _localization_columns})

In [None]:
# For each localization column: 'sum' (presumably the number of flagged
# records, assuming boolean flags) and 'count' (non-missing entries),
# computed over the baseline records.
original_df_with_localization.agg(
    dict.fromkeys(
        ['localization_in_title_abstract_or_key', 'localization_in_title'],
        ['sum', 'count']))

In [None]:
# For each localization column: 'sum' (presumably the number of flagged
# records, assuming boolean flags) and 'count' (non-missing entries),
# computed over the curated WEIRD records.
local_bias_helper_df__weird_curated.agg(
    dict.fromkeys(
        ['localization_in_title_abstract_or_key', 'localization_in_title'],
        ['sum', 'count']))

In [None]:
# For each localization column: 'sum' (presumably the number of flagged
# records, assuming boolean flags) and 'count' (non-missing entries),
# computed over the non-WEIRD records.
local_bias_helper_df__non_weird.agg(
    dict.fromkeys(
        ['localization_in_title_abstract_or_key', 'localization_in_title'],
        ['sum', 'count']))

## Results after applying availability-bias-helper

In [None]:
# Split the baseline records by the value of their 'openaccess' column:
# 1 goes to the open-access frame, 0 to the closed-access frame.
_openaccess_values = original_df_with_localization['openaccess']
open_access_df = original_df_with_localization[_openaccess_values == 1]
closed_access_df = original_df_with_localization[_openaccess_values == 0]

In [None]:
# Report the size of each access group
# (message spelling fixed: "Lenght" -> "Length").
print(f'Length of open access df: {len(open_access_df)}')
print(f'Length of closed access df: {len(closed_access_df)}')

In [None]:
# lang_bias_helper_df

In [None]:
# local_bias_helper__non_weird_new_records_df = local_bias_helper_df__non_weird[
#     ~local_bias_helper_df__non_weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# local_bias_helper__weird_new_records_df = local_bias_helper_df__weird[
#     ~local_bias_helper_df__weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
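# NOTE: disabled bookkeeping of all result frames. `lang_bias_helper_new_records_df`
# is not defined in any of the cells shown above (not even in a commented-out
# one), so it has to be created before this cell can be re-enabled.
# df_dict = {
#     'original': original_df,
#     'lang_bias_helper': lang_bias_helper_df,
#     'local_bias_helper__non_weird': local_bias_helper_df__non_weird,
#     'local_bias_helper__weird': local_bias_helper_df__weird,
#     'lang_bias_helper_new_records': lang_bias_helper_new_records_df,
#     'local_bias_helper__non_weird_new_records': local_bias_helper__non_weird_new_records_df,
#     'local_bias_helper__weird_new_records': local_bias_helper__weird_new_records_df
#     }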

In [None]:
# if h.save_to_csv:
#     f.export_to_csv(original_df, file_name_prefix + 'original')
#     f.export_to_csv(lang_bias_helper_df, file_name_prefix + 'lang_bias_helper')
#     f.export_to_csv(local_bias_helper_df__weird, file_name_prefix + 'local_bias_helper__weird')
#     f.export_to_csv(local_bias_helper_df__non_weird, file_name_prefix + 'local_bias_helper__non_weird')
#     f.export_to_csv(lang_bias_helper_new_records_df, file_name_prefix + 'lang_bias_helper_new_records')
#     f.export_to_csv(local_bias_helper__non_weird_new_records_df,
#                     file_name_prefix + 'local_bias_helper__non_weird_new_records')
#     f.export_to_csv(local_bias_helper__weird_new_records_df,
#                     file_name_prefix + 'local_bias_helper__weird_new_records')