In [1]:
# Start every run from a clean slate by deleting the artifacts left behind by
# a previous run. A missing file is not an error (e.g. on a fresh checkout),
# and plain Python is used instead of the shell-backed `%rm` alias so the cell
# does not depend on an `rm` command being available.
# NOTE(review): the logging setup further down opens 'main.log' relative to the
# working directory, while this removes '../main.log' — confirm both refer to
# the same file.
import contextlib
import os

for stale_artifact in ('../main.log', '../dump.json'):
    with contextlib.suppress(FileNotFoundError):
        os.remove(stale_artifact)

In [2]:
# Execute the env_variables.py script located one directory up before the
# project imports below.
# NOTE(review): presumably it provides the settings/credentials the `resources`
# helpers rely on — confirm.
%run ../env_variables.py

In [3]:
from resources.examples import mergoni_2021_scopus_query, mergoni_2021_max_date
from resources.querying_tools import (
    language_bias_tool,
    publication_bias_tool,
)
from resources.scopus_functions import (
    retrieve_results_from_list_of_queries,
    columns_to_hide,
)
import logging

In [4]:
# Set up logging for the 'main' logger: everything (DEBUG and up) goes to the
# 'main.log' file, while only ERROR and above is echoed to the console.
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

# `logging.getLogger` returns the same logger object on every call, so handlers
# attached by a previous execution of a setup cell are still present. Detach
# and close them first; otherwise every record would be emitted once per
# execution and the old file handles would stay open.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# One shared format for both destinations
formatter = logging.Formatter(
    '%(asctime)s - [%(module)s|%(funcName)s] - %(levelname)s - %(message)s')

# File handler which logs even debug messages
fh = logging.FileHandler('main.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# Console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
ch.setFormatter(formatter)

# Attach the freshly built handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

In [5]:
# Bind the query and the maximum date of the mergoni_2021 example to the
# generic names used in this notebook.
original_query, max_date = mergoni_2021_scopus_query, mergoni_2021_max_date

In [6]:
def load_data(query, max_date):
    """Retrieve the results for a single query.

    The query is wrapped in a one-element list and forwarded, together with
    ``max_date``, to ``retrieve_results_from_list_of_queries``.

    Args:
        query: The query to run.
        max_date: Passed through unchanged as the ``max_date`` argument.

    Returns:
        Whatever ``retrieve_results_from_list_of_queries`` returns.
    """
    return retrieve_results_from_list_of_queries(
        list_of_queries=[query],
        max_date=max_date,
    )

In [None]:
# Set up logging

# Get the 'main' logger and let it pass every record on to its handlers
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)

# The 'main' logger is a process-wide singleton: if a logging setup cell has
# already run in this kernel, its handlers are still attached. Remove and close
# them before adding new ones so each record is written only once.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Formatter shared by both handlers
formatter = logging.Formatter('%(asctime)s - [%(module)s|%(funcName)s] - %(levelname)s - %(message)s')

# File handler which logs even debug messages
fh = logging.FileHandler('main.log')
fh.setLevel(logging.DEBUG)
fh.setFormatter(formatter)

# Console handler with a higher log level
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
ch.setFormatter(formatter)

# Add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)

In [None]:
import pandas as pd

# Show (practically) untruncated cell contents and all columns when a
# DataFrame is displayed.
pd.set_option('display.max_colwidth', 1_000)
pd.set_option('display.max_columns', 1_000)

In [None]:
# author = 'mergoni'
# NOTE(review): this rebinds `max_date`, which an earlier cell already set from
# `mergoni_2021_max_date` — confirm both hold the same date, or drop one of them.
max_date = '2021-02-01'
# file_name_prefix = author + '_scopus_'

## Retrieve original results from Scopus API

In [None]:
# Query used for every retrieval below. The name is imported directly from
# `resources.examples` at the top of the notebook; no `ex` alias is defined.
example_query = mergoni_2021_scopus_query

In [None]:
# Display the query that the following cells run
example_query

In [None]:
# Retrieve the results of the unmodified example query.
# `retrieve_results_from_list_of_queries` is imported by name at the top of the
# notebook; no `f` alias is defined.
original_df = retrieve_results_from_list_of_queries(
    list_of_queries=[example_query],
    max_date=max_date)

In [None]:
# Drop the column of localization in title, abstract or keywords
# because this data has not yet been retrieved.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# the cell idempotent: re-running it no longer raises a KeyError once the
# column is already gone.
original_df = original_df.drop(
    columns=['localization_in_title_abstract_or_key'],
    errors='ignore')

In [None]:
# Number of records returned by the original query ("Lenght" typo fixed)
print(f'Length of original df: {len(original_df)}')

## Retrieve results after applying language-bias-tool

In [None]:
# Retrieve the results of the example query after passing it through
# `language_bias_tool`. Both functions are imported by name at the top of the
# notebook; no `f` / `qh` aliases are defined.
lang_bias_tool_df = retrieve_results_from_list_of_queries(
    list_of_queries=[language_bias_tool(example_query)],
    max_date=max_date)

In [None]:
# Drop the column of localization in title, abstract or keywords
# because this data has not yet been retrieved.
# Reassigning (instead of `inplace=True`) together with `errors='ignore'` keeps
# the cell idempotent: re-running it no longer raises a KeyError once the
# column is already gone.
lang_bias_tool_df = lang_bias_tool_df.drop(
    columns=['localization_in_title_abstract_or_key'],
    errors='ignore')

In [None]:
# Compare the record counts before and after the language-bias-tool
# ("Lenght" typo fixed)
print(f'Length of original df: {len(original_df)}')
print(f'Length of language-bias-tool df: {len(lang_bias_tool_df)}')

## Results after applying publication-bias-tool

In [None]:
# Retrieve the results of the example query after passing it through
# `publication_bias_tool`. Both functions are imported by name at the top of
# the notebook; no `f` / `qh` aliases are defined.
pub_bias_tool_df = retrieve_results_from_list_of_queries(
    list_of_queries=[publication_bias_tool(example_query)],
    max_date=max_date)

In [None]:
# Compare the record counts before and after the publication-bias-tool
# ("Lenght" typo fixed)
print(f'Length of original df: {len(original_df)}')
print(f'Length of publication-bias-tool df: {len(pub_bias_tool_df)}')

In [None]:
# Number of records per distinct value of the 'prism:aggregationType' column
pub_bias_tool_df['prism:aggregationType'].value_counts()

## Analyze results with the localization-bias-tool

In [None]:
# Retrieve the results of the WEIRD localization queries.
# `retrieve_results_from_list_of_queries` is imported by name at the top of the
# notebook; no `f` alias is defined.
# FIXME(review): `q` is not imported anywhere in the visible notebook either —
# import the module that provides `scopus_local_bias_tool_queries_weird`.
local_bias_tool_df__weird = retrieve_results_from_list_of_queries(
    list_of_queries=q.scopus_local_bias_tool_queries_weird,
    max_date=max_date)

In [None]:
# Retrieve the results of the non-WEIRD localization queries.
# `retrieve_results_from_list_of_queries` is imported by name at the top of the
# notebook; no `f` alias is defined.
# FIXME(review): `q` is not imported anywhere in the visible notebook either —
# import the module that provides `scopus_local_bias_tool_queries_non_weird`.
local_bias_tool_df__non_weird = retrieve_results_from_list_of_queries(
    list_of_queries=q.scopus_local_bias_tool_queries_non_weird,
    max_date=max_date)

In [None]:
# Rows of the WEIRD table whose 'dc:identifier' also occurs in the non-WEIRD
# table, i.e. the records returned by both query sets.
local_bias_tool_df_intersection = local_bias_tool_df__weird.loc[
    lambda weird: weird['dc:identifier'].isin(
        local_bias_tool_df__non_weird['dc:identifier'])
]

In [None]:
# Titles ('dc:title') of the records that appear in both tables
local_bias_tool_df_intersection['dc:title']

In [None]:
# Remove false positive from WEIRD table, then renumber the rows from 0.
# NOTE(review): the row is selected by its hard-coded index label 14, so a
# different record is removed if the retrieved results change — confirm.
local_bias_tool_df__weird_curated = (
    local_bias_tool_df__weird
    .drop(labels=14, axis='index')
    .reset_index(drop=True)
)

In [None]:
# Stack the curated WEIRD table on top of the non-WEIRD table and keep only
# the first occurrence of fully identical rows.
local_bias_tool_df = pd.concat(
    objs=(local_bias_tool_df__weird_curated, local_bias_tool_df__non_weird),
    axis='index',
).drop_duplicates(keep='first')

In [None]:
# Left-join the localization results onto the original records, matching on
# every column of `original_df`; missing values are then replaced by False.
# NOTE(review): `fillna(False)` applies to every column of the merged frame,
# not only to the columns contributed by `local_bias_tool_df` — confirm that
# this is intended.
original_df_with_localization = original_df.merge(
    local_bias_tool_df,
    how='left',
    on=list(original_df.columns),
).fillna(False)

In [None]:
# Sum and count of both localization columns for the original records
original_df_with_localization.agg(
    {column: ['sum', 'count']
     for column in ('localization_in_title_abstract_or_key',
                    'localization_in_title')}
)

In [None]:
# Sum and count of both localization columns for the curated WEIRD table
local_bias_tool_df__weird_curated.agg(
    {column: ['sum', 'count']
     for column in ('localization_in_title_abstract_or_key',
                    'localization_in_title')}
)

In [None]:
# Sum and count of both localization columns for the non-WEIRD table
local_bias_tool_df__non_weird.agg(
    {column: ['sum', 'count']
     for column in ('localization_in_title_abstract_or_key',
                    'localization_in_title')}
)

## Results after applying availability-bias-tool

In [None]:
# Split the records on the 'openaccess' column: rows equal to 1 go to the
# open-access frame, rows equal to 0 to the closed-access frame.
open_access_df = original_df_with_localization.loc[
    original_df_with_localization['openaccess'].eq(1)]
closed_access_df = original_df_with_localization.loc[
    original_df_with_localization['openaccess'].eq(0)]

In [None]:
# Record counts of the open-access and closed-access subsets
# ("Lenght" typo fixed)
print(f'Length of open access df: {len(open_access_df)}')
print(f'Length of closed access df: {len(closed_access_df)}')

In [None]:
# lang_bias_tool_df

In [None]:
# local_bias_tool__non_weird_new_records_df = local_bias_tool_df__non_weird[
#     ~local_bias_tool_df__non_weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# local_bias_tool__weird_new_records_df = local_bias_tool_df__weird[
#     ~local_bias_tool_df__weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# df_dict = {
#     'original': original_df,
#     'lang_bias_tool': lang_bias_tool_df,
#     'local_bias_tool__non_weird': local_bias_tool_df__non_weird,
#     'local_bias_tool__weird': local_bias_tool_df__weird,
#     'lang_bias_tool_new_records': lang_bias_tool_new_records_df,
#     'local_bias_tool__non_weird_new_records': local_bias_tool__non_weird_new_records_df,
#     'local_bias_tool__weird_new_records': local_bias_tool__weird_new_records_df
#     }

In [None]:
# if h.save_to_csv:
#     f.export_to_csv(original_df, file_name_prefix + 'original')
#     f.export_to_csv(lang_bias_tool_df, file_name_prefix + 'lang_bias_tool')
#     f.export_to_csv(local_bias_tool_df__weird, file_name_prefix + 'local_bias_tool__weird')
#     f.export_to_csv(local_bias_tool_df__non_weird, file_name_prefix + 'local_bias_tool__non_weird')
#     f.export_to_csv(lang_bias_tool_new_records_df, file_name_prefix + 'lang_bias_tool_new_records')
#     f.export_to_csv(local_bias_tool__non_weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__non_weird_new_records')
#     f.export_to_csv(local_bias_tool__weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__weird_new_records')