In [None]:
# Delete ../main.log and ../dump.json from the parent directory
# (presumably leftovers of a previous run — confirm).
# NOTE(review): the logging cell below opens 'main.log' relative to the
# working directory, not '../main.log' — confirm both point at the same file.
%rm ../main.log
%rm ../dump.json

In [None]:
# Execute ../env_variables.py inside this notebook session.
# NOTE(review): presumably this sets the settings/credentials the Scopus
# helpers rely on — confirm against that script.
%run ../env_variables.py

In [None]:
from resources.examples import mergoni_2021_scopus_query, mergoni_2021_max_date
from resources.querying_tools import (
    language_bias_tool,
    publication_bias_tool,
    localization_bias_tool,
)
from resources.scopus_functions import (
    retrieve_results_from_list_of_queries,
    columns_to_hide,
)
import logging

In [None]:
# Set up logging
def configure_main_logger(log_path='main.log'):
    """Configure and return the 'main' logger; safe to call repeatedly.

    The logger accepts every level from DEBUG upwards and gets exactly two
    handlers sharing one format:

    * a file handler writing DEBUG and above to ``log_path``;
    * a console handler that only shows ERROR and above.

    Args:
        log_path: File the file handler writes to. Defaults to 'main.log',
            resolved against the current working directory.

    Returns:
        The configured ``logging.Logger`` named 'main'.
    """
    main_logger = logging.getLogger('main')
    main_logger.setLevel(logging.DEBUG)

    # Re-running this cell used to stack one more pair of handlers on the
    # same logger, so every record was emitted once per run of the cell.
    # Detach and close whatever is attached before adding fresh handlers.
    for stale_handler in list(main_logger.handlers):
        main_logger.removeHandler(stale_handler)
        stale_handler.close()

    formatter = logging.Formatter(
        '%(asctime)s - [%(module)s|%(funcName)s] - %(levelname)s - %(message)s')

    # File handler records everything, debug messages included
    file_handler = logging.FileHandler(log_path)
    file_handler.setLevel(logging.DEBUG)
    # Console handler uses a higher threshold so only errors reach the output
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.ERROR)

    for handler in (file_handler, console_handler):
        handler.setFormatter(formatter)
        main_logger.addHandler(handler)
    return main_logger


logger = configure_main_logger()

In [None]:
# Bind the Mergoni 2021 example query and its max date to the generic names
# the rest of the notebook works with.
original_query, max_date = mergoni_2021_scopus_query, mergoni_2021_max_date

In [None]:
def load_data(query, max_date):
    """Retrieve the results for a single query.

    Wraps ``query`` in a one-element list and forwards it, together with
    ``max_date``, to ``retrieve_results_from_list_of_queries``. Whatever that
    helper returns is handed back unchanged.
    """
    return retrieve_results_from_list_of_queries(
        list_of_queries=[query],
        max_date=max_date,
    )

In [None]:
# Results of the unmodified query; the baseline the bias tools are compared to
data_original = load_data(query=original_query, max_date=max_date)

In [None]:
# Display copy of the original results without the columns in columns_to_hide
data_original_to_display = data_original.drop(columns=columns_to_hide)

In [None]:
import pandas as pd

# Widen the notebook rendering: allow up to 1,000 characters per cell value
# and up to 1,000 columns before pandas truncates the display.
pd.set_option("display.max_colwidth", 1_000)
pd.set_option("display.max_columns", 1_000)

In [None]:
# Preview the first rows of the original results
data_original_to_display.head()

## language-bias-tool

In [None]:
# Derive the language-bias and publication-bias variants of the original query.
# pub_bias_query is only consumed further down, in the publication-bias-tool
# section.
lang_bias_query = language_bias_tool(original_query)
pub_bias_query = publication_bias_tool(original_query)

In [None]:
# Results of the language-bias variant of the query, same max date as baseline
data_lang = load_data(query=lang_bias_query, max_date=max_date)

In [None]:
# Keep only the language-bias rows whose 'dc:identifier' does not occur in
# the original results, then renumber the index from zero.
lang_already_in_original = data_lang['dc:identifier'].isin(
    data_original['dc:identifier'])
data_lang_diff = data_lang[~lang_already_in_original].reset_index(drop=True)

In [None]:
# Display copy of the language-bias-only rows without the hidden columns
data_lang_diff_to_display = data_lang_diff.drop(
    columns=columns_to_hide)

In [None]:
# Preview the first rows found only by the language-bias query
data_lang_diff_to_display.head()

## publication-bias-tool

In [None]:
# Results of the publication-bias variant of the query, same max date as baseline
data_pub = load_data(query=pub_bias_query, max_date=max_date)

In [None]:
# Keep only the publication-bias rows whose 'dc:identifier' does not occur in
# the original results, then renumber the index from zero.
pub_already_in_original = data_pub['dc:identifier'].isin(
    data_original['dc:identifier'])
data_pub_diff = data_pub[~pub_already_in_original].reset_index(drop=True)

In [None]:
# Display copy of the publication-bias-only rows without the hidden columns
data_pub_diff_to_display = data_pub_diff.drop(
    columns=columns_to_hide)

In [None]:
# Preview the first rows found only by the publication-bias query
data_pub_diff_to_display.head()

## localization-bias-tool

In [None]:
# Run the localization-bias tool. Unlike the two tools above, it receives the
# max date as well and its result is used directly as the data set below.
# NOTE(review): presumably it queries Scopus itself and returns a DataFrame
# carrying the 'localized_*' flag columns aggregated further down — confirm.
data_localized = localization_bias_tool(original_query, max_date)

In [None]:
# Display copy of the localization results without the hidden columns
data_localized_to_display = data_localized.drop(
    columns=columns_to_hide)

In [None]:
# (rows, columns) of the localization results after hiding columns
data_localized_to_display.shape

In [None]:
# Sum and count of each localization flag column, one column per flag
localization_flags = (
    'localized_weird',
    'localized_no_weird',
    'localization_in_title',
)
data_localized_to_display.agg(
    {flag: ['sum', 'count'] for flag in localization_flags}
)

In [None]:
# Among the rows flagged 'localized_weird': sum and count of
# 'localization_in_title'
weird_rows = data_localized_to_display['localized_weird']
data_localized_to_display[weird_rows].agg(
    {'localization_in_title': ['sum', 'count']})

In [None]:
# Among the rows flagged 'localized_no_weird': sum and count of
# 'localization_in_title'
no_weird_rows = data_localized_to_display['localized_no_weird']
data_localized_to_display[no_weird_rows].agg(
    {'localization_in_title': ['sum', 'count']})

In [None]:
# Write the localization results (hidden columns removed, no index column)
# to data_localized.csv in the current working directory
data_localized_to_display.to_csv('data_localized.csv', index=False)

In [None]:
! pip install GeonamesCache

In [None]:
import geonamescache

# Load the lookup tables bundled with GeonamesCache.
gc = geonamescache.GeonamesCache()
countries = gc.get_countries_by_names()  # mapping keyed by country name
continents = gc.get_continents()
# get_cities_by_name() requires the name of the city to look up, so calling it
# without an argument raised TypeError. get_cities() returns the full city
# table instead.
cities = gc.get_cities()

In [None]:
# Print every country name known to GeonamesCache.
# Iterating the container directly drops the unused per-country value and
# keeps this cell working if it is re-run after `countries` has been rebound
# to a plain list of names by the cell further down.
for country_name in countries:
    print(country_name)

In [None]:
# Flatten the GeonamesCache tables into plain lists of names.
# NOTE(review): this rebinds `countries`, `continents` and `cities`, which the
# earlier cell bound to the raw lookup tables, so earlier cells that expect
# the tables will not work if re-run after this one.
gc = geonamescache.GeonamesCache()
countries = [country['name'] for country in gc.get_countries().values()]
continents = [continent['name'] for continent in gc.get_continents().values()]
cities = [city['name'] for city in gc.get_cities().values()]

In [None]:
# Check whether the string 'Of' is itself a city name in the list.
# NOTE(review): presumably a probe for common words that would produce false
# positives when matching city names in free text — confirm.
'Of' in cities

In [None]:
import geonamescache
import re

# Name lists read from GeonamesCache, filled on first use so that repeated
# calls to find_location() do not reload the whole dataset every time.
_GAZETTEER = {}


def _load_gazetteer():
    """Return the cached GeonamesCache name lists, loading them on first use."""
    if not _GAZETTEER:
        cache = geonamescache.GeonamesCache()
        _GAZETTEER['countries'] = [
            country['name'] for country in cache.get_countries().values()]
        _GAZETTEER['continents'] = [
            continent['name'] for continent in cache.get_continents().values()]
        _GAZETTEER['cities'] = [
            city['name'] for city in cache.get_cities().values()]
    return _GAZETTEER


def _names_in_text(names, text):
    """Return the distinct names occurring in ``text`` as whole words.

    Names are reported once each, in the order they first appear in ``names``.
    """
    matches = []
    # dict.fromkeys() de-duplicates while preserving order: the city list
    # contains the same name many times (one entry per city), which used to
    # yield the same hit repeatedly and repeat the regex search needlessly.
    for name in dict.fromkeys(names):
        # An empty name would turn the pattern into r'\b\b', which matches
        # any text containing a word, so skip it.
        if name and re.search(r'\b{}\b'.format(re.escape(name)), text):
            matches.append(name)
    return matches


def find_location(text, countries=None, continents=None, cities=None):
    """Find country, continent and city names mentioned in ``text``.

    Matching is case-sensitive and on whole words (regex ``\\b`` boundaries).

    Args:
        text: The string to scan.
        countries: Optional iterable of country names to look for. When
            ``None`` the names come from GeonamesCache.
        continents: Optional iterable of continent names; same default.
        cities: Optional iterable of city names; same default.

    Returns:
        A dict with the keys 'countries', 'continents' and 'cities', each
        holding the list of distinct names found in ``text``.
    """
    # Only touch GeonamesCache when at least one list was not supplied.
    if countries is None or continents is None or cities is None:
        gazetteer = _load_gazetteer()
        if countries is None:
            countries = gazetteer['countries']
        if continents is None:
            continents = gazetteer['continents']
        if cities is None:
            cities = gazetteer['cities']

    return {
        'countries': _names_in_text(countries, text),
        'continents': _names_in_text(continents, text),
        'cities': _names_in_text(cities, text),
    }



In [None]:
# Example usage: scan a sample sentence and print the hits per category
text_to_check = "I love pancakes, the capital of France, located in Europe."
result = find_location(text_to_check)

for label, category in (
    ("Countries:", 'countries'),
    ("Continents:", 'continents'),
    ("Cities:", 'cities'),
):
    print(label, result[category])


## Results after applying availability-bias-tool

In [None]:
# Split the records into open-access (openaccess == 1) and closed-access
# (openaccess == 0) subsets.
# NOTE(review): `original_df_with_localization` is not defined in any cell
# visible in this notebook, so a fresh top-to-bottom run raises NameError
# here. Presumably `data_localized` is the intended frame — confirm.
open_access_df = original_df_with_localization[original_df_with_localization['openaccess'] == 1]
closed_access_df = original_df_with_localization[original_df_with_localization['openaccess'] == 0]

In [None]:
# Report how many records fall in each access category
# (message spelling corrected: "Lenght" -> "Length")
print(f'Length of open access df: {len(open_access_df)}')
print(f'Length of closed access df: {len(closed_access_df)}')

In [None]:
# lang_bias_tool_df

In [None]:
# local_bias_tool__non_weird_new_records_df = local_bias_tool_df__non_weird[
#     ~local_bias_tool_df__non_weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# local_bias_tool__weird_new_records_df = local_bias_tool_df__weird[
#     ~local_bias_tool_df__weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# df_dict = {
#     'original': original_df,
#     'lang_bias_tool': lang_bias_tool_df,
#     'local_bias_tool__non_weird': local_bias_tool_df__non_weird,
#     'local_bias_tool__weird': local_bias_tool_df__weird,
#     'lang_bias_tool_new_records': lang_bias_tool_new_records_df,
#     'local_bias_tool__non_weird_new_records': local_bias_tool__non_weird_new_records_df,
#     'local_bias_tool__weird_new_records': local_bias_tool__weird_new_records_df
#     }

In [None]:
# if h.save_to_csv:
#     f.export_to_csv(original_df, file_name_prefix + 'original')
#     f.export_to_csv(lang_bias_tool_df, file_name_prefix + 'lang_bias_tool')
#     f.export_to_csv(local_bias_tool_df__weird, file_name_prefix + 'local_bias_tool__weird')
#     f.export_to_csv(local_bias_tool_df__non_weird, file_name_prefix + 'local_bias_tool__non_weird')
#     f.export_to_csv(lang_bias_tool_new_records_df, file_name_prefix + 'lang_bias_tool_new_records')
#     f.export_to_csv(local_bias_tool__non_weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__non_weird_new_records')
#     f.export_to_csv(local_bias_tool__weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__weird_new_records')