In [1]:
# Delete the log and dump files in the parent directory (presumably left over
# from a previous run) so this session starts clean.
# NOTE(review): the logging cell below opens 'main.log' relative to the
# working directory, while this removes '../main.log' — confirm both refer
# to the same file.
%rm ../main.log
%rm ../dump.json

In [2]:
# Execute the project's env_variables.py script before anything else.
# NOTE(review): presumably sets credentials/configuration used by the
# resources.* helpers — confirm.
%run ../env_variables.py

In [3]:
from resources.examples import mergoni_2021_scopus_query, mergoni_2021_max_date
from resources.querying_tools import (
    language_bias_tool,
    publication_bias_tool,
    localization_bias_tool,
)
from resources.scopus_functions import (
    retrieve_results_from_list_of_queries,
    columns_to_hide,
)
import logging

In [4]:
# Set up logging.
# The 'main' logger records everything (DEBUG and above) to main.log and
# echoes only ERROR and above to the console.
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same logger object on every call, so
# re-running this cell would otherwise stack another pair of handlers on it
# and every record would be emitted several times. Detach and close whatever
# is already attached before configuring again.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()
# File handler: logs even debug messages.
fh = logging.FileHandler('main.log')
fh.setLevel(logging.DEBUG)
# Console handler with a higher log level.
ch = logging.StreamHandler()
ch.setLevel(logging.ERROR)
# One shared format for both handlers.
formatter = logging.Formatter(
    '%(asctime)s - [%(module)s|%(funcName)s] - %(levelname)s - %(message)s')
fh.setFormatter(formatter)
ch.setFormatter(formatter)
# Attach the handlers to the logger.
logger.addHandler(fh)
logger.addHandler(ch)

In [5]:
# Use the Mergoni (2021) example from resources.examples as the baseline:
# its Scopus query and the max date that goes with it.
original_query, max_date = (
    mergoni_2021_scopus_query,
    mergoni_2021_max_date,
)

In [6]:
def load_data(query, max_date):
    """Fetch the results of a single query.

    Wraps ``query`` in a one-element list and forwards it, together with
    ``max_date``, to ``retrieve_results_from_list_of_queries``. Whatever that
    helper returns is handed back unchanged.
    """
    return retrieve_results_from_list_of_queries(
        list_of_queries=[query],
        max_date=max_date,
    )

In [7]:
# Baseline result set: the records returned for the unmodified query.
data_original = load_data(query=original_query, max_date=max_date)

In [8]:
# Copy of the baseline results without the columns listed in columns_to_hide,
# used only for on-screen inspection.
data_original_to_display = data_original.drop(columns_to_hide, axis='columns')

In [9]:
import pandas as pd

# Widen the notebook's DataFrame rendering so long titles and wide frames
# are not truncated.
pd.set_option('display.max_colwidth', 1_000)
pd.set_option('display.max_columns', 1_000)

In [10]:
# Preview the first rows of the baseline results.
data_original_to_display.head()

Unnamed: 0,dc:identifier,dc:title,dc:creator,prism:publicationName,prism:coverDate,prism:aggregationType,subtypeDescription,prism:doi,eid
0,SCOPUS_ID:85094315111,Development and validation of an index to measure agricultural sustainability,Valizadeh N.,Journal of Cleaner Production,2021-01-20,Journal,Article,10.1016/j.jclepro.2020.123797,2-s2.0-85094315111
1,SCOPUS_ID:85099582835,The impact of airport managerial type and airline market share on airport efficiency,Park J.H.,Sustainability (Switzerland),2021-01-02,Journal,Article,10.3390/su13020981,2-s2.0-85099582835
2,SCOPUS_ID:85120998293,Manufacturing enterprise performance using network DEA: A profitability and marketability framework,Hanoum S.,International Journal of Business Excellence,2021-01-01,Journal,Article,10.1504/IJBEX.2021.119457,2-s2.0-85120998293
3,SCOPUS_ID:85114670899,Regional Variation in the Carbon Dioxide Emission Efficiency of Construction Industry in China: Based on the Three-Stage DEA Model,Zhou W.,Discrete Dynamics in Nature and Society,2021-01-01,Journal,Retracted,10.1155/2021/4021947,2-s2.0-85114670899
4,SCOPUS_ID:85107945991,Water-energy-food nexus and eco-sustainability: A three-stage dual-boundary network DEA model for evaluating Jiangsu Province in China,Li J.,International Journal of Computational Intelligence Systems,2021-01-01,Journal,Article,10.2991/ijcis.d.210423.005,2-s2.0-85107945991


## language-bias-tool

In [None]:
# Derive the two alternative queries from the original one: the first via
# the language-bias tool, the second via the publication-bias tool.
lang_bias_query, pub_bias_query = (
    language_bias_tool(original_query),
    publication_bias_tool(original_query),
)

In [None]:
# Results of the language-bias variant of the query, same max date.
data_lang = load_data(query=lang_bias_query, max_date=max_date)

In [None]:
# Keep only the rows whose 'dc:identifier' does not appear in the baseline
# results, i.e. what the language-bias query adds, and renumber them.
data_lang_diff = (
    data_lang
    .loc[~data_lang['dc:identifier'].isin(data_original['dc:identifier'])]
    .reset_index(drop=True)
)

In [None]:
# Display copy of the language-bias additions, without the hidden columns.
data_lang_diff_to_display = data_lang_diff.drop(columns_to_hide, axis='columns')

In [None]:
# Preview the first rows added by the language-bias query.
data_lang_diff_to_display.head()

## publication-bias-tool

In [None]:
# Results of the publication-bias variant of the query, same max date.
data_pub = load_data(query=pub_bias_query, max_date=max_date)

In [None]:
# Keep only the rows whose 'dc:identifier' does not appear in the baseline
# results, i.e. what the publication-bias query adds, and renumber them.
data_pub_diff = (
    data_pub
    .loc[~data_pub['dc:identifier'].isin(data_original['dc:identifier'])]
    .reset_index(drop=True)
)

In [None]:
# Display copy of the publication-bias additions, without the hidden columns.
data_pub_diff_to_display = data_pub_diff.drop(columns_to_hide, axis='columns')

In [None]:
# Preview the first rows added by the publication-bias query.
data_pub_diff_to_display.head()

## localization-bias-tool

In [11]:
# Run the localization-bias tool on the original query. Unlike the other two
# tools, it receives max_date itself and its result is used directly as a
# DataFrame in the following cells rather than being passed to load_data.
data_localized = localization_bias_tool(original_query, max_date)

In [12]:
# Display copy of the localization results, without the hidden columns.
data_localized_to_display = data_localized.drop(columns_to_hide, axis='columns')

In [13]:
# (rows, columns) of the localization results being displayed.
data_localized_to_display.shape

(101, 12)

In [14]:
# For each localization flag column, report its sum and its count of
# non-null entries.
data_localized_to_display.agg(
    {
        flag_column: ['sum', 'count']
        for flag_column in (
            'localized_weird',
            'localized_no_weird',
            'localization_in_title',
        )
    }
)

Unnamed: 0,localized_weird,localized_no_weird,localization_in_title
sum,32,75,56
count,101,101,101


In [15]:
# Restrict to the rows flagged 'localized_weird', then report the sum and
# count of 'localization_in_title' within that subset.
(
    data_localized_to_display
    .loc[data_localized_to_display['localized_weird']]
    .agg({'localization_in_title': ['sum', 'count']})
)

Unnamed: 0,localization_in_title
sum,13
count,32


In [16]:
# Restrict to the rows flagged 'localized_no_weird', then report the sum and
# count of 'localization_in_title' within that subset.
(
    data_localized_to_display
    .loc[data_localized_to_display['localized_no_weird']]
    .agg({'localization_in_title': ['sum', 'count']})
)

Unnamed: 0,localization_in_title
sum,46
count,75


In [17]:
# Persist the displayed localization results (without the index column) to
# data_localized.csv in the working directory.
data_localized_to_display.to_csv(
    path_or_buf='data_localized.csv',
    index=False,
)

In [21]:
from resources.querying_tools import (
    find_localization_in_text,
    remove_accents_and_special_chars,
)

In [19]:
# Probe the localization detector with a title that mentions "U.S.".
# NOTE(review): the recorded output is False, which looks like a missed
# localization — the cells below investigate why.
find_localization_in_text('Operational performance of U.S. public rail transit and implications for public policy')

False

In [23]:
# Apply the project's normalisation helper to the same title to inspect the
# text that the detector presumably works on (TODO confirm against
# find_localization_in_text).
text = remove_accents_and_special_chars('Operational performance of U.S. public rail transit and implications for public policy')

In [24]:
# Lower-case the normalised title and split it on whitespace into tokens.
text_words = text.lower().split()

In [25]:
# Show the tokens; per the recorded output, "U.S." has become the token 'us'.
text_words

['operational',
 'performance',
 'of',
 'us',
 'public',
 'rail',
 'transit',
 'and',
 'implications',
 'for',
 'public',
 'policy']

In [26]:
from resources.country_lists import (
        countries, demonyms, continents_names, continents_demonyms,
)

In [28]:
# Concatenate every place-name collection imported from
# resources.country_lists into one sequence of candidate locations.
list_of_locations = countries + demonyms + continents_names + continents_demonyms

In [31]:
# Lower-cased view of every candidate location, for visual inspection.
list(map(str.lower, list_of_locations))

['afghanistan',
 'albania',
 'algeria',
 'andorra',
 'angola',
 'antigua and barbuda',
 'argentina',
 'armenia',
 'australia',
 'austria',
 'azerbaijan',
 'bahamas',
 'bahrain',
 'bangladesh',
 'barbados',
 'belarus',
 'belgium',
 'belize',
 'benin',
 'bhutan',
 'bolivia',
 'bosnia and herzegovina',
 'botswana',
 'brazil',
 'brunei',
 'bulgaria',
 'burkina faso',
 'burundi',
 'cabo verde',
 'cambodia',
 'cameroon',
 'canada',
 'central african republic',
 'chad',
 'chile',
 'china',
 'colombia',
 'comoros',
 'congo',
 'costa rica',
 'croatia',
 'cuba',
 'cyprus',
 'czech republic',
 'czechia',
 'denmark',
 'djibouti',
 'dominica',
 'dominican republic',
 'ecuador',
 'egypt',
 'el salvador',
 'equatorial guinea',
 'eritrea',
 'estonia',
 'eswatini',
 'ethiopia',
 'fiji',
 'finland',
 'france',
 'gabon',
 'gambia',
 'georgia',
 'germany',
 'ghana',
 'greece',
 'grenada',
 'guatemala',
 'guinea',
 'guinea-bissau',
 'guyana',
 'haiti',
 'honduras',
 'hungary',
 'iceland',
 'india',
 'indon

In [30]:
# True when at least one lower-cased location equals one of the title tokens.
# NOTE(review): text_words holds single whitespace-separated tokens, so
# multi-word entries such as 'antigua and barbuda' can never match here.
not set(map(str.lower, list_of_locations)).isdisjoint(text_words)

False

In [None]:
# Use the %pip magic rather than a shell `! pip`: %pip installs into the
# environment of the running kernel, which a bare shell pip does not
# guarantee.
%pip install GeonamesCache

In [None]:
import geonamescache

# Load the lookup tables bundled with geonamescache.
# NOTE(review): `countries` rebinds the name imported earlier from
# resources.country_lists — re-running the list_of_locations cell after this
# one will pick up this object instead.
gc = geonamescache.GeonamesCache()
countries = gc.get_countries_by_names()
continents = gc.get_continents()
# get_cities_by_name() requires the name to look up, so calling it with no
# argument raises TypeError; get_cities() returns the whole city table.
cities = gc.get_cities()

In [None]:
# List every country name known to geonamescache (the keys of the
# name-indexed mapping); the per-country details are not needed here.
for country_name in countries.keys():
    print(country_name)

In [None]:
# Flatten each geonamescache table into a plain list of its 'name' fields.
# NOTE(review): this rebinds `countries`, `continents` and `cities` from
# mappings to lists, so cells that expect the earlier objects must not be
# re-run after this one.
gc = geonamescache.GeonamesCache()
countries = [entry['name'] for entry in gc.get_countries().values()]
continents = [entry['name'] for entry in gc.get_continents().values()]
cities = [entry['name'] for entry in gc.get_cities().values()]

In [None]:
# Check whether the string 'Of' is one of the city names — presumably to
# gauge the risk of common English words being mistaken for places.
'Of' in cities

In [None]:
import geonamescache
import re

def find_location(text, cache=None):
    """Find the country, continent and city names mentioned in ``text``.

    Matching is case-sensitive and whole-word: a name only counts when it is
    not glued to another word character on either side, so "Paris" is not
    found inside "Parisian".

    Args:
        text: The string to scan. An empty or ``None`` value yields no matches.
        cache: Optional object exposing ``get_countries()``,
            ``get_continents()`` and ``get_cities()``, each returning a
            mapping whose values have a ``'name'`` entry. Defaults to a new
            ``geonamescache.GeonamesCache()``; pass one shared instance when
            scanning many texts so it is not rebuilt on every call.

    Returns:
        A dict with the keys ``'countries'``, ``'continents'`` and
        ``'cities'``, each holding the list of distinct names found, in the
        order the cache provides them.
    """
    if cache is None:
        cache = geonamescache.GeonamesCache()

    tables = {
        'countries': cache.get_countries(),
        'continents': cache.get_continents(),
        'cities': cache.get_cities(),
    }
    return {
        kind: _names_found_in(text, (entry['name'] for entry in table.values()))
        for kind, table in tables.items()
    }


def _names_found_in(text, names):
    """Return the distinct ``names`` that occur in ``text`` as whole words."""
    if not text:
        return []

    hits = []
    # dict.fromkeys drops repeated names (several cities can share one name)
    # while keeping first-seen order, so each name is reported once.
    for name in dict.fromkeys(names):
        # Lookarounds instead of \b: \b only works next to a word character,
        # so a name ending in punctuation (e.g. "Washington, D.C.") would
        # fail to match when followed by a space.
        pattern = r'(?<!\w)' + re.escape(name) + r'(?!\w)'
        if re.search(pattern, text):
            hits.append(name)
    return hits



In [None]:
# Example usage: scan a sample sentence with find_location and print the
# matches found in each category.
text_to_check = "I love pancakes, the capital of France, located in Europe."
result = find_location(text_to_check)

print("Countries:", result['countries'])
print("Continents:", result['continents'])
print("Cities:", result['cities'])


## Results after applying availability-bias-tool

In [None]:
# Split the records on the 'openaccess' column: value 1 goes to
# open_access_df, value 0 to closed_access_df.
# NOTE(review): original_df_with_localization is not defined in the visible
# part of this notebook — confirm where it comes from before running.
open_access_df = original_df_with_localization.loc[
    original_df_with_localization['openaccess'].eq(1)
]
closed_access_df = original_df_with_localization.loc[
    original_df_with_localization['openaccess'].eq(0)
]

In [None]:
# Report how many records fall in each access group.
# (Fixes the misspelled "Lenght" label in the printed output.)
print(f'Length of open access df: {len(open_access_df)}')
print(f'Length of closed access df: {len(closed_access_df)}')

In [None]:
# lang_bias_tool_df

In [None]:
# local_bias_tool__non_weird_new_records_df = local_bias_tool_df__non_weird[
#     ~local_bias_tool_df__non_weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# local_bias_tool__weird_new_records_df = local_bias_tool_df__weird[
#     ~local_bias_tool_df__weird['dc:identifier'].isin(original_df['dc:identifier'])
#     ].copy().reset_index(drop=True)

In [None]:
# df_dict = {
#     'original': original_df,
#     'lang_bias_tool': lang_bias_tool_df,
#     'local_bias_tool__non_weird': local_bias_tool_df__non_weird,
#     'local_bias_tool__weird': local_bias_tool_df__weird,
#     'lang_bias_tool_new_records': lang_bias_tool_new_records_df,
#     'local_bias_tool__non_weird_new_records': local_bias_tool__non_weird_new_records_df,
#     'local_bias_tool__weird_new_records': local_bias_tool__weird_new_records_df
#     }

In [None]:
# if h.save_to_csv:
#     f.export_to_csv(original_df, file_name_prefix + 'original')
#     f.export_to_csv(lang_bias_tool_df, file_name_prefix + 'lang_bias_tool')
#     f.export_to_csv(local_bias_tool_df__weird, file_name_prefix + 'local_bias_tool__weird')
#     f.export_to_csv(local_bias_tool_df__non_weird, file_name_prefix + 'local_bias_tool__non_weird')
#     f.export_to_csv(lang_bias_tool_new_records_df, file_name_prefix + 'lang_bias_tool_new_records')
#     f.export_to_csv(local_bias_tool__non_weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__non_weird_new_records')
#     f.export_to_csv(local_bias_tool__weird_new_records_df,
#                     file_name_prefix + 'local_bias_tool__weird_new_records')