In [1]:
import os
import re
import urllib
import urllib.error
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data Preprocess
Use the files under the `input/original` folder to generate metadata files under `output` (e.g., `portals-metadata.json`).

In [None]:
# One entry per resource category. Each entry is a 4-tuple:
#   (category, input file, output file, manual selection file)
# The last element names a JSON file with manually chosen websites that are
# added to the category's data, or is None when there is no such file.
RESOURCES = [
    ('portal', 'database-commons.json', 'portals-metadata.json', 'portals-manually-selected.json'),
    ('visualization', 'awesome-genome-visualization.json', 'visualizations-metadata.json', None),
    ('journal', 'sjr.json', 'journals-metadata.json', None),
]

## Add Connection Status
We want to exclude resources that are no longer reachable from the evaluation, so we record the connection status of each resource's URL.

In [None]:
def connection_status(x, timeout=10):
    """
    Send a GET request to the URL `x` and return the HTTP status code.

    `timeout` is the number of seconds to wait for the server; without it,
    `requests.get` may block indefinitely on an unresponsive host.

    Returns -1 when the request fails for any reason reported by `requests`
    (connection error, timeout, malformed URL, ...).
    """
    try:
        return requests.get(x, timeout=timeout).status_code
    except requests.exceptions.RequestException:
        # RequestException is the base class of ConnectionError, Timeout,
        # MissingSchema, etc., so a single bad URL cannot abort a batch run.
        return -1

def check_webpage(url, timeout=1):
    """
    Check whether `url` can be opened and report the outcome.

    `timeout` is the number of seconds passed to `urlopen` (default 1).

    Returns
      - 200 if the URL was opened successfully,
      - the HTTP error code if the server answered with an HTTP error,
      - the `reason` attribute of the `URLError` for other URL errors,
      - -1 if any other exception occurred (e.g., `url` is not a valid URL).
    """
    try:
        # https://stackoverflow.com/questions/51972160/python-check-if-website-exists-for-a-list-of-websites
        # The `with` block closes the response so that checking many URLs
        # does not leave connections open.
        with urllib.request.urlopen(url, timeout=timeout):
            return 200
    except urllib.error.HTTPError as e:
        # HTTPError is a subclass of URLError, so it must be handled first.
        return e.code
    except urllib.error.URLError as e:
        return e.reason
    except Exception:
        return -1

In [None]:
"""
Using `Sourceid` of SJR, get URLs of individual journal portals
"""
def infer_homepage(Sourceid, timeout=10):
    """
    Look up the homepage URL of a journal from its SJR `Sourceid`.

    Fetches the SJR journal search page for `Sourceid` and returns the `href`
    of the first link whose text contains 'Homepage'.

    `timeout` is the number of seconds to wait for the SJR server.

    Returns None (after printing a message) when the page cannot be fetched
    or when it contains no such link.
    """
    info_url = f'https://www.scimagojr.com/journalsearch.php?q={Sourceid}&tip=sid&clean=0'
    try:
        html_text = requests.get(info_url, timeout=timeout).text
    except requests.exceptions.RequestException as e:
        # A single failed lookup should not abort processing of all journals.
        print(f'Failed to fetch SJR page for {Sourceid}: {e}')
        return None

    soup = BeautifulSoup(html_text, 'html.parser')
    # `string=` is the current name of the deprecated `text=` argument.
    homepage_link = soup.find('a', string=re.compile('Homepage'))
    if homepage_link is None:
        print(f'No homepage found for {Sourceid}')
        return None
    return homepage_link.get('href')

In [None]:
# For every resource category: load the input file, normalize it into a flat
# DataFrame with a `url` column, check each URL, and write the reachable
# entries to the output file.
for (
    category,
    input_file,
    output_file,
    manual_file
) in RESOURCES:

    input_path = f'../input/{input_file}'
    output_path = f'../output/{output_file}'

    if os.path.isfile(output_path):
        # An output file already exists for this category; do not regenerate it.
        continue

    df = pd.read_json(input_path)

    if category == 'portal':
        # actual data is stored under `data`.
        df = pd.DataFrame.from_dict(df.data.to_dict(), orient='index')
    elif category == 'visualization':
        # actual data is stored under `tools`.
        df = pd.DataFrame.from_dict(df.tools.to_dict(), orient='index')
        # keep only entries whose `platform` is a sized value containing 'Web'
        # (the `__len__` check skips values such as NaN)
        df = df[df['platform'].map(lambda x: hasattr(x, "__len__") and 'Web' in x)]
    elif category == 'journal':
        # journals have no URL in the input, so look it up via `Sourceid`
        df['url'] = df['Sourceid'].apply(infer_homepage)

    if manual_file is not None:
        # Add manually chosen webpages
        manual_path = f'../input/{manual_file}'
        manual_selection = pd.read_json(manual_path)
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the
        # equivalent that works across pandas versions.
        df = pd.concat([df, manual_selection])

    df['connection'] = df['url'].apply(check_webpage)

    # select websites that are able to connect
    df = df[df.connection == 200]

    df.to_json(output_path, orient="records")