In [155]:
import pandas as pd
import urllib
import requests
from pathlib import Path
import numpy as np
import os
import json
from bs4 import BeautifulSoup
import re

# Metadata Cleaning

1. Unify the file formats (i.e., `json` --> `csv`)
1. Merge metadata from multiple sources (i.e., NIH Common Fund repositories and Database Commons)
1. Add URLs of resources if missing (i.e., Journal homepages using `Sourceid`)
1. Add our own IDs for individual resources by updating (`global_data-portal_id_map.csv`)
1. Add resource connection status

**Old notes:**
Not all existing resources are maintained. We want to first filter out resources that are not working in order to run evaluation with smaller sets of resources.

This notebook uses files under the `input` folder (e.g., `database-commons-nov-21-2023.json`) and generates filtered files under `output` (e.g., `portals-filtered-nov-21-2023.json`).

**To-Do**
- [ ] Merge NIH data portals

In [109]:
# Name of the sub-folder under `../input` that holds the files for this evaluation run
EVALUATION_DATE_FOLDER = "Nov-21-2023"

## Data Portals

In [141]:
# Read the original Database Commons dump for the selected evaluation date
df = (
    pd.read_json(Path('../input', EVALUATION_DATE_FOLDER, 'database-commons.json'))
    .iloc[:10]  # for testing purposes: keep only the first 10 rows
)

In [142]:
# Column names follow the lowercase_with_underscores convention:
# put "_" at every lowercase-to-uppercase boundary, then lowercase the whole name
df.columns = [
    re.sub('(?<=[a-z])(?=[A-Z])', '_', column_name).lower()
    for column_name in df.columns
]

In [143]:
# Remove the columns that are not needed downstream
df = df.drop(columns=['biodb_ranks', 'rating_list'])

In [144]:
# The identifier coming from any source is consistently stored as "source_id".
# Its values are strings that start with a prefix naming the source (e.g., "dc_" for Database Commons).
df = df.rename(columns={'db_id': 'source_id'})
df['source_id'] = df['source_id'].map(lambda raw_id: f'dc_{raw_id}')

In [145]:
# Some Database Commons columns hold a list of JSON objects; we flatten each list into one string
# Example: [{ "id": 1, "name": "foo" }, { "id": 2, "name": "bar" }] --> 'foo, bar'
json_column_names_and_keys = {
    'data_type_list': 'datatypeName',
    'category_list': 'name',
    'keywords_list': 'name',
    'data_object_list': 'name',
    'organism_list': 'organismName',
    'theme_list': 'name'
}


def _join_json_values(items, key):
    """Join the `key` entry of every object in `items` into a ', '-separated string.

    Parameters
    ----------
    items : list of dict, or a missing value
        One cell of a JSON-list column.
    key : str
        Name of the entry to read from each object.

    Returns
    -------
    str
        The joined values, or '' when the cell is not a list (e.g., NaN/None
        for a row that has no list at all).

    Raises
    ------
    KeyError
        If an object in the list lacks `key`; left unhandled on purpose so
        that unexpected input schemas are noticed.
    """
    if not isinstance(items, (list, tuple)):
        # A missing list cannot be iterated; treat it as "no values"
        return ''
    return ', '.join(item[key] for item in items)


for column, key in json_column_names_and_keys.items():
    df[column] = df[column].apply(_join_json_values, key=key)

In [146]:
# Check the connection status and put that as a `reachable` column

# TODO: a faster way to do this?
def check_connection_status(url, timeout=10):
    """Return whether a GET request to `url` is answered with HTTP 200.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests.get` may block indefinitely on an unresponsive host.

    Returns
    -------
    bool
        True only when the final response status code is 200. Any exception
        raised while requesting (timeout, DNS failure, malformed URL, ...)
        yields False.
    """
    print(url)  # progress log: one line per probed URL
    try:
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Best-effort probe: every failure simply means "not reachable"
        return False
    return response.status_code == 200
    
df['reachable'] = df['url'].apply(check_connection_status)

https://david.ncifcrf.gov
https://www.kegg.jp
http://cbioportal.org
https://string-db.org/
https://www.encodeproject.org/
https://www.uniprot.org
https://www.internationalgenome.org
http://pfam.xfam.org/
http://www.arb-silva.de
https://gnomad.broadinstitute.org/


In [179]:
# Create or update the mapping table (i.e., id <==> source_id)

# Load the existing table if there is one; otherwise start from an empty table
file = Path(os.path.join('../output', 'global_data-portal_id_map.csv'))
if file.exists():
    df_map = pd.read_csv(file)
else:
    df_map = pd.DataFrame(columns=['id', 'source_id', 'date_added'])

# Find the source_ids that do not already exist in the table.
# Duplicates are dropped so that every source_id receives exactly one id.
is_new = ~df['source_id'].isin(df_map['source_id'])
df_new_resources = (
    df.loc[is_new, ['source_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# New `id`s continue from the current max `id`, or start at 1 for an empty table.
# `max()` of an empty column is NaN; `pd.isna` detects it reliably, whereas an
# identity check against `np.nan` depends on which NaN object happens to be returned.
max_id = df_map['id'].max()
new_id = 1 if pd.isna(max_id) else int(max_id) + 1

# The range must be as long as the NEW rows (not the whole `df`); otherwise
# `insert` fails with a length mismatch once some resources are already mapped.
df_new_resources.insert(0, 'id', range(new_id, new_id + len(df_new_resources)))
df_new_resources['date_added'] = pd.to_datetime('today').strftime('%m-%d-%Y')

pd.concat([df_map, df_new_resources], axis=0).to_csv(file, index=False)

['dc_238', 'dc_6934']

In [None]:
# Resources to process, one tuple per category:
#   (category, input file, output file, file listing manually chosen websites or None when there is none)
RESOURCES = [
    ('portal', 'database-commons.json', 'portals-metadata.json', 'portals-manually-selected.json'),
    ('journal', 'sjr.json', 'journals-metadata.json', None),
]

## Add Connection Status
We want to exclude resources that are no longer working from the evaluation, so we add each resource's connection status to the metadata.

In [None]:
def connection_status(x, timeout=10):
    """Return the HTTP status code of a GET request to `x`, or -1 on failure.

    Parameters
    ----------
    x : str
        URL to request.
    timeout : float, optional
        Seconds to wait for the server; prevents the call from blocking
        indefinitely on an unresponsive host.

    Returns
    -------
    int
        The response status code, or -1 when the request itself fails
        (connection error, timeout, malformed URL, ...).
    """
    try:
        return requests.get(x, timeout=timeout).status_code
    except requests.exceptions.RequestException:
        # RequestException is the base class of ConnectionError, Timeout, etc.
        return -1

def check_webpage(url, timeout=1):
    """Try to open `url` and report how the attempt ended.

    Parameters
    ----------
    url : str
        Address to open.
    timeout : float, optional
        Seconds to wait before giving up (defaults to the original 1 second).

    Returns
    -------
    int or object
        200 when the URL could be opened, the HTTP error code when the server
        answered with an error, the `reason` of the `URLError` when the URL
        could not be reached, and -1 for any other failure (e.g., a malformed
        or missing URL).
    """
    # A plain `import urllib` does not guarantee that these submodules are loaded
    import urllib.error
    import urllib.request

    try:
        # https://stackoverflow.com/questions/51972160/python-check-if-website-exists-for-a-list-of-websites
        # The context manager closes the response once the probe has succeeded
        with urllib.request.urlopen(url, timeout=timeout):
            pass
    except urllib.error.HTTPError as e:
        return e.code
    except urllib.error.URLError as e:
        return e.reason
    except Exception:
        return -1
    return 200

In [None]:
"""
Using `Sourceid` of SJR, get URLs of individual journal portals
"""
def infer_homepage(Sourceid, timeout=10):
    """Look up a journal's homepage URL on its SJR (scimagojr.com) page.

    Parameters
    ----------
    Sourceid : int or str
        SJR `Sourceid` of the journal.
    timeout : float, optional
        Seconds to wait for scimagojr.com before giving up.

    Returns
    -------
    str or None
        The `href` of the first link whose text contains 'Homepage', or None
        when the page could not be fetched or has no such link.
    """
    info_url = f'https://www.scimagojr.com/journalsearch.php?q={Sourceid}&tip=sid&clean=0'
    try:
        html_text = requests.get(info_url, timeout=timeout).text
    except requests.exceptions.RequestException as error:
        # One unreachable page must not abort the lookup of all other journals
        print(f'Could not fetch the SJR page for {Sourceid}: {error}')
        return None

    soup = BeautifulSoup(html_text, 'html.parser')
    # `string=` is the current name of the argument that used to be called `text=`
    homepage_link = soup.find('a', string=re.compile('Homepage'))
    if homepage_link is None:
        print(f'No homepage found for {Sourceid}')
        return None
    return homepage_link.get('href')

In [None]:
# For every resource category: load its metadata, add URLs and manually chosen
# websites where applicable, probe every URL, and save the reachable ones.
for (
    category,
    input_file,
    output_file,
    manual_file
) in RESOURCES:

    # NOTE(review): these paths do not include EVALUATION_DATE_FOLDER, unlike the
    # cell that loads `database-commons.json` -- confirm which layout is intended.
    input_path = f'../input/{input_file}'
    manual_path = f'../input/{manual_file}'
    output_path = f'../output/{output_file}'

    if os.path.isfile(output_path):
        # it looks like there already is an output file, so skip this resource
        continue

    df = pd.read_json(input_path)

    if category == 'portal':
        # actual data is stored under `data`.
        df = pd.DataFrame.from_dict(df.data.to_dict(), orient='index')
    elif category == 'visualization':
        # actual data is stored under `tools`.
        df = pd.DataFrame.from_dict(df.tools.to_dict(), orient='index')
        # keep only the tools whose `platform` collection contains 'Web'
        df = df[df['platform'].map(lambda x: hasattr(x, "__len__") and 'Web' in x)]
    elif category == 'journal':
        # journal records come without a URL; look it up from the `Sourceid`
        df['url'] = df['Sourceid'].apply(infer_homepage)

    if manual_file is not None:
        # Add manually chosen webpages.
        # `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the replacement.
        manual_selection = pd.read_json(manual_path)
        df = pd.concat([df, manual_selection])

    df['connection'] = df['url'].apply(check_webpage)

    # select websites that are able to connect
    df = df[df.connection == 200]

    df.to_json(output_path, orient="records")