## Parse the submitter table and convert to csv file

In [144]:
# Execute the external parsing script inside the notebook namespace.
# NOTE(review): later cells use `col`, `pd`, `url` and `get_html` without
# defining them in this notebook — presumably they come from this script; confirm.
%run parseSubmittersTable.py

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6]
1:"Submitter"
2:"Maximum review status"
3:"Total submissions"
4:"Submissions with interpretations"
5:"Total Genes"
6:"Last updated"


In [232]:
# Convert the parsed table into a DataFrame: every (title, column) pair in
# `col` becomes one named column.
Dict = dict(col)
submitters = pd.DataFrame(Dict)
submitters.head()

Unnamed: 0,Submitter,Maximum review status,Total submissions,Submissions with interpretations,Total Genes,Last updated
0,Invitae,Assertion criteria,392880,392877,12724,"Feb 06, 2020"
1,Illumina Clinical Services Laboratory; Illumina,Assertion criteria,208171,208171,2299,"Jul 16, 2020"
2,GeneDx,Assertion criteria,121894,121760,26670,"Sep 10, 2020"
3,Ambry Genetics,Assertion criteria,70620,70620,1339,"Jul 28, 2020"
4,EGL Genetic Diagnostics; Eurofins Clinical Dia...,Assertion criteria,45028,45028,2406,"Sep 19, 2018"


## Parse submitters' own page
First we read the HTML page that we want to parse. In this case it's the submitters table for clinvar.

In [233]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Fetch and parse the submitters page. The `with` block closes the HTTP
# response as soon as BeautifulSoup has consumed it.
# NOTE(review): `url` is not defined in this cell — presumably it is set by
# parseSubmittersTable.py; confirm.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'html')
# prettify() re-serialises the parsed document as an indented string
fixed_html = soup.prettify()

# Collect the submitter id from every link in the first table. The hrefs are
# expected to look like "/clinvar/submitters/<id>/", so the id is the fourth
# '/'-separated component. Anchors without an href are skipped instead of
# failing with an AttributeError on None.
links = soup.find('table')
submitters_id = [
    link['href'].split('/')[3]
    for link in links.find_all('a', href=True)
]

# One id per table row is required for the column assignment below.
assert len(submitters_id) == len(submitters), (
    "found %d submitter links for %d table rows"
    % (len(submitters_id), len(submitters))
)

# append the submitters id to the dataframe
submitters['submitters_id'] = submitters_id
submitters

Unnamed: 0,Submitter,Maximum review status,Total submissions,Submissions with interpretations,Total Genes,Last updated,submitters_id
0,Invitae,Assertion criteria,392880,392877,12724,"Feb 06, 2020",500031
1,Illumina Clinical Services Laboratory; Illumina,Assertion criteria,208171,208171,2299,"Jul 16, 2020",504895
2,GeneDx,Assertion criteria,121894,121760,26670,"Sep 10, 2020",26957
3,Ambry Genetics,Assertion criteria,70620,70620,1339,"Jul 28, 2020",61756
4,EGL Genetic Diagnostics; Eurofins Clinical Dia...,Assertion criteria,45028,45028,2406,"Sep 19, 2018",500060
...,...,...,...,...,...,...,...
1690,Genetic Outpatient Clinic; Children's Memorial...,Assertion criteria,1,1,1,"May 03, 2019",506949
1691,"Center of Clinical Laboratory, Zhongshan Hospi...",Assertion criteria,1,1,1,"Jun 10, 2019",506954
1692,"Department of Biology; College of Science, Bag...",-,1,1,1,"Jul 26, 2019",507059
1693,The Genetics Institute; Kaplan Medical center,-,1,1,1,"Jun 11, 2019",507062


## Download the html pages for each submitter
We need to download the HTML files locally because NCBI blocks access whenever I try to scrape the list of submitters.

In [None]:
import os
import requests
import time

# Download one HTML page per submitter into ./htmls, pausing between requests
# so that NCBI does not block us. Pages that are already on disk (and not
# empty) are skipped, which makes the cell safe to re-run after an
# interruption without repeating the downloads that already succeeded.
# NOTE(review): assumes get_html(id, "./htmls") writes "./htmls/<id>.html",
# which is the path the parsing cell reads back — confirm against get_html.
for submitter_id in submitters.submitters_id:
    html_path = os.path.join("./htmls", str(submitter_id) + ".html")
    if os.path.exists(html_path) and os.path.getsize(html_path) > 0:
        continue
    get_html(submitter_id, "./htmls")
    print(submitter_id)
    time.sleep(5)
    

## Parse each submitter's page
The following snippet extracts the country from the HTML file of each submitter. The address info is formatted under the class `col four_col`, which I use to extract the country data. I tried the 'Spacy' and 'geograpy' libraries, but they do not seem to recognise the country names, neither from the unformatted HTML content nor after extracting the text with Beautiful Soup. The geotext library, however, seems to work well. I had to try different separators; the only one that works properly is the newline character `\n`. Sometimes there is some confusion, as the names of some cities correspond to countries. The content appended to the `countries_full_name` and `countries_alpha3_code` lists comes from the pycountry database, which is used to normalize the names and the alpha_3 codes of the countries.

In [152]:
import pycountry
from geotext import GeoText

# Parallel lists, one entry per submitter in dataframe order; an empty string
# marks a submitter whose country could not be determined automatically.
countries_full_name = []
countries_alpha3_code = []

base_url = "./htmls/"
countries=[]
for submitter_id in submitters.submitters_id:
    url_submitter = base_url+submitter_id+".html"
    # Open the cached page; download it first only when it is missing.
    try:
        html_file = open(url_submitter)
    except FileNotFoundError:
        get_html(submitter_id, "./htmls")
        html_file = open(url_submitter)
    # The file is closed as soon as it has been parsed, so handles do not
    # accumulate over the whole submitter list.
    with html_file:
        soup = BeautifulSoup(html_file, 'html')

    # The address block lives in <div class="col four_col">. When the div is
    # absent, fall back to empty text so the submitter is reported as missing
    # instead of aborting the whole run.
    mysoup = soup.find('div', attrs={'class':'col four_col'})
    soup_text = mysoup.get_text('\n') if mysoup is not None else ''
    places = GeoText(soup_text)
    try:
        # Keep the most frequently mentioned country code.
        alpha2 = max(places.country_mentions, key=places.country_mentions.get)
        country = pycountry.countries.get(alpha_2=alpha2)
        full_name, alpha3 = country.name, country.alpha_3
    except (ValueError, LookupError, AttributeError):
        # ValueError: no country mentioned at all (max() of an empty mapping);
        # LookupError / AttributeError: the code is unknown to pycountry.
        full_name, alpha3 = '', ''
        link_to_url_submitter = "https://www.ncbi.nlm.nih.gov/clinvar/submitters/"+submitter_id+"/"
        print(link_to_url_submitter)
    # Append to both lists together so they always stay the same length.
    countries_full_name.append(full_name)
    countries_alpha3_code.append(alpha3)


https://www.ncbi.nlm.nih.gov/clinvar/submitters/504863/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505698/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505304/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505765/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505696/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505483/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505793/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/504869/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505841/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505460/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/504839/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505344/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505718/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505664/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505442/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505878/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/505205/
https://www.ncbi.nlm.nih.gov/clinvar/submitters/

There are 94 submitters with missing data after applying the scraping algorithm. These were curated manually and saved in the `missing_submitters.csv` file. Now we can add the country data to the submitters dataframe, then replace the missing data with the corresponding entries from the `missing_submitters.csv` table.

In [234]:
submitters["country_name"] = countries_full_name
submitters["alpha_3"] = countries_alpha3_code

# Read the manually curated annotations; ids are kept as strings so they
# compare equal to the scraped `submitters_id` values.
missing_submitters = pd.read_csv("missing_submitters.csv", dtype={'submitters_id': object})

# Overwrite the empty country fields of the first matching submitter with the
# curated values. Both lookups are validated before anything is written, so a
# bad entry in the CSV cannot leave a row half-updated.
for curated in missing_submitters.itertuples(index=False):
    matching_rows = submitters.index[submitters.submitters_id == curated.submitters_id]
    if len(matching_rows) == 0:
        raise KeyError(
            "submitter id %r from missing_submitters.csv is not in the submitters table"
            % (curated.submitters_id,)
        )
    country = pycountry.countries.get(alpha_3=curated.alpha_3)
    if country is None:
        raise ValueError(
            "unknown alpha_3 code %r for submitter id %r"
            % (curated.alpha_3, curated.submitters_id)
        )
    row_label = matching_rows[0]
    submitters.at[row_label, 'alpha_3'] = curated.alpha_3
    submitters.at[row_label, 'country_name'] = country.name

Now we merge the continent data

In [240]:
# Left-join the geodata table onto the submitters through the shared
# `alpha_3` column (every submitter row is kept), then write the combined
# table to disk without the index column.
continents = pd.read_csv("./geodata.csv")
final_table = pd.merge(
    submitters,
    continents,
    on='alpha_3',
    how='left',
)
final_table.to_csv("Clinvar_submitters.csv", index=False)

# Extracting the submitters gene/phenotypes tables

Requires the downloaded HTML files for each individual submitter you want to parse. Here I put them in the `htmls` directory, but without uploading it to the repo.

In [3]:
# Execute the external helper script inside the notebook namespace.
# NOTE(review): the next cell calls `searchTables` and `extracHtmlTable`, which
# are not defined in this notebook — presumably they come from this script; confirm.
%run extract_gene_disease_tables.py

In [4]:
import pandas as pd
from bs4 import BeautifulSoup

# Reload the saved submitters table, replacing its header row with
# identifier-friendly column names.
submitters = pd.read_csv("Clinvar_submitters.csv", skiprows=1,  names=["Submitter", "Maximum_review_status",
                                                          "Total_submissions", "Submissions_with_interpretations", 
                                                         "Total_Genes", "Last_updated", "submitters_id", "country_name",  "alpha_3", "continent"])
Africa_snp = submitters[submitters.continent == "Africa"]
Africa_submitters_ids = Africa_snp.submitters_id

# Collect one frame per submitter and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
gene_frames = []
phenotype_frames = []

for submitter_id in Africa_submitters_ids:
    path_to_html_file = "./htmls/"+str(submitter_id)+".html"
    with open(path_to_html_file, 'r') as html:
        soup = BeautifulSoup(html, 'html')
    # extract the gene and phenotype tables from the BeautifulSoup object
    genes, phenotypes = searchTables(soup, gene_table_id_attribute ="gene_sub_table",
                                    pheno_table_id_attribute = "pheno_sub_table")
    gene_df_submitter = pd.DataFrame( extracHtmlTable(genes) )
    gene_df_submitter["submitters_id"] = submitter_id
    phenotype_df_submitter = pd.DataFrame( extracHtmlTable(phenotypes) )
    phenotype_df_submitter["submitters_id"] = submitter_id
    gene_frames.append(gene_df_submitter)
    phenotype_frames.append(phenotype_df_submitter)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# submitter matched the continent filter.
genes_df = pd.concat(gene_frames) if gene_frames else pd.DataFrame()
phenotypes_df = pd.concat(phenotype_frames) if phenotype_frames else pd.DataFrame()

# save to csv file
genes_df.to_csv("clinVar_genes_byAfricans.csv", index=False)
phenotypes_df.to_csv("clinVar_phenotypes_byAfricans.csv", index= False)