# Annotate the leftover variants from the Rafique table using PubMed

In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import re
import json
import time
import ast

## 1. Select the unannotated variants from the annotated Rafique table

In [None]:
# Load the annotated Rafique table; the columns at positions 0-10 are passed
# through str() so pandas does not infer numeric types for them.
Rafique_annotated = pd.read_csv(
    'Rafique_with_rs.csv',
    converters=dict.fromkeys(range(11), str),
    low_memory=False,
)
Rafique_annotated

In [None]:
# Keep 'ensembl_id' values that start with 'rs' and blank out everything else,
# so that unannotated rows can be selected with a plain == '' comparison.
ids_without_nan = [
    value if str(value).startswith('rs') else ''
    for value in Rafique_annotated['ensembl_id']
]
Rafique_annotated['ensembl_id'] = ids_without_nan

In [None]:
# Rows whose 'ensembl_id' is blank are the variants still lacking an rs identifier.
has_no_rs = Rafique_annotated['ensembl_id'] == ''
leftover_var_Rafique = Rafique_annotated.loc[has_no_rs].reset_index(drop=True)
leftover_var_Rafique

In [None]:
# Save the unannotated variants (with header row, without the index column).
leftover_var_Rafique.to_csv('Rafique_without_rs.csv', header=True, index=False)

In [None]:
# Make a list of reference numbers, used later to select the matching PMIDs.
# Two spellings of the 'Reference' value are handled:
#   * a value containing '-': its first character is dropped and the rest kept
#     (presumably values such as '-12' -- confirm against the table);
#   * a value containing '(': every run of digits in it is collected.

ref_list = []
for ref in leftover_var_Rafique['Reference']:
    if '-' in ref:
        ref_list.append(ref[1:])
    if '(' in ref:
        ref_list.extend(re.findall(r'[0-9]+', ref))

# Deduplicate, then parse with int() rather than eval(): these strings come
# from a data file and must be read as numbers, never executed as Python code
# (eval also rejects zero-padded numbers such as '012').
left_refs_int = [int(i) for i in set(ref_list)]
left_refs_int

In [None]:
# Persist the leftover reference numbers as a one-column CSV.
leftover_refs_df = pd.DataFrame(left_refs_int)
leftover_refs_df.to_csv('leftover_refs.csv', header=True, index=False)

## 2. Take the bibliography from Rafique and extract paper titles

Take the bibliography from Supplementary 2; it is not the same as the one in the paper!
Create a dataframe with the titles and their numbers in the bibliography, so that they can be matched to the reference numbers in the supplementary table.

In [None]:
# Read the bibliography. The context manager closes the file handle as soon as
# the lines are read (a bare open(...).readlines() leaves it open).
with open('input/references_Rafique.txt') as bibliography_file:
    bibliography = bibliography_file.readlines()

# Remove square brackets and turn '?' and '!' into '.', so that '.' is the only
# separator the split below has to deal with.
clean_bibliography = [
    line.replace('[', '').replace(']', '').replace('?', '.').replace('!', '.')
    for line in bibliography
]

columns = ['number', 'title']
df_data = []
for line in clean_bibliography:
    parts = line.split('.')
    # The text before the first '.' is taken as the bibliography number.
    number = parts[0]
    # The title is rebuilt from the pieces between the second '.' and the last
    # three pieces (presumably skipping the author list in front and the
    # journal/year details at the end -- confirm against the input file).
    title = ' '.join(parts[2:-3])
    df_data.append([number, title])
number_titles = pd.DataFrame(data=df_data, columns=columns)
number_titles

In [None]:
# Save the number/title table extracted from the bibliography.
number_titles.to_csv('whole_pipeline_311022/bibliography_df.csv', header=True, index=False)

Query the PubMed API to get PMIDs based on the titles

In [None]:
# Query the PubMed E-utilities ESearch endpoint once per bibliography title and
# store the returned PMID list next to the bibliography number.
result_dict = {'number': [], 'PMID': []}
db = 'pubmed'
domain = 'https://www.ncbi.nlm.nih.gov/entrez/eutils'
nresults = 10
retmode = 'json'
queryLinkSearch = f'{domain}/esearch.fcgi'
for _, row in number_titles.iterrows():
    query = row['title']
    number = row['number']  # pass the numbers to the results to know where is what

    try:
        # The query goes through `params` so that requests URL-encodes it:
        # a raw title containing '&', '#', '+' or '%' would otherwise be cut
        # off or misread as extra URL parameters.
        response = requests.get(
            queryLinkSearch,
            params={'db': db, 'retmax': nresults, 'retmode': retmode, 'term': query},
            timeout=30,
        )
        response.raise_for_status()
        # Extract the ID list from the JSON answer.
        PMID = response.json()['esearchresult']['idlist']
    except (requests.RequestException, ValueError, KeyError) as err:
        # One failed lookup must not abort the whole run: report it and keep an
        # empty ID list so the row stays aligned with its bibliography number.
        print(f'{number}: query failed ({err})')
        PMID = []

    result_dict['number'].append(number)
    result_dict['PMID'].append(PMID)

    time.sleep(1)  # pause between requests
    print(number)

In [None]:
# Tabulate the query results and attach the searched titles (matched on the row index).
PMIDs = pd.DataFrame(result_dict).assign(title=number_titles['title'])
PMIDs

In [None]:
# Save the raw query results so the missing PMIDs can be filled in by hand.
PMIDs.to_csv('PMIDs_queried.csv', header=True, index=False)

Unfortunately, many IDs were not fetched successfully, so they need to be looked up and added manually.

In [None]:
# Load the manually curated PMID table and trim two characters from each end of
# every stored PMID value (presumably the "['" and "']" left over from a
# stringified one-element list -- confirm against the CSV).
PMIDs_curated = pd.read_csv('input/PMIDs_curated.csv')
justIDs = [raw_id[2:-2] for raw_id in PMIDs_curated['PMID']]
PMIDs_curated['PMID'] = justIDs
PMIDs_curated

In [None]:
# Restrict the curated PMIDs to the references cited by the still-unannotated variants.
is_leftover_ref = PMIDs_curated['number'].isin(left_refs_int)
leftover_IDs = PMIDs_curated[is_leftover_ref].reset_index(drop=True)
leftover_IDs

Fetch variants with Ensembl API: takes PMID and returns rs identifiers of the variants reported in those papers

In [None]:
# Accumulators for the Ensembl lookup, kept in their own cell so that they
# survive when the fetching cell is run again:
#   bad_IDs      - PMIDs whose request failed
#   PMID_mapping - one {'PMID': ..., 'rs': [...]} record per fetched PMID
#   passed_IDs   - PMIDs to skip in the fetching loop
bad_IDs, PMID_mapping, passed_IDs = [], [], []

In [None]:
# For every curated PMID, ask the Ensembl REST API which variants are linked to
# that publication. Successful answers are stored in PMID_mapping; PMIDs whose
# request fails are collected in bad_IDs.
server = "https://rest.ensembl.org"

for ID in leftover_IDs['PMID']:
    # Skip PMIDs that were already fetched (duplicates in the table, or a
    # re-run of this cell after an interruption).
    if ID in passed_IDs:
        continue

    ext = "/variation/human/pmid/" + str(ID) + "?"
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    # Pause after every request, failed ones included, to stay polite to the server.
    time.sleep(1)

    if not r.ok:
        print("bad" + str(ID))
        if ID not in bad_IDs:
            bad_IDs.append(ID)
        continue

    var_decoded = r.json()
    rs_list = [variant['name'] for variant in var_decoded]
    PMID_mapping.append({'PMID': ID, 'rs': rs_list})
    # Record the success; without this the skip check above never triggers and
    # repeated PMIDs are fetched and stored again.
    passed_IDs.append(ID)
    # A PMID that failed in an earlier run but works now is no longer "bad".
    if ID in bad_IDs:
        bad_IDs.remove(ID)
    print(ID)

In [None]:
# One row per fetched PMID; the 'rs' cell holds the list of variant names.
PMID_mapping_df = pd.DataFrame(data=PMID_mapping)
PMID_mapping_df

In [None]:
# Save the PMID -> rs identifiers mapping.
PMID_mapping_df.to_csv('extracted_rs_with_PMIDs_Rafique.csv', index=False, header=True)

In [None]:
# Flatten the per-PMID lists into one list of extracted rs identifiers
# (repeats are kept) and show how many there are.
rs_list = [rs for item in PMID_mapping_df['rs'] for rs in item]
len(rs_list)

Map the extracted variants to the reference Ensembl table

In [None]:
# Load the reference Ensembl table; the 'alleles' column is parsed from its
# text form back into Python literals.
ref_Ens = pd.read_csv(
    'Ens_filtered_all_alleles_location_coord_no_duplicates.csv',
    converters={'alleles': ast.literal_eval},
    low_memory=False,
)

In [None]:
# Keep only the Ensembl rows whose 'id' is one of the extracted rs identifiers.
# NOTE(review): drop_duplicates() hashes every cell; if 'alleles' parses to
# lists it will raise TypeError -- confirm what literal_eval yields here.
unique_ref_Ens = ref_Ens.drop_duplicates()
mapped_variants = unique_ref_Ens[unique_ref_Ens['id'].isin(rs_list)].reset_index(drop=True)
mapped_variants

In [None]:
# Write the result to a file; the '_2nd' suffix marks the 2nd stage of annotation.
mapped_variants.to_csv('Rafique_mapped_to_Ens_2nd.csv', index=False, header=True)

## 3. Dealing with bad IDs

Which PMIDs did not return any variants, and which references do they correspond to?

In [None]:
# Display the PMIDs for which the Ensembl request did not succeed.
bad_IDs

In [None]:
# Translate the failed PMIDs back into their bibliography reference numbers.
bad_ref = [
    row['number']
    for _, row in leftover_IDs.iterrows()
    if row['PMID'] in bad_IDs
]
bad_ref

In [None]:
# Display the variants that still have no rs identifier.
leftover_var_Rafique

In [None]:
# Add a new column with references as list items.
# Every entry is a list of integers: the reference numbers it is compared with
# later come from rows selected by matching against integer reference numbers,
# so string entries would never be found.
ref_list_new_column = []
for ref in leftover_var_Rafique['Reference']:

    if '-' in ref:
        # Single reference: drop the first character and parse the rest.
        ref_list_new_column.append([int(ref[1:])])

    elif '(' in ref:
        # Several references in parentheses: collect every run of digits and
        # convert each to int. Splitting on ', ' alone produced strings such as
        # '12', which never compare equal to the integer 12.
        ref_list_new_column.append(
            [int(number) for number in re.findall(r'[0-9]+', ref)])

    else:
        # No recognisable reference.
        ref_list_new_column.append([])
ref_list_new_column

In [None]:
# Attach the parsed references as a new 'Reference_lists' column (the frame is
# modified in place; each cell holds a list), then display the table.
leftover_var_Rafique['Reference_lists'] = ref_list_new_column
leftover_var_Rafique

In [None]:
# Collect the index of every variant that cites at least one "bad" reference
# (an index is listed once per matching reference), then select those rows.
index_list = [
    index
    for index, row in leftover_var_Rafique.iterrows()
    for bad_reference in bad_ref
    if bad_reference in row['Reference_lists']
]
needs_rescue = leftover_var_Rafique.index.isin(index_list)
Rafique_var_for_manual_rescue = leftover_var_Rafique.loc[needs_rescue].reset_index(drop=True)
Rafique_var_for_manual_rescue

In [None]:
# Save the variants that have to be looked up manually.
Rafique_var_for_manual_rescue.to_csv('Rafique_var_for_manual_rescue.csv', index=False, header=True)