# Creating a Reference table from Ensembl

This notebook takes a user list of gene names and returns a table 
from Ensembl with variants in the exons of the given genes

In [None]:
import requests, sys
import pandas as pd
import json
import pprint
import ast
import time
import ast

## 1. Import a list of genes, look up their ENSG, ENST and ENSE identifiers

In [None]:
# Read the gene symbols, one per line, from the input file.
# The `with` block closes the file handle once reading is done, and blank
# lines are skipped so an empty string is never used as a gene symbol.

with open('input/big_MD_gene_list.txt', 'r') as genefile:
    genelist = [line.rstrip("\n") for line in genefile if line.strip()]
len(genelist) #how many genes are in the list

Look up the identifiers and exons for each gene name using the Ensembl REST API at https://rest.ensembl.org/.

In [None]:
# Trackers for the gene lookup: the symbols that were fetched successfully and
# the decoded response of each. The lookup loop skips any symbol that is
# already present in passed_genes_list.
passed_genes_list, json_list = [], []

In [None]:
# Query the Ensembl lookup endpoint once per gene symbol and collect the
# decoded responses in json_list. Symbols already in passed_genes_list are
# skipped, so re-running the cell only fetches what is still missing.
server = "https://rest.ensembl.org"
for gene in genelist:
    if gene in passed_genes_list:
        continue
    ext = ("/lookup/symbol/homo_sapiens/" + gene + "?expand=1")
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    if not r.ok:
        print('unsuccesful ', gene)
        continue

    # r.json() already returns the decoded Python object, so it is stored
    # as-is. Re-parsing its repr() as JSON (after swapping quote characters)
    # fails on None/True/False and on any string value containing a quote.
    gene_ids = r.json()
    json_list.append(gene_ids)
    time.sleep(1)  # pause between successful requests
    passed_genes_list.append(gene)
    print(gene)

In [None]:
#here is our list of jsons, each contains one gene

# Pretty-print the first entry as a sample. The check avoids an IndexError
# when json_list is empty (e.g. every lookup failed).
if json_list:
    pprint.pprint(json_list[0])
else:
    print('json_list is empty')

In [None]:
# Backup in case the connection is lost: the list of lookup results is written
# to a text file as its Python string representation.

with open('gene_id_decoded_json_list.txt', 'w') as backup_file:
    print(json_list, end='', file=backup_file)

In [None]:
# Flatten the lookup results into one row per exon: the transcript's 'Parent'
# value (stored as Gene), the transcript id and the exon id.

columns = ['Gene', 'Transcript', 'Exon']
df_data = [
    [transcript['Parent'], transcript['id'], exon['id']]
    for gene in json_list
    for transcript in gene['Transcript']
    for exon in transcript['Exon']
]

df = pd.DataFrame(df_data, columns=columns)

In [None]:
# Persist the Gene / Transcript / Exon id table as CSV (header row, no index column).

df.to_csv(path_or_buf='gene_trans_exon.csv', index=False, header=True)

## 2. Calling for all the variants in these regions (exons of the selected genes) existing on Ensembl

In [None]:
# Trackers for the variant download: the exon ids that were fetched
# successfully and the per-exon variant tables. The download loop skips any
# exon id that is already present in passed_exons_list.
passed_exons_list, exon_variants_list = [], []

In [None]:
# For every exon id not fetched yet, request the variation features that
# overlap it from the Ensembl overlap endpoint and keep them as one DataFrame
# per exon, tagged with the Gene / Transcript / Exon ids of the current row.
server = "https://rest.ensembl.org"

for index,row in df.iterrows():
    exonID = row['Exon']
    if exonID in passed_exons_list:
        continue
    ext_exon = ("/overlap/id/" + exonID + "?feature=variation")
    geneID = row['Gene']
    transcriptID = row['Transcript']

    resp = requests.get(server+ext_exon, headers={ "Content-Type" : "application/json"})
    if not resp.ok:
        print('failed for ' + str(index))
        continue
    # resp.json() already returns the decoded Python object, so it is handed
    # to pandas directly. Re-parsing its repr() as JSON (after swapping quote
    # characters) fails on None/True/False and on strings containing a quote.
    var_decoded = resp.json()
    variants = pd.DataFrame(var_decoded)
    variants['Gene'] = geneID #add the column with Gene ID to have it in the final df
    variants['Transcript'] = transcriptID #same for Transcript ID
    variants['Exon'] = exonID #same for Exon ID
    exon_variants_list.append(variants)
    time.sleep(1)  # pause between successful requests
    print(index)
    passed_exons_list.append(exonID)

In [None]:
# Backup in case the connection is lost: writes the string form of the list.
# NOTE(review): the list holds DataFrames, so the file contains their printed
# representation, which pandas may abbreviate for large frames -- confirm
# that this is sufficient to restore the data.

with open('exon_variants_list.txt', 'w') as backup_file:
    print(exon_variants_list, end='', file=backup_file)

In [None]:
# Stack the per-exon variant tables into a single frame and renumber the rows.

exon_variants_all_genes = pd.concat(exon_variants_list).reset_index(drop=True)
exon_variants_all_genes

In [None]:
#saving the unfiltered table to the file

#exon_variants_all_genes.to_csv(
#    '/Users/ksenia/Documents/MODY_genes/whole_pipeline_311011/MD_genes_exon_variants_Ens_unfiltered.csv',
#    header=True, index=False)

## 3. Filter the variants by consequence type

In [None]:
# Drop the rows without allele information, i.e. those whose 'alleles' value
# contains 'HGMD_MUTATION', then renumber the remaining rows.

is_not_hgmd = exon_variants_all_genes['alleles'].apply(
    lambda allele_list: 'HGMD_MUTATION' not in allele_list)
dbSNP = exon_variants_all_genes.loc[is_not_hgmd].reset_index(drop=True)
dbSNP

In [None]:
# Save the table of variants with known allele information, before any
# consequence-type filtering (header row, no index column).

dbSNP.to_csv(path_or_buf='MD_genes_exon_variants_Ens_unfiltered_dbSNP.csv',
             index=False,
             header=True)

At this point, run the notebook '2_Pathogenicity_consequences_type_analysis_Ensembl' to decide which consequence types to keep in the table.

In [None]:
# Distinct values found in the 'consequence_type' column of dbSNP.
dbSNP.loc[:, 'consequence_type'].unique()

In [None]:
# Consequence types that are kept in the table.

consequence_type_list = [
    'missense_variant',
    'protein_altering_variant',
    'coding_sequence_variant',
    'frameshift_variant',
    'splice_donor_variant',
    'splice_acceptor_variant',
    'splice_donor_5th_base_variant',
    'start_lost',
    'stop_gained',
    'stop_lost',
    'inframe_deletion',
    'inframe_insertion',
]

In [None]:
# Keep only the rows whose consequence type is in consequence_type_list.
keep_row = dbSNP['consequence_type'].isin(consequence_type_list)
filtered_cons_type = dbSNP[keep_row].reset_index(drop=True)
filtered_cons_type

In [None]:
# Save the consequence-type-filtered table (header row, no index column).
filtered_cons_type.to_csv(path_or_buf='MD_genes_exon_variants_Ens_filtered.csv',
                          index=False,
                          header=True)

## 4. Creating a "Location" column for future mapping

To create a coordinate for each alternative allele when a variant has more than 2 alleles, add a row for each alternative allele.

In [None]:
#for each variant that has more than 2 alleles, add a row for each extra alternative allele
#
# Only the alleles after the second one get a new row: the pair
# (alleles[0], alleles[1]) is already represented by the variant's own row.
# Indexing with [i-1] over range(len(...)) wrapped around to the last allele
# and also re-emitted alleles[1], producing a duplicate row with blank
# 'consequence_type' and 'clinical_significance'.
alternative_alleles = []
for index, row in filtered_cons_type.iterrows():
    alleles = row['alleles']
    first_allele = alleles[0]
    # alleles[2:] is empty for variants with at most 2 alleles, so those are skipped
    for extra_allele in alleles[2:]:
        if extra_allele == first_allele:
            continue
        alternative_alleles.append({'clinical_significance': '',
                                    'seq_region_name': row['seq_region_name'],
                                    'assembly_name': row['assembly_name'],
                                    'alleles': [first_allele, extra_allele],
                                    'id': row['id'],
                                    'strand': row['strand'],
                                    'consequence_type': '',
                                    'feature_type': row['feature_type'],
                                    'source': row['source'],
                                    'end': row['end'],
                                    'start': row['start'],
                                    'Gene': row['Gene'],
                                    'Transcript': row['Transcript'],
                                    'Exon': row['Exon']})
alternative_alleles_df = pd.DataFrame(alternative_alleles)

In [None]:
# Append the extra-allele rows below the filtered variants and renumber the rows.
all_alleles = pd.concat([filtered_cons_type, alternative_alleles_df], ignore_index=True)
all_alleles

Open all_alleles from the file if it has already been created

In [None]:
# vf_allele is the second entry of each row's 'alleles' list.
vf_allele = [allele_seq[1] for allele_seq in all_alleles['alleles']]
all_alleles['vf_allele'] = vf_allele

Keep in mind that the allelic info for variants with more than 2 alleles has been left intact, and extra rows with the alternative alleles were added. In other words, only the [0] and [1] alleles from the 'alleles' column should be regarded.

Correct the ['alleles'] column so that it holds just 2 alleles, since the alleles beyond the first two were added as rows at the end of the table.

In [None]:
# Trim every 'alleles' entry down to its first two items.
two_alleles = [allele_seq[:2] for allele_seq in all_alleles['alleles']]
all_alleles['alleles'] = two_alleles
all_alleles

In [None]:
#creating the Location column

# One '<seq_region_name>:<start>' string per row, with start rendered as an integer.
location = [
    f"{row['seq_region_name']}:{int(row['start'])}"
    for index, row in all_alleles.iterrows()
]

In [None]:
# Store the `location` list as the 'Location' column of all_alleles.
all_alleles['Location'] = location

In [None]:
# Save the table including the 'Location' column (header row, no index column).
all_alleles.to_csv(path_or_buf='MD_genes_exon_variants_Ens_filtered_all_alleles_location.csv',
                   index=False,
                   header=True)

In [None]:
# Optional reload (disabled): uncomment to read all_alleles back from a saved
# CSV; the converter parses the 'alleles' column with ast.literal_eval.
# NOTE(review): the path below is an absolute, machine-specific path -- adjust before use.
#all_alleles = pd.read_csv(
#    '/Users/ksenia/Documents/MODY_genes/pipeline_october2022/MD_genes_exon_variants_Ens_filtered_all_alleles_location.csv', 
#                                converters={'alleles': ast.literal_eval}, low_memory=False) 
all_alleles

In [None]:
#creating the Coordinate column

# One '<Location>:<first allele>><vf_allele>' string per row.
coordinates = [
    loc + ':' + allele_seq[0] + '>' + alt
    for loc, allele_seq, alt in zip(
        all_alleles['Location'], all_alleles['alleles'], all_alleles['vf_allele'])
]
all_alleles['coordinate'] = coordinates
all_alleles

In [None]:
# Save the table including the 'coordinate' column (header row, no index column).
all_alleles.to_csv(path_or_buf='Ens_filtered_all_alleles_location_coord.csv',
                   index=False,
                   header=True)

In [None]:
# To be able to drop duplicates, only the columns listed below are kept;
# 'clinical_significance', 'alleles' and the other remaining columns are left out.

dedup_columns = ['id', 'seq_region_name', 'start', 'end', 'strand', 'vf_allele',
                 'Location', 'coordinate', 'Gene', 'Transcript', 'Exon']
all_alleles_no_duplicates = (
    all_alleles[dedup_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)
all_alleles_no_duplicates

In [None]:
# Save the de-duplicated table (header row, no index column).
all_alleles_no_duplicates.to_csv(
    path_or_buf='Ens_filtered_all_alleles_location_coord_no_duplicates.csv',
    index=False,
    header=True)