In [2]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Global plot styling for the notebook: use seaborn's "white" style, and let
# color_codes=True remap matplotlib's shorthand color codes (e.g. "b", "g", "r")
# to the seaborn palette.
sns.set(style="white", color_codes=True)

### Import of total overlap between Helix mtDB variants and candidate pathogenic variants from MitoMap
This is the subset of the Helix mtDB bases that also have alleles in MitoMap

**NOTE:** Although the same base position has alleles in both datasets, the alleles recorded at that position may differ between the two datasets.

- These entries need to be merged with the Helix mtDB (or a random sample of it) to add in candidate pathogenic mutations for model training and testing
- There are 298 unique bases that appear in both the Helix mtDB and the MitoMap databases and 560 alleles are represented in the intersection
- In the second code chunk below I eliminate most of the Helix mtDB alleles that do not appear in the MitoMap database

In [3]:
# Load the table of Helix mtDB variants that overlap MitoMap candidate variants.
# The data folder defaults to the original author's local directory so existing runs
# keep working; set the HELIX_MITO_DATA_DIR environment variable to read the CSV
# from a different location on another machine.
DATA_DIR = Path(
    os.environ.get(
        "HELIX_MITO_DATA_DIR",
        "/Users/leekapp/Desktop/CS254/Assignments/FinalProject",
    )
)
totHelMitoVars = pd.read_csv(DATA_DIR / "totHelMitoVars.csv")
totHelMitoVars.tail(20)

Unnamed: 0,base,wildtype,gene,alleles,mutation,annotation,helix_feature,MM_allele,MM_feature,homoplasmic,heteroplasmic,counts_hom,AF_hom,counts_het,AF_het,pathogenic,disease
968,15942,T,MT-TT,"[""T"",""C""]",transition,,tRNA_gene,T15942C,tRNA Thr,yes,yes,467,0.002383,7,3.6e-05,yes,Possibly LVNC-associated
969,15944,T,MT-TT,"[""T"",""C""]",transition,,tRNA_gene,T15944del,tRNA Thr,yes,yes,12,6.1e-05,2,1e-05,yes,MM
970,15948,A,MT-TT,"[""A"",""AGGACAAAT""]",in-del,,tRNA_gene,A15948G,tRNA Thr,yes,no,1,5e-06,0,0.0,yes,Reported in tic disorder patient
971,15948,A,MT-TT,"[""A"",""G""]",transition,,tRNA_gene,A15948G,tRNA Thr,yes,yes,3,1.5e-05,4,2e-05,yes,Reported in tic disorder patient
972,15950,G,MT-TT,"[""G"",""A""]",transition,,tRNA_gene,G15950A,tRNA Thr,yes,yes,11,5.6e-05,2,1e-05,yes,Dopaminergic nerve cell death (PD) / tic disorder
973,15951,A,MT-TT,"[""A"",""G""]",transition,,tRNA_gene,A15951G,tRNA Thr,yes,yes,208,0.001061,8,4.1e-05,yes,LHON / LHON modulator
974,15952,C,MT-TT,"[""C"",""CA""]",in-del,,tRNA_gene,C15952T,tRNA Thr,yes,no,2,1e-05,0,0.0,yes,Reported in tic disorder patient
975,15952,C,MT-TT,"[""C"",""T""]",transition,,tRNA_gene,C15952T,tRNA Thr,yes,yes,6,3.1e-05,2,1e-05,yes,Reported in tic disorder patient
976,15952,C,MT-TT,"[""CA"",""C""]",in-del,,tRNA_gene,C15952T,tRNA Thr,yes,no,6,3.1e-05,0,0.0,yes,Reported in tic disorder patient
977,15965,A,MT-TP,"[""A"",""G""]",transition,,tRNA_gene,A15965G,tRNA Pro,yes,no,116,0.000592,0,0.0,yes,Dopaminergic nerve cell death (PD)


### Limiting overlap dataset to unique bases
The following process limits the overlap dataset to the Helix mtDB alleles that match the MitoMap alleles. This is needed because most bases have several alleles in the Helix database, but only one pathogenic allele for that base exists in MitoMap.

In [5]:
# Peek at the first row to see how individual characters can be pulled out of the
# MitoMap allele string and the Helix allele-list string.
first_mm_allele = totHelMitoVars['MM_allele'][0]
first_helix_alleles = totHelMitoVars['alleles'][0]

print(first_mm_allele)         # the whole MitoMap allele string
print(first_mm_allele[-1])     # its final character
print(first_helix_alleles)     # the whole Helix allele-list string
print(first_helix_alleles[6])  # the character at position 6 of that string

C-T
T
["C","A"]
A


In [6]:
# Build the two single-character keys used to pair Helix alleles with MitoMap alleles.
#   first: third character from the end of the Helix 'alleles' string. For a value such
#          as '["C","A"]' this is the final base of the last listed allele.
#   last:  final character of the MitoMap 'MM_allele' string (e.g. 'T' for 'C-T').
# NOTE(review): comparing a single trailing character also pairs multi-allele / in-del
# Helix entries such as '["G","A","GAA"]' with a MitoMap allele ending in the same
# base -- confirm that this is the intended matching rule.
idx = totHelMitoVars.index  # retained in case later cells reference it

# Vectorised .str indexing replaces the per-row Python loops. The results follow the
# frame's own index (no assumption that labels run 0..n-1), and a missing or too-short
# string yields NaN instead of raising.
first = totHelMitoVars['alleles'].str[-3]
last = totHelMitoVars['MM_allele'].str[-1]

In [7]:
# matching base number with alleles
nucleotides = pd.DataFrame({'base' : totHelMitoVars['base'], 'first': first, 'last': last})
nucleotides

Unnamed: 0,base,first,last
0,114,A,T
1,114,G,T
2,114,T,T
3,114,C,T
4,146,A,C
...,...,...,...
983,16002,C,C
984,16015,C,C
985,16015,C,C
986,16018,C,T


In [8]:
# Keep only the rows whose Helix key ('first') equals the MitoMap key ('last').
alleles_agree = nucleotides['first'].eq(nucleotides['last'])
index = nucleotides[alleles_agree]
index.index  # row labels of the matching rows, used to subset the overlap dataframe

Int64Index([  2,   5,   8,   9,  10,  16,  17,  19,  20,  23,
            ...
            973, 975, 977, 979, 981, 982, 983, 984, 985, 987],
           dtype='int64', length=574)

### These data still have duplicates for matched MitoMap and Helix mtDB bases

In [9]:
# Subset the overlap table to the rows whose alleles matched.
# `index.index` holds row LABELS taken from `nucleotides`, so label-based `.loc` is the
# matching indexer. Positional `.iloc` would pick the same rows only while the frame
# keeps its default 0..n-1 index.
uniqueOverlaps = totHelMitoVars.loc[index.index, :]
uniqueOverlaps.head(10)

Unnamed: 0,base,wildtype,gene,alleles,mutation,annotation,helix_feature,MM_allele,MM_feature,homoplasmic,heteroplasmic,counts_hom,AF_hom,counts_het,AF_het,pathogenic,disease
2,114,C,MT-CRb,"[""C"",""T""]",transition,,non_coding,C-T,noncoding,yes,yes,1974,0.010072,101,0.000515,yes,BD-associated
5,146,T,MT-CRb,"[""T"",""C""]",transition,,non_coding,T-C,noncoding,yes,yes,25525,0.130241,622,0.003174,yes,Absence of Endometriosis
8,150,C,MT-CRb,"[""C"",""T""]",transition,,non_coding,C-T,noncoding,yes,yes,19751,0.100779,168,0.000857,yes,Longevity / Cervical Carcinoma / HPV infection...
9,185,G,MT-CRb,"[""G"",""A""]",transition,,non_coding,G-A,noncoding,yes,yes,10568,0.053923,317,0.001617,yes,Low VO2max response
10,185,G,MT-CRb,"[""G"",""A"",""GAA""]",in-del,,non_coding,G-A,noncoding,no,yes,0,0.0,1,5e-06,yes,Low VO2max response
16,195,T,MT-CRb,"[""T"",""A"",""C""]",in-del,,non_coding,T-C,noncoding,no,yes,0,0.0,2,1e-05,yes,BD-associated / melanoma pts
17,195,T,MT-CRb,"[""T"",""C""]",transition,,non_coding,T-C,noncoding,yes,yes,34496,0.176015,1050,0.005358,yes,BD-associated / melanoma pts
19,228,G,MT-CRb,"[""G"",""A""]",transition,,non_coding,G-A,noncoding,yes,yes,9673,0.049356,191,0.000975,yes,Low VO2max response
20,228,G,MT-CRb,"[""G"",""A"",""GACATAATAA""]",in-del,,non_coding,G-A,noncoding,no,yes,0,0.0,1,5e-06,yes,Low VO2max response
23,295,C,MT-CRb,"[""C"",""T""]",transition,,non_coding,C-T,noncoding,yes,yes,15983,0.081553,8,4.1e-05,yes,Low VO2max response


### Finding the duplicated bases

In [10]:
# Flag every row whose base position occurs more than once among the matched rows;
# keep=False marks all copies of a repeated base, not just the later ones.
shares_base = uniqueOverlaps.duplicated(subset=['base'], keep=False)
dups = uniqueOverlaps[shares_base]
dups

Unnamed: 0,base,wildtype,gene,alleles,mutation,annotation,helix_feature,MM_allele,MM_feature,homoplasmic,heteroplasmic,counts_hom,AF_hom,counts_het,AF_het,pathogenic,disease
9,185,G,MT-CRb,"[""G"",""A""]",transition,,non_coding,G-A,noncoding,yes,yes,10568,0.053923,317,0.001617,yes,Low VO2max response
10,185,G,MT-CRb,"[""G"",""A"",""GAA""]",in-del,,non_coding,G-A,noncoding,no,yes,0,0.0,1,5e-06,yes,Low VO2max response
16,195,T,MT-CRb,"[""T"",""A"",""C""]",in-del,,non_coding,T-C,noncoding,no,yes,0,0.0,2,1e-05,yes,BD-associated / melanoma pts
17,195,T,MT-CRb,"[""T"",""C""]",transition,,non_coding,T-C,noncoding,yes,yes,34496,0.176015,1050,0.005358,yes,BD-associated / melanoma pts
19,228,G,MT-CRb,"[""G"",""A""]",transition,,non_coding,G-A,noncoding,yes,yes,9673,0.049356,191,0.000975,yes,Low VO2max response
20,228,G,MT-CRb,"[""G"",""A"",""GACATAATAA""]",in-del,,non_coding,G-A,noncoding,no,yes,0,0.0,1,5e-06,yes,Low VO2max response
38,576,A,MT-CRb,"[""A"",""G""]",transition,,non_coding,A-G,noncoding MT-TF precursor,yes,yes,10,5.1e-05,1,5e-06,yes,Hearing loss patient
40,3308,T,MT-ND1,"[""T"",""C""]",transition,2.0,protein_coding_gene,T-C,M1T,yes,yes,1126,0.005745,26,0.000133,yes,MELAS / DEAF enhancer / hypertension / LVNC / ...
41,3308,T,MT-ND1,"[""T"",""G""]",transversion,2.0,protein_coding_gene,T-G,M1term,yes,no,89,0.000454,0,0.0,yes,Sudden Infant Death
176,7445,A,MT-CO1,"[""A"",""C""]",transversion,3.0,protein_coding_gene,A-C,term514S,yes,no,1,5e-06,0,0.0,yes,DEAF
