## Annotating the variants of genes of the patient from the gencode database

In [1]:
gene_annotations_filename = "gencode.v18.annotation.gtf"
variant_filename = 'Metabolic_variants.vcf'

In [2]:
from gtfparse import read_gtf
# returns GTF with essential columns such as "feature", "seqname", "start", "end"
# alongside the names of any optional keys which appeared in the attribute column
df = read_gtf(gene_annotations_filename)

INFO:root:Extracted GTF attributes: ['gene_id', 'transcript_id', 'gene_type', 'gene_status', 'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'level', 'havana_gene', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'ccdsid']


In [3]:
# filter DataFrame to gene entries on chrY
df_genes = df[df["feature"] == "gene"]
df_genes.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,transcript_id,...,transcript_status,transcript_name,level,havana_gene,tag,havana_transcript,exon_number,exon_id,ont,ccdsid
0,chr1,HAVANA,gene,11869,14412,,+,0,ENSG00000223972.4,ENSG00000223972.4,...,KNOWN,DDX11L1,2,OTTHUMG00000000961.2,,,,,,
21,chr1,HAVANA,gene,14363,29806,,-,0,ENSG00000227232.4,ENSG00000227232.4,...,KNOWN,WASH7P,2,OTTHUMG00000000958.1,,,,,,
82,chr1,HAVANA,gene,29554,31109,,+,0,ENSG00000243485.2,ENSG00000243485.2,...,NOVEL,MIR1302-11,2,OTTHUMG00000000959.2,,,,,,
92,chr1,HAVANA,gene,34554,36081,,-,0,ENSG00000237613.2,ENSG00000237613.2,...,KNOWN,FAM138A,2,OTTHUMG00000000960.1,,,,,,
100,chr1,HAVANA,gene,52473,54936,,+,0,ENSG00000268020.2,ENSG00000268020.2,...,KNOWN,OR4G4P,2,OTTHUMG00000185779.1,,,,,,


In [4]:
df_genes.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57445 entries, 0 to 2614559
Data columns (total 24 columns):
seqname              57445 non-null object
source               57445 non-null object
feature              57445 non-null object
start                57445 non-null int64
end                  57445 non-null int64
score                0 non-null float32
strand               57445 non-null object
frame                57445 non-null object
gene_id              57445 non-null object
transcript_id        57445 non-null object
gene_type            57445 non-null object
gene_status          57445 non-null object
gene_name            57445 non-null object
transcript_type      57445 non-null object
transcript_status    57445 non-null object
transcript_name      57445 non-null object
level                57445 non-null object
havana_gene          57445 non-null object
tag                  57445 non-null object
havana_transcript    57445 non-null object
exon_number          57445 non-nul

In [5]:
df_genes.columns

Index(['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand',
       'frame', 'gene_id', 'transcript_id', 'gene_type', 'gene_status',
       'gene_name', 'transcript_type', 'transcript_status', 'transcript_name',
       'level', 'havana_gene', 'tag', 'havana_transcript', 'exon_number',
       'exon_id', 'ont', 'ccdsid'],
      dtype='object')

In [6]:
df_genes['seqname'].unique()

array(['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8',
       'chr9', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15',
       'chr16', 'chr17', 'chr18', 'chr19', 'chr20', 'chr21', 'chr22',
       'chrX', 'chrY', 'chrM'], dtype=object)

## Look into the list of variants and find correspondance to the gene from the gencode file

In [7]:
import vcf
import pandas as pd

vcf_reader = vcf.Reader(open(variant_filename, 'r'))
variants = pd.DataFrame(columns=['CHROM', 'start', 'end', 'strand', 'gene_id', 'transcript_id', 'gene_type', 'gene_status',
       'gene_name', 'transcript_type', 'transcript_status', 'transcript_name', 'POS','REF','ALT','FILTER'])

#CHROM	POS	ID	REF	ALT	QUAL	FILTER

In [None]:
counter = 0
for record in vcf_reader:
    chrom = record.CHROM
    pos = record.POS
    ref = record.REF
    alt = record.ALT
    filter_pass = record.FILTER
    for row in df_genes.itertuples():
        #print(str(chrom), row.seqname[3:])
        there_is_entry = str(chrom) == row.seqname[3:] and pos > row.start and pos < row.end
        if there_is_entry:
            new_row = {'POS': pos, 'start': row.start, 'end': row.end, 'strand': row.strand, 'gene_id' : row.gene_id, 'transcript_id' : row.transcript_id, 'gene_type' : row.gene_type, 'gene_status': row.gene_status, 'gene_name' : row.gene_name, 'transcript_type' : row.transcript_type, 'transcript_status': row.transcript_status, 'transcript_name' : row.transcript_name, 'CHROM' : chrom, 'POS': pos,'REF':ref,'ALT':alt,'FILTER':filter_pass}
            variants = variants.append(new_row, ignore_index=True)
            counter += 1
            #print(counter)
            if counter % 100 == 0:
                print(variants.tail())

In [8]:
gene_disease = pd.read_table('curated_gene_disease_associations.tsv')

In [10]:
gene_disease.tail()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
81741,109580095,HBB-LCR,0.815,0.069,C0002875,Cooley's anemia,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,CTD_human
81742,109580095,HBB-LCR,0.815,0.069,C0005283,beta Thalassemia,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,CTD_human
81743,109580095,HBB-LCR,0.815,0.069,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,CTD_human
81744,109580095,HBB-LCR,0.815,0.069,C0085578,Thalassemia Minor,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,CTD_human
81745,109580095,HBB-LCR,0.815,0.069,C0271979,Thalassemia Intermedia,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,CTD_human


In [30]:
gene_disease[gene_disease.diseaseId == 'C0019025']

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
6822,790,CAD,0.557,0.655,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,1979.0,1979.0,1,0,CTD_human
14149,1723,DHODH,0.621,0.621,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,1979.0,1979.0,1,0,CTD_human
16472,2056,EPO,0.425,0.759,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,2006.0,2006.0,1,0,CTD_human
23329,3040,HBA2,0.524,0.724,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.31,1.0,1982.0,2007.0,1,0,CTD_human
23351,3043,HBB,0.513,0.793,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.6,1.0,1979.0,2017.0,6,0,CTD_human;ORPHANET
23383,3047,HBG1,0.639,0.448,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.4,1.0,1982.0,2010.0,0,0,ORPHANET
23390,3048,HBG2,0.636,0.552,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,,,0,0,ORPHANET
30758,3934,LCN2,0.469,0.793,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,2006.0,2006.0,1,0,CTD_human
51059,7036,TFR2,0.594,0.517,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,2006.0,2006.0,1,0,CTD_human
51080,7037,TFRC,0.471,0.759,C0019025,Hemoglobin F Disease,disease,C15;C16,Disease or Syndrome,0.3,,2006.0,2006.0,1,0,CTD_human


In [20]:
len(gene_disease.geneId.unique()), len(gene_disease.diseaseId.unique())

(9413, 10370)

## Creating a table of associated diseases for every  mutated gene of the patient 

In [21]:
metabolic_filename = 'shortened_metabolic.csv'
pediatric_filename = 'shortened_Pediatric.csv'
carrier_filename = 'shortened_Carrier.csv'
cardioneuro_filename = 'shortened_CardioNeuro.csv'

exon_filename = 'shortened_Exon.csv'


metabolic_var_genes = pd.read_csv(metabolic_filename)
pediatric_var_genes = pd.read_csv(pediatric_filename)
carrier_var_genes = pd.read_csv(carrier_filename)
cardioneuro_var_genes = pd.read_csv(cardioneuro_filename)

metabolic_diseases = pd.DataFrame()
pediatric_diseases = pd.DataFrame()
carrier_diseases = pd.DataFrame()
cardioneuro_diseases = pd.DataFrame()

for variant in metabolic_var_genes.itertuples():
    metabolic_diseases = metabolic_diseases.append(gene_disease[gene_disease.geneSymbol == variant.gene_name])

for variant in pediatric_var_genes.itertuples():
    pediatric_diseases = pediatric_diseases.append(gene_disease[gene_disease.geneSymbol == variant.gene_name])

for variant in carrier_var_genes.itertuples():
    carrier_diseases = carrier_diseases.append(gene_disease[gene_disease.geneSymbol == variant.gene_name])

for variant in cardioneuro_var_genes.itertuples():
    cardioneuro_diseases = cardioneuro_diseases.append(gene_disease[gene_disease.geneSymbol == variant.gene_name])

    

In [24]:
metabolic_diseases.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
49215,6697,SPR,0.667,0.379,C0005586,Bipolar Disorder,disease,F03,Mental or Behavioral Dysfunction,0.31,1.0,2009.0,2009.0,1,0,PSYGENET
49216,6697,SPR,0.667,0.379,C0268468,"Dystonia, Dopa-Responsive, due to Sepiapterin ...",disease,C10;C16;C18;C23;F01,Disease or Syndrome,0.71,0.0,1993.0,2015.0,4,7,CTD_human;ORPHANET;UNIPROT
49217,6697,SPR,0.667,0.379,C0525045,Mood Disorders,group,F03,Mental or Behavioral Dysfunction,0.31,1.0,2009.0,2009.0,1,0,PSYGENET
57930,8869,ST3GAL5,0.676,0.448,C0282577,Congenital Disorders of Glycosylation,group,C16;C18,Disease or Syndrome,0.3,,2004.0,2014.0,2,0,GENOMICS_ENGLAND
57931,8869,ST3GAL5,0.676,0.448,C0543888,Epileptic encephalopathy,disease,,Disease or Syndrome,0.3,,,,0,0,GENOMICS_ENGLAND


In [25]:
metabolic_diseases.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 616 entries, 49215 to 50434
Data columns (total 16 columns):
geneId                 616 non-null int64
geneSymbol             616 non-null object
DSI                    616 non-null float64
DPI                    616 non-null float64
diseaseId              616 non-null object
diseaseName            616 non-null object
diseaseType            616 non-null object
diseaseClass           554 non-null object
diseaseSemanticType    616 non-null object
score                  616 non-null float64
EI                     183 non-null float64
YearInitial            532 non-null float64
YearFinal              532 non-null float64
NofPmids               616 non-null int64
NofSnps                616 non-null int64
source                 616 non-null object
dtypes: float64(6), int64(3), object(7)
memory usage: 81.8+ KB


In [27]:
pediatric_diseases.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
75346,90529,STPG1,1.0,0.069,C0006142,Malignant neoplasm of breast,disease,C04;C17,Neoplastic Process,0.3,,,,0,0,UNIPROT
73351,79753,SNIP1,0.685,0.345,C3281055,"PSYCHOMOTOR RETARDATION, EPILEPSY, AND CRANIOF...",disease,,Disease or Syndrome,0.4,,2012.0,2012.0,1,1,UNIPROT
73352,79753,SNIP1,0.685,0.345,C3714756,Intellectual Disability,group,C10;C23;F01;F03,Mental or Behavioral Dysfunction,0.3,,2012.0,2012.0,1,0,GENOMICS_ENGLAND
44983,6121,RPE65,0.479,0.759,C0015398,"Eye Diseases, Hereditary",group,C11;C16,Disease or Syndrome,0.3,,2007.0,2007.0,1,0,CTD_human
44984,6121,RPE65,0.479,0.759,C0035304,Retinal Degeneration,phenotype,C11,Pathologic Function,0.4,,2014.0,2015.0,1,1,CTD_human


In [28]:
carrier_diseases.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
75931,114034,TOE1,0.701,0.414,C0036875,Disorders of Sex Development,group,C12;C13;C16;C19,Congenital Abnormality,0.3,,2017.0,2017.0,1,0,GENOMICS_ENGLAND
75932,114034,TOE1,0.701,0.414,C0266468,Congenital pontocerebellar hypoplasia,disease,C10,Congenital Abnormality,0.31,1.0,2017.0,2017.0,1,0,CTD_human
75933,114034,TOE1,0.701,0.414,C0266470,Cerebellar Hypoplasia,disease,C10;C16;F03,Congenital Abnormality,0.4,,2017.0,2017.0,1,0,GENOMICS_ENGLAND
75934,114034,TOE1,0.701,0.414,C3554226,Congenital pontocerebellar hypoplasia type 7,disease,,Disease or Syndrome,0.7,,2017.0,2017.0,1,8,CTD_human;ORPHANET;UNIPROT
75935,114034,TOE1,0.701,0.414,C3714756,Intellectual Disability,group,C10;C23;F01;F03,Mental or Behavioral Dysfunction,0.3,,2017.0,2017.0,1,0,GENOMICS_ENGLAND


In [29]:
cardioneuro_diseases.head()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
24581,3208,HPCA,0.735,0.345,C1857093,"DYSTONIA 2, TORSION, AUTOSOMAL RECESSIVE (diso...",disease,C10;C16,Disease or Syndrome,0.7,,1981.0,2017.0,2,3,CTD_human;ORPHANET;UNIPROT
35781,4803,NGF,0.426,0.862,C0002622,Amnesia,disease,C10;C23;F01;F03,Mental or Behavioral Dysfunction,0.31,1.0,1995.0,2009.0,2,0,CTD_human
35782,4803,NGF,0.426,0.862,C0002768,Congenital Pain Insensitivity,disease,C10;C16,Disease or Syndrome,0.32,1.0,2009.0,2013.0,0,0,ORPHANET
35783,4803,NGF,0.426,0.862,C0007959,Charcot-Marie-Tooth Disease,disease,C10;C16,Disease or Syndrome,0.3,,,,0,0,GENOMICS_ENGLAND
35784,4803,NGF,0.426,0.862,C0009171,Cocaine Abuse,disease,C25;F03,Mental or Behavioral Dysfunction,0.3,,2008.0,2008.0,1,0,CTD_human


In [31]:
all_diseases = pd.DataFrame()
all_diseases = all_diseases.append(metabolic_diseases)
all_diseases = all_diseases.append(pediatric_diseases)
all_diseases = all_diseases.append(carrier_diseases)
all_diseases = all_diseases.append(cardioneuro_diseases)

In [32]:
all_diseases.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3494 entries, 49215 to 50434
Data columns (total 16 columns):
geneId                 3494 non-null int64
geneSymbol             3494 non-null object
DSI                    3491 non-null float64
DPI                    3491 non-null float64
diseaseId              3494 non-null object
diseaseName            3494 non-null object
diseaseType            3494 non-null object
diseaseClass           3196 non-null object
diseaseSemanticType    3494 non-null object
score                  3494 non-null float64
EI                     1044 non-null float64
YearInitial            3083 non-null float64
YearFinal              3083 non-null float64
NofPmids               3494 non-null int64
NofSnps                3494 non-null int64
source                 3494 non-null object
dtypes: float64(6), int64(3), object(7)
memory usage: 464.0+ KB


In [35]:
all_diseases.to_csv('diseases_patient.csv')

In [36]:
gene_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81746 entries, 0 to 81745
Data columns (total 16 columns):
geneId                 81746 non-null int64
geneSymbol             81746 non-null object
DSI                    81620 non-null float64
DPI                    81598 non-null float64
diseaseId              81746 non-null object
diseaseName            81746 non-null object
diseaseType            81746 non-null object
diseaseClass           75443 non-null object
diseaseSemanticType    81746 non-null object
score                  81746 non-null float64
EI                     22862 non-null float64
YearInitial            73123 non-null float64
YearFinal              73123 non-null float64
NofPmids               81746 non-null int64
NofSnps                81746 non-null int64
source                 81746 non-null object
dtypes: float64(6), int64(3), object(7)
memory usage: 10.0+ MB


In [38]:
unique_genes = gene_disease.geneSymbol.unique()
unique_diseases = gene_disease.diseaseId.unique()


In [45]:
import numpy as np

In [48]:
data = pd.DataFrame(np.zeros(( len(unique_diseases), len(unique_genes))), columns=unique_genes, index=unique_diseases)

In [49]:
for row in gene_disease.itertuples():
    gene = row.geneSymbol
    disease = row.diseaseId
    score = row.score
    data[gene].loc[disease] = score
 

In [51]:
data.head()

Unnamed: 0,A1BG,A2M,NAT1,NAT2,SERPINA3,AADAC,AANAT,AARS,ABAT,ABCA1,...,MIR4433B,MIR7977,MIR1273H,MIR8061,MIR6741,MIR6803,SCA37,H19-ICR,DHS6S1,HBB-LCR
C0019209,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0036341,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0002395,0.0,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0007102,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0009375,0.0,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
data.to_csv('matrixA.csv')

In [63]:
g = pd.DataFrame(np.zeros((1, len(unique_genes))), columns=unique_genes, index=[0])

In [64]:
for row in all_diseases.itertuples():
    gene = row.geneSymbol
    g[gene].loc[0] = 1.
    print(gene)

SPR
SPR
SPR
ST3GAL5
ST3GAL5
ST3GAL5
ST3GAL5
CD8A
CD8A
CD8A
CD8A
CD8A
DARS
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACVR1
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
ACKR3
OXSM
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
TREX1
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
GMPPB
HYAL1
HYAL1
HYAL1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
TERC
DNAJC19
DNAJC19
DNAJC19
DNAJC19
DNAJC19
DNAJC19
DNAJC19
EIF2B5
EIF2B5
EIF2B5
EIF2B5
EIF2B5
EIF2B5
TLR1
TLR1
TLR1
TLR1
TLR1
TLR1
TLR1
PPM1K
PPM1K
PPM1K
PPM1K
PPM1K
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
PDE4D
PDE4D
PDE4D
PDE4D

BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
BAP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
FOXP1
EIF2B5
EIF2B5
EIF2B5
EIF2B5
EIF2B5
EIF2B5
TLR1
TLR1
TLR1
TLR1
TLR1
TLR1
TLR1
BBS7
BBS7
BBS7
BBS7
BBS7
BBS7
BBS7
BBS7
MFSD8
MFSD8
MFSD8
MFSD8
MFSD8
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
SLC1A3
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
PDE4D
SPOCK1
SPOCK1
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
SLC26A2
TNX

RUNX1
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
COMT
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LIF
LAS1L
LAS1L
LAS1L
LAS1L
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
GJB1
SLC16A2
SLC16A2
SLC16A2
SLC16A2
SLC16A2
SLC16A2
SLC16A2
SLC16A2
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
ZIC3
VMA21
VMA21
VMA21
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ
TAZ


In [67]:
np.mean(g.loc[0])

0.018170226330889386

In [105]:
G = np.array(g.loc[0])
A = data.values

In [106]:
d = np.dot(A, G)

In [108]:
new_d = pd.DataFrame(columns=unique_diseases)

In [110]:
new_d.loc['a'] = d

In [111]:
new_d

Unnamed: 0,C0019209,C0036341,C0002395,C0007102,C0009375,C0011265,C0011570,C0011581,C0019202,C0022660,...,C1859452,C4319565,C2676281,C2936793,C3888208,C3150680,C2931662,C3151343,C3281236,C0730294
a,0.3,5.09,1.14,0.3,0.3,1.07,2.09,2.4,0.32,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
