In [59]:
import pandas as pd
import zarr
import allel
import numpy as np
import json

# Import Data:

## Chromosome 22 Zarr Data File:

- Below the Gene data from the Chromosome 22 zarr file is imported and the unique genes across GENE1 and GENE2 are combined to create one list of unique genes.

In [30]:
zarr_path = 'Data/FINAL_30x_GR38_NoBiIndel.zarr'
callset = zarr.open_group(zarr_path, mode='r')
callset

<zarr.hierarchy.Group '/' read-only>

In [34]:
variants = allel.VariantChunkedTable(callset['22']['variants'], 
                                     names=['GENE1', 'GENE2'])

In [35]:
# Create a dataframe from the Variant Chunked Table:
g_df = pd.DataFrame(variants['GENE1'], columns = ['GENE1'])
g_df2 = pd.DataFrame(variants['GENE2'], columns = ['GENE2'])

# Join the two dataframes:
gene_zarr_data = pd.concat([g_df, g_df2], axis=1)

In [46]:
# Get a list of the unique genes across both Gene1 and Gene2 columns:
column_values = gene_zarr_data[["GENE1", "GENE2"]].values.ravel()

unique_values =  list(pd.unique(column_values))
unique_values.remove('None')
unique_values.remove('nan')

In [47]:
len(unique_values)

816

In [50]:
uniq_genes = pd.DataFrame(unique_values, columns = ['GENE'])
uniq_genes.head()

Unnamed: 0,GENE
0,LOC105379418
1,FRG1FP
2,LOC107984037
3,LOC102723769
4,LOC107987323


## Aliases Data:
- Genes and aliases data was downloaded from the following source: https://www.ncbi.nlm.nih.gov/gene/

In [51]:
gene_alias = pd.read_csv('Data/gene_alias.csv')

# Dropping any rows with nan values in Aliases column:
gene_alias = gene_alias.dropna(how='any',
                    subset=['Aliases']).rename(columns={"Symbol": "GENE"})

# Retaining just Gene and Aliases columns:
gene_alias = gene_alias[['GENE', 'Aliases']]

In [52]:
gene_alias.head()

Unnamed: 0,GENE,Aliases
0,A4GALT,"A14GALT1, Gb3S, P(k), P1, P1PK, PK, A4GALT"
1,ABCD1P4,ALD22Q11
2,ABHD17AP4,"FAM108A5, FAM108A5P"
3,ABHD17AP5,"FAM108A6, FAM108A6P"
5,ACO2,"ACONM, HEL-S-284, ICRD, OCA8, OPA9"


# Merge the Dataframes:

- Here we are comparing the aliases data to the genes that are present in our website data.
- We are merging the two dataframes to obtain the aliases for those genes.


In [53]:
merged_gene_data = pd.merge(uniq_genes, gene_alias,  how='left', on = ['GENE'])
merged_gene_data.head()

Unnamed: 0,GENE,Aliases
0,LOC105379418,
1,FRG1FP,
2,LOC107984037,
3,LOC102723769,
4,LOC107987323,


In [55]:
# Again dropping any rows with nan values in Aliases column:
genes_w_aliases = gene_alias.dropna(how='any', subset=['Aliases'])
genes_w_aliases.head()

Unnamed: 0,GENE,Aliases
0,A4GALT,"A14GALT1, Gb3S, P(k), P1, P1PK, PK, A4GALT"
1,ABCD1P4,ALD22Q11
2,ABHD17AP4,"FAM108A5, FAM108A5P"
3,ABHD17AP5,"FAM108A6, FAM108A6P"
5,ACO2,"ACONM, HEL-S-284, ICRD, OCA8, OPA9"


In [56]:
len(genes_w_aliases)

727

- There are 816 unique genes in our Chromosome 22 data, 727 of these have aliases as per the NCBI data.

In [57]:
g_list = list(genes_w_aliases['GENE'])
a_list = list(genes_w_aliases['Aliases'])

gene_alias_dict = dict(zip(g_list, a_list))

In [64]:
gene_alias_dict

{'A4GALT': 'A14GALT1, Gb3S, P(k), P1, P1PK, PK, A4GALT',
 'ABCD1P4': 'ALD22Q11',
 'ABHD17AP4': 'FAM108A5, FAM108A5P',
 'ABHD17AP5': 'FAM108A6, FAM108A6P',
 'ACO2': 'ACONM, HEL-S-284, ICRD, OCA8, OPA9',
 'ADA2': 'ADGF, CECR1, IDGFL, PAN, SNEDS, VAIHS',
 'ADM2': 'AM2, dJ579N16.4',
 'ADORA2A': 'A2aR, ADORA2, RDC8',
 'ADORA2A-AS1': 'C22orf45',
 'ADSL': 'AMPS, ASASE, ASL',
 'AGTRL2': 'ATR2L1',
 'AIFM3': 'AIFL',
 'ALG12': 'CDG1G, ECM39, PP14673, hALG12',
 'ANKRD54': 'LIAR',
 'ANKRD62P1-PARP4P3': 'VWFP1-ANKRD62P1-PARP4P3',
 'AP1B1': 'ADTB1, AP105A, BAM22, CLAPB2, KIDAR',
 'AP1B1P1': 'ADTB1L1, dJ127L4.2',
 'AP1B1P2': 'ADTB1L2, dJ127L4.3',
 'APOBEC3A': 'A3A, ARP3, PHRBN, bK150C2.1',
 'APOBEC3A_B': 'A3A, APOBEC3A',
 'APOBEC3B': 'A3B, APOBEC1L, ARCD3, ARP4, DJ742C19.2, PHRBNL, bK150C2.2',
 'APOBEC3C': 'A3C, APOBEC1L, ARDC2, ARDC4, ARP5, PBI, bK150C2.3',
 'APOBEC3D': 'A3D, A3DEE, APOBEC3E, ARP6, APOBEC3D',
 'APOBEC3F': 'A3F, ARP8, BK150C2.4.MRNA, KA6',
 'APOBEC3G': 'A3G, ARCD, ARP-9, ARP9, CEM-15,

# Saving dictionary to json file:

In [62]:
with open('Data/gene_aliases.json', 'w') as f:
    json.dump(gene_alias_dict, f)

- A dictionary is saved so that it can be loaded and searched through in the case a user submits a query with a gene name that doesn't match the one's stored in column GENE1 or GENE2 in our data.

# Example of Code in Web-App:
- Below is an example of how this code would run within the web-app.

In [66]:
def load_json(filename):
    with open(filename) as f_in:
        return json.load(f_in)

In [67]:
trial_dict = load_json('Data/gene_aliases.json')

In [78]:
def get_key(val, gene_dict):
    for key, value in gene_dict.items():
        if val in value:
            return key
        else:
            pass
        

In [80]:
get_key('AIFL', trial_dict)

'AIFM3'

In [84]:
get_key('PP14673', trial_dict)

'ALG12'