In [59]:
import pandas as pd
import zarr
import allel
import numpy as np
import json

# Import Data:

## Chromosome 22 Zarr Data File:

- Below, the gene data from the Chromosome 22 zarr file is imported, and the genes found in the GENE1 and GENE2 columns are combined into a single list of unique genes.

In [30]:
# Location of the Zarr store, given relative to this notebook:
zarr_path = '../website/data/FINAL_30x_GR38_NoBiIndel.zarr'
# Open the store read-only (mode='r') so this notebook cannot modify it:
callset = zarr.open_group(zarr_path, mode='r')
# Display the opened group as a quick check that it loaded:
callset

<zarr.hierarchy.Group '/' read-only>

In [34]:
# Wrap the 'variants' group stored under key '22' in a chunked table,
# requesting only the GENE1 and GENE2 columns:
variants = allel.VariantChunkedTable(callset['22']['variants'], 
                                     names=['GENE1', 'GENE2'])

In [93]:
# Build one single-column DataFrame for each gene column of the variant table:
g_df = pd.DataFrame(variants['GENE1'], columns = ['GENE1'])
g_df2 = pd.DataFrame(variants['GENE2'], columns = ['GENE2'])

# Place the two columns side by side (axis=1) in a single DataFrame:
gene_zarr_data = pd.concat([g_df, g_df2], axis=1)

In [46]:
# Flatten the GENE1 and GENE2 columns into one 1-D array of gene labels:
column_values = gene_zarr_data[["GENE1", "GENE2"]].values.ravel()

# Keep each gene once, in order of first appearance, and leave out the
# 'None' / 'nan' placeholder strings. Filtering (instead of list.remove)
# means the cell still runs when one of the placeholders is not present,
# whereas list.remove would raise ValueError in that case.
unique_values = [
    gene for gene in pd.unique(column_values)
    if gene not in ('None', 'nan')
]

In [47]:
# Number of distinct gene names found across GENE1 and GENE2:
len(unique_values)

816

In [50]:
# Put the gene list into a one-column DataFrame named GENE, then preview it:
uniq_genes = pd.DataFrame(data=unique_values, columns=['GENE'])
uniq_genes.head(n=5)

Unnamed: 0,GENE
0,LOC105379418
1,FRG1FP
2,LOC107984037
3,LOC102723769
4,LOC107987323


In [97]:
# Select the rows whose GENE equals 'ABHD17AP5' (an empty result means the
# gene is not in our data):
uniq_genes.loc[uniq_genes['GENE'].eq('ABHD17AP5')]

Unnamed: 0,GENE


## Aliases Data:
- Gene and alias data were downloaded from NCBI Gene: https://www.ncbi.nlm.nih.gov/gene/

In [51]:
# Please update to include path to downloaded data:
gene_alias = pd.read_csv(path_to_gene_alias_data)

# Dropping any rows with nan values in Aliases column:
gene_alias = gene_alias.dropna(how='any',
                    subset=['Aliases']).rename(columns={"Symbol": "GENE"})

# Retaining just Gene and Aliases columns:
gene_alias = gene_alias[['GENE', 'Aliases']]

In [52]:
# Preview the first rows of the alias table:
gene_alias.head()

Unnamed: 0,GENE,Aliases
0,A4GALT,"A14GALT1, Gb3S, P(k), P1, P1PK, PK, A4GALT"
1,ABCD1P4,ALD22Q11
2,ABHD17AP4,"FAM108A5, FAM108A5P"
3,ABHD17AP5,"FAM108A6, FAM108A6P"
5,ACO2,"ACONM, HEL-S-284, ICRD, OCA8, OPA9"


# Merge the Dataframes:

- Here we are comparing the aliases data to the genes that are present in our website data.
- We are merging the two dataframes to obtain the aliases for those genes.


In [53]:
# Left join on GENE: every gene from our data is kept, and its aliases are
# attached where the alias table has a matching entry.
merged_gene_data = uniq_genes.merge(gene_alias, on='GENE', how='left')
merged_gene_data.head(n=5)

Unnamed: 0,GENE,Aliases
0,LOC105379418,
1,FRG1FP,
2,LOC107984037,
3,LOC102723769,
4,LOC107987323,


In [98]:
# Keep only the rows where the merge produced a (non-missing) Aliases value:
genes_w_aliases = merged_gene_data[merged_gene_data['Aliases'].notna()]
genes_w_aliases.head(n=5)

Unnamed: 0,GENE,Aliases
7,OR11H1,"OR11H12, OR22-1"
9,POTEH,"A26C3, ACTBL1, CT104.7, POTE22"
10,POTEH-AS1,LA16c-3G11.5
11,PSLNR,LA16c-83F12.6
15,CCT8L2,CESK1


In [99]:
# Number of genes in our data that have at least one alias:
len(genes_w_aliases)

472

- There are 816 unique genes in our Chromosome 22 data; 472 of these have aliases according to the NCBI data.

In [100]:
# Pull the two columns out as plain Python lists (same row order in both):
g_list = genes_w_aliases['GENE'].tolist()
a_list = genes_w_aliases['Aliases'].tolist()

# Pair each gene name with its alias string to build the lookup dictionary:
gene_alias_dict = {gene: aliases for gene, aliases in zip(g_list, a_list)}

In [101]:
# Inspect the gene -> aliases mapping:
gene_alias_dict

{'OR11H1': 'OR11H12, OR22-1',
 'POTEH': 'A26C3, ACTBL1, CT104.7, POTE22',
 'POTEH-AS1': 'LA16c-3G11.5',
 'PSLNR': 'LA16c-83F12.6',
 'CCT8L2': 'CESK1',
 'TPTEP1': 'psiTPTE22',
 'ANKRD62P1-PARP4P3': 'VWFP1-ANKRD62P1-PARP4P3',
 'XKR3': 'XRG3, XTES',
 'HSFY1P1': 'CECR8, HSFYL1, HSFYP1, NCRNA00016',
 'CECR7': 'SAHL1',
 'IL17RA': 'CANDF5, CD217, CDw217, IL-17RA, IL17R, IMD51, hIL-17R',
 'TMEM121B': 'CECR6',
 'HDHD5': 'CECR5',
 'HDHD5-AS1': 'CECR4, CECR5-AS1, NCRNA00017',
 'ADA2': 'ADGF, CECR1, IDGFL, PAN, SNEDS, VAIHS',
 'SLC25A18': 'GC2',
 'ATP6V1E1': 'ARCL2C, ATP6E, ATP6E2, ATP6V1E, P31, Vma4',
 'BCL2L13': 'BCL-RAMBO, Bcl2-L-13, MIL1',
 'BID': 'FP497',
 'MIR3198-1': 'MIR3198, mir-3198-1',
 'LINC00528': 'C22orf37',
 'MICAL3': 'MICAL-3',
 'MIR648': 'MIRN648, hsa-mir-648',
 'PEX26': 'PBD7A, PBD7BM1T, Pex26pM1T, PEX26',
 'TUBA8': 'CDCBM8, TUBAL2',
 'USP18': 'ISG43, PTORCH2, UBP43',
 'FAM230D': 'LINC02592',
 'FAM230J': 'LINC01660',
 'FAM230A': 'DGCR15',
 'GGTLC3': 'GGT',
 'RIMBP3': 'RIM-BP3, RI

# Saving dictionary to json file:

In [102]:
# Serialise the mapping to JSON text and write it to disk:
with open('Data/gene_aliases.json', mode='w') as f:
    f.write(json.dumps(gene_alias_dict))

- The dictionary is saved so that it can be loaded and searched when a user submits a query with a gene name that doesn't match the ones stored in the GENE1 or GENE2 columns of our data.

# Example of Code in Web-App:
- Below is an example of how this code would run within the web-app.

In [103]:
def load_json(filename):
    """Read a JSON file and return its parsed content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file (e.g. a dict for a JSON object).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON text is UTF-8; state it explicitly so decoding does not depend on
    # the platform's locale default (ASCII-only files read the same either way).
    with open(filename, encoding='utf-8') as f_in:
        return json.load(f_in)

In [105]:
def get_key(val, gene_dict):
    """Return the gene name whose aliases include ``val`` as a whole alias.

    Aliases are compared as complete, case-sensitive names, so a fragment of
    an alias (e.g. 'GC' against 'GC2') or an empty string does not match.

    Args:
        val: The alias to look up.
        gene_dict: Mapping of gene name to its aliases, given either as one
            comma-separated string (e.g. 'XRG3, XTES') or as an iterable of
            alias strings.

    Returns:
        The first key (in dictionary order) whose aliases contain ``val``,
        or None when no gene has that alias.
    """
    for key, value in gene_dict.items():
        if isinstance(value, str):
            # Split the comma-separated string into individual alias names;
            # a plain `val in value` would be a substring test on the string.
            aliases = [alias.strip() for alias in value.split(',')]
        else:
            aliases = value
        if val in aliases:
            return key
    return None
        

In [None]:
# Load the saved gene -> aliases dictionary back from the JSON file:
trial_dict = load_json('Data/gene_aliases.json')

In [106]:
# Look up the key (the gene name used in our data) for an alias submitted by a user:
get_key('AIFL', trial_dict)

'AIFM3'

In [108]:
# Look up a name that is not an alias of any gene in the dictionary:
trial = get_key('ROFL', trial_dict)

In [109]:
# Check that the failed lookup produced None (identity test, as PEP 8 recommends):
trial is None

True