## Imports

In [1]:
import allel
import zarr
import pandas as pd
import numpy as np
from collections import Counter
import h5py
from tqdm import tqdm
tqdm.pandas()

## Functions

In [2]:
def flatten(t):
    """Concatenate the items of every iterable in ``t`` into a single flat list.

    Only one level of nesting is removed; the order of the items is preserved.
    """
    flat = []
    for group in t:
        flat.extend(group)
    return flat

def get_gene_coordinates(d, row):
    """Return the gene whose interval contains the position ``row``.

    Parameters
    ----------
    d : dict
        Maps ``(start, end)`` tuples to a gene name.
    row : int
        Position to look up.

    Returns
    -------
    The value stored for the first interval (in dict iteration order) for
    which ``start <= row <= end`` (both bounds inclusive), or ``None`` when no
    interval contains ``row``.
    """
    # Look the match up in the mapping that was passed in, not in a
    # notebook-level global, so the function works with any interval dict.
    for (start, end), gene in d.items():
        if start <= row <= end:
            return gene
    return None

def fill_gene(overlapped_idx, n_overlaps, df, genes=None):
    """Write the names of overlapped genes into the ``GENE2`` column of ``df``.

    For every positional index ``x`` in ``overlapped_idx`` the rows
    ``x + 1 .. x + n_overlaps - 1`` of the gene table are taken as the genes
    overlapped by the gene at row ``x``. Every row of ``df`` whose ``POS``
    lies inside ``[txStart, txEnd]`` (inclusive) of such a gene gets that
    gene's ``name2`` in ``GENE2``. If several genes cover the same position,
    the one processed last wins.

    Parameters
    ----------
    overlapped_idx : iterable of int
        Positional (``iloc``) indexes of the overlapping ("major") genes.
    n_overlaps : int
        Number of overlapped genes per major gene, plus one.
    df : pandas.DataFrame
        Frame with a ``POS`` column; modified in place.
    genes : pandas.DataFrame, optional
        Gene table with ``txStart``, ``txEnd`` and ``name2`` columns. Defaults
        to the notebook-level ``unique_genes`` table.

    Returns
    -------
    None
    """
    if genes is None:
        genes = unique_genes  # notebook-level default gene table

    # Positional indexes of the genes sitting right after each major gene.
    member_positions = [
        pos for x in overlapped_idx for pos in range(x + 1, x + n_overlaps)
    ]
    overlap_df = genes.iloc[member_positions]

    for start, end, gene in zip(
        overlap_df['txStart'], overlap_df['txEnd'], overlap_df['name2']
    ):
        # Assign to the frame that was passed in (not a global) and label
        # all matching rows in one vectorised step.
        inside_gene = (df['POS'] >= start) & (df['POS'] <= end)
        df.loc[inside_gene, 'GENE2'] = gene

def find_overlaps(df, num_overlaps):
    """Find genes that extend past the start of the gene ``num_overlaps`` rows below.

    A row at positional index ``i`` is reported when its ``txEnd`` is strictly
    greater than the ``txStart`` of row ``i + num_overlaps``.

    Parameters
    ----------
    df : pandas.DataFrame
        Gene table with ``txStart`` and ``txEnd`` columns.
        NOTE(review): the comparison is only meaningful if the rows are
        ordered by ``txStart`` - confirm that the input is sorted.
    num_overlaps : int
        How many rows ahead to compare against; must be at least 1.

    Returns
    -------
    list of int
        Positional indexes (not index labels) of the matching rows, ascending.

    Raises
    ------
    ValueError
        If ``num_overlaps`` is smaller than 1.
    """
    if num_overlaps < 1:
        raise ValueError("num_overlaps must be >= 1")

    starts = list(df.txStart)
    ends = list(df.txEnd)

    overlaps = []
    # Only look-ahead indexing is performed, so the sole bound required is
    # i + num_overlaps < len(df). The leading rows are checked as well; the
    # earlier lower-bound guard skipped them without need.
    for index in range(len(starts) - num_overlaps):
        if ends[index] > starts[index + num_overlaps]:
            overlaps.append(index)

    return overlaps

def df_to_zarr(df, path):
    """Save every column of ``df`` as its own zarr array under ``path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are written one by one.
    path : str
        Directory the per-column arrays are written into. A trailing path
        separator is optional; each column is stored at ``path/<column name>``.

    Notes
    -----
    Object-dtype columns are converted with ``astype('U')`` before saving, so
    every value, including missing-value placeholders, is stored as text.
    NOTE(review): NaN presumably ends up as the string ``'nan'`` and ``None``
    as ``'None'`` - confirm that downstream readers expect this.
    """
    import os  # local import keeps this cell self-contained

    for col_name in df.columns:
        array = df[col_name].values

        # If dtype of array is object, save it as a string array instead:
        if array.dtype == 'O':
            array = array.astype('U')

        # Join instead of concatenating so that a missing trailing separator
        # in `path` does not glue the column name onto the directory name.
        save_loc = os.path.join(path, str(col_name))
        zarr.save(save_loc, array)

### Loading data

In [3]:
zarr_path = '../data/ALL_30X_Chr22_GR38.zarr/'
gene_path = '../data/all_RefSeq_gene_names.gz'

# Open read-only: the callset is only read in this notebook, and the default
# append mode would silently create an empty group if the path were mistyped.
callset = zarr.open_group(zarr_path, mode='r')
callset.tree(expand=True)

Tree(nodes=(Node(disabled=True, name='/', nodes=(Node(disabled=True, name='22', nodes=(Node(disabled=True, nam…

In [4]:
# Key of the chromosome group to read from the callset.
chrom = "22"
# Variant positions for that chromosome; the [:] slice reads the whole array.
POS = callset[chrom]['variants']['POS'][:]

In [5]:
# Tab-separated gene table (the displayed columns include #name, txStart, txEnd, name2).
genes = pd.read_table(gene_path)

In [6]:
# Keep one row per (txStart, txEnd, name2) combination. A gene name can still
# occur several times afterwards, with differing txStart / txEnd values.
genes = genes.drop_duplicates(subset=['txStart', 'txEnd', 'name2'])

In [7]:
# Example of one gene name that still has several rows with different coordinates.
genes.loc[genes['name2'] == 'RABL2B']

Unnamed: 0,#name,txStart,txEnd,name2
3489,XM_017028546.1,50767499,50783705,RABL2B
3494,XM_017028544.1,50767499,50783648,RABL2B
3495,XM_017028551.1,50767499,50782683,RABL2B
3496,NM_001350016.2,50767505,50783636,RABL2B
3520,XM_017028559.2,50772189,50782683,RABL2B


* Dropping duplicates

In [8]:
# The txStart / txEnd values of a gene differ only for lack of consensus, so the
# first row seen for each gene name is taken as its representative start and end.
unique_genes = genes.drop_duplicates(subset='name2', keep='first')

### Getting overlapping gene index

* There are genes that cover up to 5 other, smaller genes within their coding region

In [9]:
# Show positional rows 411-417 of unique_genes as an example of one large gene
# followed by several smaller genes that lie inside its txStart-txEnd span.
unique_genes.iloc[411:418]

Unnamed: 0,#name,txStart,txEnd,name2
1763,XM_011530410.3,32507819,32958822,SYN3
1777,XR_001755503.1,32578528,32587011,SYN3-AS1
1779,XR_938177.1,32587213,32589050,LOC105373003
1780,XR_001755504.1,32619341,32628543,LOC107985545
1781,XR_938176.3,32640947,32663267,LOC105373002
1782,NM_000362.5,32801704,32863041,TIMP3
1783,NR_134617.1,33108528,33116294,LINC01640


In [10]:
# For k = 1..5: positional indexes of the genes whose txEnd lies beyond the
# txStart of the gene k rows further down in unique_genes.
overlaps_1, overlaps_2, overlaps_3, overlaps_4, overlaps_5 = (
    find_overlaps(unique_genes, k) for k in range(1, 6)
)

In [11]:
# Preview the first five indexes of every overlap list, one list per line.
for found in (overlaps_1, overlaps_2, overlaps_3, overlaps_4, overlaps_5):
    print(found[:5])

[9, 11, 13, 17, 29]
[33, 72, 97, 102, 115]
[33, 102, 237, 366, 411]
[237, 411, 418, 556, 729]
[411, 729]


In [12]:
# An index reported for k rows ahead can also be reported for k + 1 rows ahead.
# Keep each index only in the deepest list it appears in, by removing from every
# list the indexes that the next-deeper list contains as well.
dd_overlaps_1, dd_overlaps_2, dd_overlaps_3, dd_overlaps_4 = (
    [idx for idx in shallower if idx not in deeper]
    for shallower, deeper in zip(
        (overlaps_1, overlaps_2, overlaps_3, overlaps_4),
        (overlaps_2, overlaps_3, overlaps_4, overlaps_5),
    )
)

In [13]:
# De-duplicated index lists ordered from 1 to 5 overlapped genes. The order
# matters: the fill loop further down pairs these lists positionally with 2..6.
all_overlap_indexes = [dd_overlaps_1,dd_overlaps_2,dd_overlaps_3,dd_overlaps_4,overlaps_5]

Generating an empty DataFrame whose gene columns have type `str`, for compatibility when assigning values with `.at`

In [14]:
# Start from an empty frame so that the two gene columns are declared with a
# string dtype before any values are written to them.
gene_df = pd.DataFrame({'POS':pd.Series(dtype='int'),
                        'GENE1': pd.Series(dtype='str'), 'GENE2': pd.Series(dtype='str')})
# Assigning POS gives the frame one row per variant position; GENE1 and GENE2
# hold no values yet (they show as blank in the preview).
gene_df['POS'] = POS
gene_df.head()

Unnamed: 0,POS,GENE1,GENE2
0,10510061,,
1,10510077,,
2,10510103,,
3,10510105,,
4,10510119,,


### Filling in non-overlapping genes

In [15]:
# Sorted, de-duplicated array of every index found in any of the overlap lists.
unique_overlapping_indexes = np.unique(
    [idx for group in all_overlap_indexes for idx in group]
)

In [16]:
# Gene names currently present in gene_df.GENE2; [1:] skips the first unique value.
# NOTE(review): GENE2 is created without values and is only filled by fill_gene()
# in the "Filling in overlapping genes" section further down, so on a fresh
# top-to-bottom run this list is presumably empty and nono_df equals
# unique_genes - confirm whether overlapped genes were meant to be excluded here.
# NOTE(review): [1:] presumably assumes the first unique value is the
# missing-value placeholder - verify.
overlapped_genes = gene_df.GENE2.unique()[1:]
nono_df = unique_genes[~unique_genes.name2.isin(overlapped_genes)] # non overlapping genes df
nono_df.head()

Unnamed: 0,#name,txStart,txEnd,name2
0,XR_950596.3,10742023,10753053,LOC105379418
2,NR_132320.1,10940596,10961529,FRG1FP
3,XR_001755416.2,11825296,11834829,LOC107984037
4,NR_110761.1,11897405,11956534,LOC102723769
5,XR_002958735.1,12097954,12100246,LOC107987323


In [17]:
# Lookup table: (txStart, txEnd) -> gene name, in the row order of nono_df.
nono_gene_info_d = {
    (start, end): gene
    for start, end, gene in zip(nono_df['txStart'], nono_df['txEnd'], nono_df['name2'])
}

In [19]:
# Label every position with the first gene interval that contains it. Each
# lookup walks the interval dict, so this is the slow step of the notebook.
gene_df['GENE1'] = gene_df['POS'].progress_map(
    lambda pos: get_gene_coordinates(nono_gene_info_d, pos)
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1927372/1927372 [01:23<00:00, 22973.13it/s]


### Filling in overlapping genes

In [20]:
# Fill GENE2 from the overlap index lists. fill_gene expects the number of
# overlapped genes plus one, so the lists for 1..5 overlaps are numbered 2..6.
for ind, val in tqdm(enumerate(all_overlap_indexes, start=2)):
    fill_gene(val, ind, df=gene_df)

5it [00:01,  2.62it/s]


In [21]:
# Spot check: positions inside the SYN3-AS1 interval (both bounds inclusive).
gene_df[gene_df['POS'].between(32578528, 32587011)]

Unnamed: 0,POS,GENE1,GENE2
1051101,32578530,SYN3,SYN3-AS1
1051102,32578534,SYN3,SYN3-AS1
1051103,32578570,SYN3,SYN3-AS1
1051104,32578588,SYN3,SYN3-AS1
1051105,32578670,SYN3,SYN3-AS1
...,...,...,...
1051510,32586887,SYN3,SYN3-AS1
1051511,32586947,SYN3,SYN3-AS1
1051512,32586973,SYN3,SYN3-AS1
1051513,32586988,SYN3,SYN3-AS1


In [22]:
# All positions that received a second (overlapped) gene label.
gene_df[gene_df['GENE2'].notna()]

Unnamed: 0,POS,GENE1,GENE2
249491,15700402,POTEH,POTEH-AS1
249492,15700435,POTEH,POTEH-AS1
249493,15700446,POTEH,POTEH-AS1
249494,15700505,POTEH,POTEH-AS1
249495,15700512,POTEH,POTEH-AS1
...,...,...,...
1925568,50783645,RPL23AP82,RABL2B
1925569,50783652,RPL23AP82,RABL2B
1925570,50783654,RPL23AP82,RABL2B
1925571,50783672,RPL23AP82,RABL2B


In [23]:
# Keep only the two gene-label columns (everything except POS) and write each
# of them to its own array under the GENE.zarr directory.
final_data = gene_df.loc[:, ['GENE1', 'GENE2']]
df_to_zarr(final_data, '../data/GENE.zarr/')

In [24]:
# Number of non-missing labels per column, i.e. how many positions got a gene.
final_data.count()

GENE1    974965
GENE2    148615
dtype: int64