# Gene matrix recipe

### 0. Preparation

In [1]:
# Import statements
import os
import yaml
import numpy as np
import pandas as pd

from collections import defaultdict

In [4]:
# Path settings
# The project root is taken to be the parent of the current working directory.
project_dir = os.path.abspath(os.pardir)
conf_dir = os.path.join(project_dir, 'conf')
gene_type_conf_path = os.path.join(conf_dir, 'gene_biotype.yaml')
path_conf_path = os.path.join(conf_dir, 'filepaths.yaml')

# Read the YAML file that maps dataset keys to file paths
with open(path_conf_path) as path_conf_file:
    path_conf = yaml.safe_load(path_conf_file)

# Join each configured path onto the project root
gencode_path, prev_gene_mat_path, alt_gene_list_path = (
    os.path.join(project_dir, path_conf[conf_key])
    for conf_key in ('GENCODE', 'An2018', 'An2018_ALTGENE')
)

In [5]:
# Load the data
# GENCODE annotation: '#' lines are skipped and the column names are supplied explicitly
gencode_columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
gencode_df = pd.read_table(gencode_path, compression='gzip', comment='#', names=gencode_columns)
is_gene_row = gencode_df['feature'] == 'gene'
gencode_df = gencode_df[is_gene_row]  # Keep only the gene records

# Previous gene matrix, read from the '8-1 Genesets' sheet
prev_gene_mat_df = pd.read_excel(prev_gene_mat_path, sheet_name='8-1 Genesets')

In [6]:
# Function to parse strings of the 'attribute' field in the GENCODE.
def parse_attr_str(attr_str):
    """Parse a ';'-separated list of 'key=value' pairs into a dictionary.

    Args:
        attr_str: A string such as 'ID=ENSG00000121410.12;gene_name=A1BG'.

    Returns:
        A dict mapping each key to its value (both strings). If a key occurs
        more than once, the last occurrence wins. An empty string yields {}.

    Raises:
        ValueError: If a non-empty fragment contains no '='.
    """
    attr_dict = {}

    for attr in attr_str.split(';'):
        if not attr.strip():
            continue  # Tolerate a trailing ';' and other empty fragments

        # Split on the first '=' only, so a value may itself contain '='
        key, value = attr.split('=', 1)
        attr_dict[key] = value

    return attr_dict

### 1. Make a list of alternative gene names and IDs 
The purpose of this step is to find deprecated genes in the *8-1 Genesets* sheet of **Supplementary Table S8** from An *et al.*, *Science*, 2018 (hereafter the **previous gene matrix**), i.e. genes whose IDs are no longer present in the reference annotation.
The reference gene list is GENCODE v33.

*Note: If you already have the file at `alt_gene_list_path` (defined in the path settings above), skip this step.*

In [23]:
# Parse values of the 'attribute' field in the GENCODE and make a set of gene IDs
gene_to_attr_dict = {}  # Key: GeneID (version suffix removed), Value: A dictionary for the information in the 'attribute' columns

for attr_str in gencode_df['attribute'].values:
    attr_dict = parse_attr_str(attr_str)

    if not attr_dict['ID'].endswith('Y'):  # Ignore pseudoautosome region (PAR_Y)
        gene_id = attr_dict['ID'].split('.')[0]
        gene_to_attr_dict[gene_id] = attr_dict

gene_id_set = set(gene_to_attr_dict.keys())

# Calculate No. of not available genes in the previous gene matrix.
# pandas string/isin operations are used instead of np.vectorize over lambdas:
# np.vectorize is only a Python loop and raises on size-0 input.
prev_gene_ids = prev_gene_mat_df['EnsemblGeneId'].str.split('.').str[0]  # Drop the version suffix
is_na_gene = ~prev_gene_ids.isin(gene_id_set)  # True where the gene ID is absent from the GENCODE
print(f'No. not available genes: {is_na_gene.sum()}')
na_gene_df = prev_gene_mat_df[is_na_gene]
na_gene_df.sum(0)  # Column-wise totals over the not available genes

No. not available genes: 470


Genes                             AC000041.2AC002310.4AC002384.2AC002465.1AC0024...
EnsemblGeneId                     ENSG00000274457.1ENSG00000260869.1ENSG00000283...
ASD associated genes (FDR≤0.1)                                                    0
ASD associated genes (FDR≤0.3)                                                    0
Midfetal co-expression genes                                                      0
Brain expressed genes                                                             0
Constrained genes (pLI ≥ 0.9)                                                     0
Postsynaptic density genes                                                        0
Developmental delay genes                                                         0
CHD8 targets                                                                      0
FMRP targets                                                                      0
Protein Coding                                                              

In [10]:
# The list of not available genes with their alternative names and IDs
# The genes in the gene lists except GENCODE biotypes and results of An et al., 2018 were considered.
# Columns: gene, gene_id, alt_gene, alt_gene_id
#   gene / gene_id         : name and versioned ID as listed in the previous gene matrix
#   alt_gene / alt_gene_id : name and versioned ID that replace them
# Ref: https://www.genecards.org, http://asia.ensembl.org
na_gene_entries = [
    ['AKAP2', 'ENSG00000241978.9', 'PALM2AKAP2', 'ENSG00000157654.19'],
    ['BTBD8', 'ENSG00000284413.2', 'BTBD8', 'ENSG00000189195.13'],
    ['C2orf48', 'ENSG00000163009.8', 'RRM2', 'ENSG00000171848.15'],
    ['C3orf36', 'ENSG00000221972.3', 'SLCO2A1', 'ENSG00000174640.15'],
    ['C8orf44', 'ENSG00000213865.7', 'SGK3', 'ENSG00000104205.15'],
    ['C9orf47', 'ENSG00000186354.10', 'S1PR3', 'ENSG00000213694.5'],
    ['HIST1H3B', 'ENSG00000274267.1', 'H3C2', 'ENSG00000286522.1'],
    ['HIST1H3C', 'ENSG00000278272.1', 'H3C3', 'ENSG00000287080.1'],
    ['SCO2', 'ENSG00000130489.14', 'SCO2', 'ENSG00000284194.2'],
    ['TBCE', 'ENSG00000116957.12', 'TBCE', 'ENSG00000284770.2'],
    ['TMEM133', 'ENSG00000170647.3', 'ARHGAP42', 'ENSG00000165895.19'],
    # Gene symbol corrected from the transposed 'PAML2'; the ID and the alternative are unchanged
    ['PALM2', 'ENSG00000243444.7', 'PALM2AKAP2', 'ENSG00000157654.19'],
]

# Write the entries as a tab-separated table with a header row
with open(alt_gene_list_path, 'w') as alt_gene_list_file:
    print('gene', 'gene_id', 'alt_gene', 'alt_gene_id', sep='\t', file=alt_gene_list_file)

    for gene_entry in na_gene_entries:
        print(*gene_entry, sep='\t', file=alt_gene_list_file)

### 2. Make the previous gene matrix compatible with GENCODE v33

In [21]:
# Load the list of n/a genes in the previous gene matrix
alt_gene_list_df = pd.read_table(alt_gene_list_path)

# Change the previous gene names and IDs into their alternatives
# Key: prev_gene_id (with version suffix), Value: (alt_gene_name, alt_gene_id)
alt_gene_dict = {
    prev_gene_id: (alt_gene_name, alt_gene_id)
    for prev_gene_id, alt_gene_name, alt_gene_id in zip(
        alt_gene_list_df['gene_id'], alt_gene_list_df['alt_gene'], alt_gene_list_df['alt_gene_id']
    )
}

# Build the replaced columns as new lists instead of writing through `.values`.
# Whether `.values` exposes a writable view of the frame is a pandas implementation
# detail (the array is read-only under Copy-on-Write), so in-place edits made
# through it are not a reliable way to update the DataFrame.
updated_genes = [
    alt_gene_dict.get(prev_gene_id, (prev_gene_name, prev_gene_id))  # Unlisted genes keep their name and ID
    for prev_gene_name, prev_gene_id in zip(prev_gene_mat_df['Genes'], prev_gene_mat_df['EnsemblGeneId'])
]
gene_names = [gene_name for gene_name, _ in updated_genes]
gene_ids = [gene_id for _, gene_id in updated_genes]
prev_gene_mat_df = prev_gene_mat_df.assign(Genes=gene_names, EnsemblGeneId=gene_ids)

# Remove genes in pseudoautosome regions (PAR_Y).
# `.copy()` makes the filtered frame an independent object before a column is added to it.
is_par_y = prev_gene_mat_df['EnsemblGeneId'].str.endswith('_PAR_Y')
prev_gene_mat_df = prev_gene_mat_df.loc[~is_par_y].copy()

# Trim the redundant part (version suffix) of each ensemble gene ID and add them into the previous gene matrix
gene_ids = prev_gene_mat_df['EnsemblGeneId'].str.split('.').str[0].values
prev_gene_mat_df['GeneID'] = gene_ids
prev_gene_mat_df.head()

Unnamed: 0,Genes,EnsemblGeneId,ASD associated genes (FDR≤0.1),ASD associated genes (FDR≤0.3),Midfetal co-expression genes,Brain expressed genes,Constrained genes (pLI ≥ 0.9),Postsynaptic density genes,Developmental delay genes,CHD8 targets,...,Protein Coding,mis3_pro,mis3_sib,Prom_pro,Prom_sib,Prom_ActiveTSS_pro,Prom_ActiveTSS_sib,Prom_ConservedLoci_pro,Prom_ConservedLoci_sib,GeneID
0,5_8S_rRNA,ENSG00000275877.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000275877
1,5S_rRNA,ENSG00000201285.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000201285
2,7SK,ENSG00000202198.1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000202198
3,A1BG,ENSG00000121410.11,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,1,0,0,ENSG00000121410
4,A1BG-AS1,ENSG00000268895.5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ENSG00000268895


In [22]:
# A dictionary to convert names of the columns
# Key: column title in the previous gene matrix, Value: gene list name in the new gene matrix
gene_list_name_dict = {
    'ASD associated genes (FDR≤0.1)': 'ASD_TADA_FDR01',
    'ASD associated genes (FDR≤0.3)': 'ASD_TADA_FDR03',
    'Midfetal co-expression genes': 'ASD_midfetal_coexpression',
    'Constrained genes (pLI ≥ 0.9)': 'PLI90Score',
    'Postsynaptic density genes': 'PSD',
    'Developmental delay genes': 'DDD',
    'Brain expressed genes': 'BE',
    'CHD8 targets': 'CHD8_targets',
    'FMRP targets': 'FMRP_targets',
}

# Make a dictionary which is a component of the new gene matrix
# Key: GeneID, Value: {gene list name: value taken from the previous gene matrix}
prev_colnames = prev_gene_mat_df.columns.values
col_idx_dict = dict(zip(prev_colnames, range(len(prev_colnames))))
prev_gene_mat_values = prev_gene_mat_df.values
gene_mat_dict = {}

for prev_gene_entry in prev_gene_mat_values:
    gene_dict = {}

    for prev_list_name, gene_list_name in gene_list_name_dict.items():
        gene_dict[gene_list_name] = prev_gene_entry[col_idx_dict[prev_list_name]]

    gene_id = prev_gene_entry[col_idx_dict['GeneID']]

    # Rows sharing a GeneID are merged by OR-ing their gene list values
    if gene_id in gene_mat_dict:
        same_gene_dict = gene_mat_dict[gene_id]
        gene_dict = {
            gene_list_name: value | same_gene_dict[gene_list_name]
            for gene_list_name, value in gene_dict.items()
        }

    gene_mat_dict[gene_id] = gene_dict

# Parse the GENCODE file and update the dictionary
gene_to_attr_dict = {}  # Key: GeneID, Value: A dictionary for the information in the 'attribute' columns

for attr_str in gencode_df['attribute'].values:
    attr_dict = parse_attr_str(attr_str)
    versioned_gene_id = attr_dict['ID']

    if versioned_gene_id.endswith('Y'):  # Ignore pseudoautosome region (PAR_Y)
        continue

    gene_to_attr_dict[versioned_gene_id.split('.')[0]] = attr_dict

for gene_id, attr_dict in gene_to_attr_dict.items():
    # Genes absent from the previous gene matrix start with 0 in every gene list
    if gene_id not in gene_mat_dict:
        gene_mat_dict[gene_id] = dict.fromkeys(gene_list_name_dict.values(), 0)

    gene_dict = gene_mat_dict[gene_id]
    gene_dict['gene_id'] = attr_dict['ID']
    gene_dict['gene_name'] = attr_dict['gene_name']

# Make a gene matrix
column_order = ['gene_id', 'gene_name', *gene_list_name_dict.values()]
gene_mat_df = pd.DataFrame.from_dict(gene_mat_dict, orient='index')[column_order]  # Change the column order
gene_mat_df = gene_mat_df.dropna()  # Rows without GENCODE information have missing gene_id/gene_name and are removed
gene_mat_df.head()

Unnamed: 0,gene_id,gene_name,ASD_TADA_FDR01,ASD_TADA_FDR03,ASD_midfetal_coexpression,PLI90Score,PSD,DDD,BE,CHD8_targets,FMRP_targets
ENSG00000275877,ENSG00000275877.1,AC018688.1,0,0,0,0,0,0,0,0,0
ENSG00000201285,ENSG00000201285.1,RNA5SP524,0,0,0,0,0,0,0,0,0
ENSG00000202198,ENSG00000202198.1,AL162581.1,0,0,0,0,0,0,0,0,0
ENSG00000121410,ENSG00000121410.12,A1BG,0,0,0,0,0,0,0,0,0
ENSG00000268895,ENSG00000268895.6,A1BG-AS1,0,0,0,0,0,0,0,0,0


### 3. Add GENCODE biotypes as gene list columns into the gene matrix

In [24]:
# Add GENCODE biotypes
with open(gene_type_conf_path) as gene_type_conf_file:
    biotype_dict = yaml.safe_load(gene_type_conf_file)

# One 0/1 column per biotype category: 1 when the gene's 'gene_type' is listed under the category
for biotype_category, biotypes in biotype_dict.items():
    biotype_set = set(biotypes)
    gene_mat_val_dict = {
        gene_id: int(attr_dict['gene_type'] in biotype_set)
        for gene_id, attr_dict in gene_to_attr_dict.items()
    }
    gene_mat_df[biotype_category] = pd.Series(gene_mat_val_dict)  # Aligned to the matrix by GeneID index

gene_mat_df.head()

Unnamed: 0,gene_id,gene_name,ASD_TADA_FDR01,ASD_TADA_FDR03,ASD_midfetal_coexpression,PLI90Score,PSD,DDD,BE,CHD8_targets,FMRP_targets,Protein_Coding,Long_ncRNA,Small_ncRNA,Pseudogene,IG_TR_Gene
ENSG00000275877,ENSG00000275877.1,AC018688.1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
ENSG00000201285,ENSG00000201285.1,RNA5SP524,0,0,0,0,0,0,0,0,0,0,0,1,0,0
ENSG00000202198,ENSG00000202198.1,AL162581.1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
ENSG00000121410,ENSG00000121410.12,A1BG,0,0,0,0,0,0,0,0,0,1,0,0,0,0
ENSG00000268895,ENSG00000268895.6,A1BG-AS1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
