In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import anndata as ad
from collections import defaultdict
from itertools import product

## RegNetwork

In [55]:
# Every reference network gets the same treatment: read the raw table as
# strings, upper-case every entry, optionally keep only the leading columns,
# and write the result next to the input as a tab-separated file.
def _uppercase_network(src_path, dst_path, keep_cols=None, **read_kwargs):
    """Read a table, upper-case it, save it as TSV and return the array."""
    table = np.char.upper(np.genfromtxt(src_path, dtype=str, **read_kwargs))
    if keep_cols is not None:
        table = table[:, :keep_cols]
    np.savetxt(dst_path, table, delimiter='\t', fmt='%s')
    return table


# Reader options shared by the comma-separated inputs that carry a header row.
_CSV_OPTIONS = {'delimiter': ',', 'skip_header': True}

# RegNetwork tables are read with the genfromtxt defaults.
mouse_RN = _uppercase_network(
    '../python/data/RN_mouse.tsv', '../python/data/mouse_RN.tsv')
human_RN = _uppercase_network(
    '../python/data/RN_human.tsv', '../python/data/human_RN.tsv')

# TRRUST: only the first two columns are kept.
mouse_TRRUST = _uppercase_network(
    '../python/data/trrust_mouse.tsv', '../python/data/mouse_TRRUST.tsv',
    keep_cols=2)
human_TRRUST = _uppercase_network(
    '../python/data/trrust_human.tsv', '../python/data/human_TRRUST.tsv',
    keep_cols=2)

# BEELINE non-specific networks (CSV, first line skipped).
mouse_BEELINE = _uppercase_network(
    '../python/data/BEELINE_non_specific_mouse.csv',
    '../python/data/mouse_BEELINE.tsv', **_CSV_OPTIONS)
human_BEELINE = _uppercase_network(
    '../python/data/BEELINE_non_specific_human.csv',
    '../python/data/human_BEELINE.tsv', **_CSV_OPTIONS)

# BEELINE STRING networks (CSV, first line skipped).
mouse_STRING = _uppercase_network(
    '../python/data/BEELINE_STRING_mouse.csv',
    '../python/data/mouse_STRING.tsv', **_CSV_OPTIONS)
human_STRING = _uppercase_network(
    '../python/data/BEELINE_STRING_human.csv',
    '../python/data/human_STRING.tsv', **_CSV_OPTIONS)

In [17]:
# Read the mouse RegNetwork node and link tables, one right-stripped line per
# record.
with open('../data/networks/mouse/RegNetwork/mouse.node', 'r') as node_file:
    mouse_nodes = [line.rstrip() for line in node_file]
with open('../data/networks/mouse/RegNetwork/mouse.source', 'r') as link_file:
    mouse_links = [line.rstrip() for line in link_file]

# Every gene in the node table (second tab-separated field) starts with the
# placeholder -1; the first link mentioning the gene supplies its code.
RN_gene_dict = dict.fromkeys((row.split('\t')[1] for row in mouse_nodes), -1)
RN_link_list = []
for row in mouse_links:
    first, first_code, second, second_code = row.split('\t')
    for name, code in ((first, first_code), (second, second_code)):
        if RN_gene_dict[name] == -1:
            RN_gene_dict[name] = code
    RN_link_list.append([first, second])

# Genes that never appeared in any link still hold the placeholder.
non_connected_node_count = sum(
    1 for code in RN_gene_dict.values() if code == -1)
print('# of Non-connected nodes: ' + str(non_connected_node_count))

# Undirected graph with one edge per link record.
g = nx.Graph()
g.add_edges_from(RN_link_list)

# of Non-connected nodes: 0


In [18]:
import pickle

In [19]:
# Serialise the RegNetwork graph to a file named 'g' in the working directory.
with open('g', 'wb') as graph_file:
    pickle.dump(g, graph_file)

In [43]:
# Scratch: load the raw mouse RN table as an array of strings for inspection.
aaa = np.genfromtxt('../python/data/RN_mouse.tsv', dtype=str)

In [None]:
# This cell originally held only the unfinished expression `aaa.`, which is a
# SyntaxError and stops Restart & Run All. Show the table's dimensions instead
# (a guess at the intended inspection; adjust if another attribute was meant).
aaa.shape

## GSE70499

In [92]:
import gzip
import os
import pickle
import shutil
import urllib
import urllib.parse
import urllib.request
from itertools import product

import anndata as ad
import numpy as np
import pandas as pd

# Load existing network data. 
def load_network(species, benchmark):
    """Read a prepared reference network from ``data/<species>_<benchmark>.tsv``.

    Args:
        species: First component of the file name (e.g. ``'mouse'``).
        benchmark: Second component of the file name (e.g. ``'RN'``).

    Returns:
        The file contents as a NumPy array of strings, parsed by
        ``np.genfromtxt`` with its default delimiter. The path is relative to
        the current working directory.
    """
    network_path = 'data/{}_{}.tsv'.format(species, benchmark)
    return np.genfromtxt(network_path, dtype=str)

# General utility functions for downloading raw data from GEO.
def get_geo_url(gse_id, file_name):
    """Build the GEO download URL for one file of a series.

    Args:
        gse_id: Series accession, sent as the ``acc`` query parameter.
        file_name: File name, sent as the ``file`` query parameter.

    Returns:
        The download URL; query values are percent-encoded with
        ``urllib.parse.quote``.
    """
    query = urllib.parse.urlencode(
        {'acc': gse_id, 'format': 'file', 'file': file_name},
        quote_via=urllib.parse.quote,
    )
    return 'https://www.ncbi.nlm.nih.gov/geo/download/?' + query

def download_geo(dir_path, gse_id, file_name):
    """Download a gzip-compressed GEO file and decompress it into ``dir_path``.

    Args:
        dir_path: Existing directory that receives both the downloaded archive
            and the decompressed file.
        gse_id: GEO series accession, forwarded to ``get_geo_url``.
        file_name: Name of the file to request, forwarded to ``get_geo_url``.

    Returns:
        Path of the decompressed file: ``file_name`` without a trailing
        ``'.gz'``, joined onto ``dir_path``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download fails,
        and whatever ``gzip`` raises when the payload is not gzip data.
    """
    print('Downloading file from ' + gse_id, '...')
    download_url = get_geo_url(gse_id, file_name)

    # Remove the '.gz' *suffix*. str.rstrip('.gz') strips any run of the
    # characters '.', 'g' and 'z', so e.g. 'log.gz' would become 'lo'.
    gz_suffix = '.gz'
    if file_name.endswith(gz_suffix):
        local_gz_path = os.path.join(dir_path, file_name)
        local_file_path = local_gz_path[:-len(gz_suffix)]
    else:
        # Keep archive and output apart: if both paths were equal, opening the
        # output for writing would truncate the archive before it is read.
        local_file_path = os.path.join(dir_path, file_name)
        local_gz_path = local_file_path + gz_suffix

    urllib.request.urlretrieve(download_url, local_gz_path)

    with gzip.open(local_gz_path, 'rb') as f_in, \
            open(local_file_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

    return local_file_path

def build_initial_ground_truth(g2g, species, bm):
    """Translate a reference network from gene names into gene-ID pairs.

    Args:
        g2g: Mapping from gene name to an iterable of gene IDs.
        species: Species component of the network file, passed to
            ``load_network``.
        bm: Benchmark component of the network file, passed to
            ``load_network``.

    Returns:
        A list of ``(id_a, id_b)`` tuples: for every network row whose first
        two entries are both keys of ``g2g``, the Cartesian product of the two
        ID collections. Rows with an unknown gene are skipped.
    """
    id_pairs = []
    for edge in load_network(species, bm):
        if edge[0] not in g2g or edge[1] not in g2g:
            continue
        id_pairs.extend(product(g2g[edge[0]], g2g[edge[1]]))
    return id_pairs

class BenchmarkData(ad.AnnData):
    """AnnData container that also tracks gene metadata and benchmark links.

    Entries of ``.uns`` used by the methods below:
        ``gene_dict``: gene ID -> metadata dict containing ``'gene_name'``.
        ``gene_name2id`` and ``genename_to_geneid``: gene name -> list of gene
            IDs. The same mapping is stored under both keys because readers of
            this data use either spelling.
    """

    def __init__(self, X=None):
        """Create the container from a data matrix, keeping the matrix dtype.

        Args:
            X: Data matrix exposing ``.dtype`` (e.g. a NumPy array), or ``None``
                for an empty container.
        """
        if X is None:
            # The declared default must not crash on ``X.dtype``.
            super().__init__()
        else:
            super().__init__(X=X, dtype=X.dtype)

    def update_gene_meta(self):
        """Restrict ``uns['gene_dict']`` to the current variables if needed.

        When the number of entries differs from ``n_vars`` the dict is rebuilt
        from ``var_names`` and the name -> ID mapping is refreshed.
        """
        if len(self.uns['gene_dict']) != self.n_vars:
            self.uns['gene_dict'] = {
                k: self.uns['gene_dict'][k] for k in self.var_names}
            self.update_gene_name2id()

    def update_gene_name2id(self):
        """Rebuild the gene name -> list of gene IDs mapping from ``gene_dict``."""
        name2id = {}
        for gene_id, gene_info in self.uns['gene_dict'].items():
            name2id.setdefault(gene_info['gene_name'], []).append(gene_id)
        # Previously only 'gene_name2id' was written while get_ground_truth
        # read 'genename_to_geneid', which raised KeyError. Store both keys.
        self.uns['gene_name2id'] = name2id
        self.uns['genename_to_geneid'] = name2id

    def get_ground_truth(self, bm):
        """Return the links stored under ``uns[bm]`` as gene-ID pairs.

        Args:
            bm: Key of ``.uns`` holding an iterable of two-element links.

        Returns:
            A list of ``(id_a, id_b)`` tuples. A link whose endpoints are both
            known gene names is expanded to every combination of their IDs. A
            link whose endpoints are both gene IDs present in ``gene_dict`` is
            kept unchanged. All other links are dropped.
        """
        self.update_gene_meta()
        if 'genename_to_geneid' not in self.uns:
            # gene_dict already matched n_vars, so the mapping was never built.
            self.update_gene_name2id()
        g2g = self.uns['genename_to_geneid']
        known_ids = self.uns['gene_dict']

        updated_links = []
        for link in self.uns[bm]:
            first, second = link[0], link[1]
            if first in g2g and second in g2g:
                updated_links.extend(product(g2g[first], g2g[second]))
            elif first in known_ids and second in known_ids:
                # Links stored as ID pairs would otherwise never match g2g,
                # which is keyed by gene name.
                updated_links.append((first, second))
        return updated_links

# Load benchmark datasets.
def load_gse_70499(dir_path, force_reload=False):
    """Load GSE70499 as a ``BenchmarkData`` object, downloading it if needed.

    The processed object is cached as ``<dir_path>/GSE70499/GSE70499.pkl`` and
    reused on later calls unless ``force_reload`` is true.

    Args:
        dir_path: Parent directory; a ``GSE70499`` sub-directory is created
            inside it.
        force_reload: When true, ignore the cache and rebuild from the raw
            download.

    Returns:
        A ``BenchmarkData`` whose columns are gene IDs and whose rows are
        samples. Sample column names are split on ``'_'``: the first field
        becomes ``obs['genotype']``, the second (with ``'ZT'`` removed, as int)
        ``obs['timepoint']`` and the third the observation name. ``uns`` holds
        ``gene_dict`` plus ``benchmark_RN``, ``benchmark_TRRUST``,
        ``benchmark_BEELINE`` and ``benchmark_STRING``.
    """
    gse_id = 'GSE70499'
    # NOTE(review): the name previously lacked '.gz', which made download_geo
    # use one path for archive and output and truncate the download. GEO
    # supplementary files are presumably served gzip-compressed under a '.gz'
    # name; confirm against the GEO record.
    file_name = 'GSE70499_FINAL_master_list_of_genes_counts_MIN.sense.George_WT_v_KO_timecourse.txt.gz'
    dir_path = os.path.join(dir_path, gse_id)
    processed_file_path = os.path.join(dir_path, gse_id + '.pkl')

    # Also creates missing parent directories and tolerates an existing one.
    os.makedirs(dir_path, exist_ok=True)

    if os.path.exists(processed_file_path) and not force_reload:
        print('Loading from processed file...')
        # NOTE: unpickling executes arbitrary code; only reuse cache files
        # that this function wrote itself.
        with open(processed_file_path, 'rb') as f:
            return pickle.load(f)

    raw_file = download_geo(dir_path, gse_id, file_name)

    print('Processing...')
    raw = pd.read_csv(raw_file, sep='\t')
    raw['id'] = raw['id'].str.replace('gene:', '', regex=False)
    raw['geneSymbol'] = raw['geneSymbol'].str.upper()

    # RN_gene_dict is a notebook-level global built in the RegNetwork section.
    # NOTE(review): symbols are upper-cased above while RN_gene_dict keys are
    # taken from the node file unchanged; confirm both use the same case.
    gene_dict = {
        gene_id: {
            'gene_name': symbol,
            'in_regnetwork': (symbol in RN_gene_dict),
            'geneCoordinate': coordinate,
        }
        for gene_id, symbol, coordinate in zip(
            raw['id'], raw['geneSymbol'], raw['geneCoordinate'])
    }

    # Only the count columns remain; transpose to samples x genes.
    counts = raw.drop(columns=['geneSymbol', 'geneCoordinate'])
    dt = counts.set_index('id').transpose().reset_index()
    dt_meta = dt.pop('index').str.split('_', expand=True)

    ad_dt = BenchmarkData(dt.to_numpy())
    ad_dt.var_names = np.array(dt.columns, dtype=str)
    ad_dt.obs_names = dt_meta[2].to_numpy()
    ad_dt.obs['genotype'] = pd.Categorical(dt_meta[0])
    ad_dt.obs['timepoint'] = pd.Categorical(
        dt_meta[1].str.replace('ZT', '', regex=False).to_numpy(dtype=int)
    )
    ad_dt.uns['gene_dict'] = gene_dict
    ad_dt.update_gene_name2id()

    print('Adding benchmarks...')
    # update_gene_name2id stores the mapping under 'gene_name2id'; the former
    # lookup of 'genename_to_geneid' raised KeyError.
    name2id = ad_dt.uns['gene_name2id']
    for bm in ('RN', 'TRRUST', 'BEELINE', 'STRING'):
        ad_dt.uns['benchmark_' + bm] = build_initial_ground_truth(
            name2id, 'mouse', bm
        )

    with open(processed_file_path, 'wb') as f:
        pickle.dump(ad_dt, f)
    print('Complete!')
    return ad_dt

In [85]:
# Scratch: a 10 x 10 all-zero AnnData. This rebinds `aaa`, which an earlier
# cell used for a NumPy table.
aaa = ad.AnnData(np.zeros((10, 10)))

  aaa = ad.AnnData(np.zeros((10, 10)))


In [86]:
# Number of variables in the scratch AnnData.
aaa.n_vars

10

In [6]:
# Read the tab-separated GSE70499 table.
raw = pd.read_csv('../data/expression/GSE70499/GSE70499_FINAL_master_list_of_genes_counts_MIN.sense.George_WT_v_KO_timecourse.txt', sep='\t')

# Per-gene metadata keyed by the `id` column.
gene_dict = {
    raw.id[row]: {
        'gene_name': raw.geneSymbol[row],
        'in_regnetwork': (raw.geneSymbol[row] in RN_gene_dict),
        'geneCoordinate': raw.geneCoordinate[row],
    }
    for row in range(raw.shape[0])
}

# Drop the annotation columns in place so only `id` and the samples remain.
raw.drop(columns=['geneSymbol', 'geneCoordinate'], inplace=True)

# Samples become rows; the former column names are split on '_' into fields.
dt = raw.set_index('id').transpose().reset_index()
dt_meta = dt.pop('index').str.split('_', expand=True)
dt_array = dt.to_numpy()

# Gene name -> every gene ID carrying that name.
GSE70499_genename_to_geneid = defaultdict(list)
for gene_id, gene_info in gene_dict.items():
    GSE70499_genename_to_geneid[gene_info['gene_name']].append(gene_id)

# Expand each RegNetwork link whose two genes are both present into all
# combinations of their gene IDs.
GSE70499_links = []
for link in RN_link_list:
    if (link[0] not in GSE70499_genename_to_geneid
            or link[1] not in GSE70499_genename_to_geneid):
        continue
    GSE70499_links.extend(
        product(GSE70499_genename_to_geneid[link[0]],
                GSE70499_genename_to_geneid[link[1]]))

GSE70499 = ad.AnnData(dt_array, dtype=int)
GSE70499.var_names = np.array(dt.columns, dtype=str)
GSE70499.obs_names = dt_meta[2].to_numpy()
GSE70499.obs['genotype'] = pd.Categorical(dt_meta[0])
timepoints = dt_meta[1].str.replace('ZT', '').to_numpy(dtype=int)
GSE70499.obs['timepoint'] = pd.Categorical(timepoints)
GSE70499.uns['ground_truth'] = GSE70499_links

GSE70499.write_h5ad('../data/expression_processed/GSE70499.h5ad')

## GSE73554

In [33]:
# Read the KO and WT tables of GSE73554 (tab-separated).
raw_ko, raw_wt = (
    pd.read_csv(table_path, sep='\t')
    for table_path in (
        '../data/expression/GSE73554/GSE73554_KO_RF_Intron_Exon_RFP.txt',
        '../data/expression/GSE73554/GSE73554_WT_RF_Intron_Exon_RFP.txt',
    )
)


In [34]:
# Per-gene metadata taken from the KO table, keyed by Ensembl gene ID.
gene_dict = {
    raw_ko.Gene_Ensembl[row]: {
        'gene_name': raw_ko.Gene_Symbol[row],
        'in_regnetwork': (raw_ko.Gene_Symbol[row] in RN_gene_dict),
    }
    for row in range(raw_ko.shape[0])
}

# Remove the symbol column from both tables in place before merging.
raw_ko.drop(columns=['Gene_Symbol'], inplace=True)
raw_wt.drop(columns=['Gene_Symbol'], inplace=True)

# Left join on the Ensembl ID: every KO gene is kept.
raw = pd.merge(raw_ko, raw_wt, how='left', on='Gene_Ensembl')

In [41]:
# Samples become rows; the former column names are moved out as sample labels
# and split on '_' into metadata fields.
dt = raw.set_index('Gene_Ensembl').T.reset_index()
dt_meta_names = dt.pop('index')
dt_meta = dt_meta_names.str.split('_', expand=True)
dt_array = dt.to_numpy()

In [38]:
# Gene name -> every gene ID carrying that name.
GSE73554_genename_to_geneid = defaultdict(list)
for gene_id, gene_info in gene_dict.items():
    GSE73554_genename_to_geneid[gene_info['gene_name']].append(gene_id)

In [39]:
# Expand each RegNetwork link whose two genes are both present into all
# combinations of their gene IDs.
GSE73554_links = []
for link in RN_link_list:
    if (link[0] not in GSE73554_genename_to_geneid
            or link[1] not in GSE73554_genename_to_geneid):
        continue
    GSE73554_links.extend(
        product(GSE73554_genename_to_geneid[link[0]],
                GSE73554_genename_to_geneid[link[1]]))

In [47]:
# Assemble the AnnData object: genes as variables, full sample labels as
# observation names.
GSE73554 = ad.AnnData(dt_array)
GSE73554.var_names = np.array(dt.columns, dtype=str)
GSE73554.obs_names = dt_meta_names.to_numpy()

# Sample annotations come from the '_'-separated fields of the sample label.
sample_annotations = {
    'genotype': pd.Categorical(dt_meta[0]),
    'genosite': pd.Categorical(dt_meta[2]),
    'timepoint': pd.Categorical(dt_meta[3].to_numpy(dtype=int)),
    'sample': pd.Categorical(dt_meta[4]),
}
for annotation_name, annotation_values in sample_annotations.items():
    GSE73554.obs[annotation_name] = annotation_values

GSE73554.uns['ground_truth'] = GSE73554_links

GSE73554.write_h5ad('../data/expression_processed/GSE73554.h5ad')

  GSE73554 = ad.AnnData(dt_array)


In [46]:
# Display the summary of the assembled AnnData object.
GSE73554

AnnData object with n_obs × n_vars = 108 × 13480
    obs: 'genotype', 'genosite', 'timepoint', 'sample'
    uns: 'ground_truth'