In [1]:
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
import anndata as ad
from collections import defaultdict
from itertools import product

## RegNetwork

In [3]:
with open('../data/networks/mouse/RegNetwork/mouse.node', 'r') as f:
    mouse_nodes = [x.rstrip() for x in f.readlines()]
with open('../data/networks/mouse/RegNetwork/mouse.source', 'r') as f:
    mouse_links = [x.rstrip() for x in f.readlines()]
    
RN_gene_dict = {x.split('\t')[1]: -1 for x in mouse_nodes}
RN_link_list = []
for link in mouse_links:
    gene_1, gene_1_code, gene_2, gene_2_code = link.split('\t')
    if RN_gene_dict[gene_1] == -1:
        RN_gene_dict[gene_1] = gene_1_code
    if RN_gene_dict[gene_2] == -1:
        RN_gene_dict[gene_2] = gene_2_code
    RN_link_list.append([gene_1, gene_2]) 
    
non_connected_node_count = 0
for x in RN_gene_dict.keys():
    if RN_gene_dict[x] == -1:
        non_connected_node_count += 1
print('# of Non-connected nodes: ' + str(non_connected_node_count))

g = nx.Graph()
for e in RN_link_list:
    g.add_edge(e[0], e[1])

# of Non-connected nodes: 0


## GSE70499

In [4]:
raw = pd.read_csv('../data/expression/GSE70499/GSE70499_FINAL_master_list_of_genes_counts_MIN.sense.George_WT_v_KO_timecourse.txt', sep='\t')

gene_dict = {}
for i in range(raw.shape[0]):
    gene_dict[raw.id[i]] = {
        'gene_name': raw.geneSymbol[i],
        'in_regnetwork': (raw.geneSymbol[i] in RN_gene_dict),
        'geneCoordinate': raw.geneCoordinate[i],
    }
    
del raw['geneSymbol']
del raw['geneCoordinate']

dt = raw.set_index('id').transpose().reset_index()
dt_meta = dt['index'].str.split('_', expand=True)
del dt['index']
dt_array = dt.to_numpy()

GSE70499_genename_to_geneid = defaultdict(list)
for gene_id in gene_dict:
    GSE70499_genename_to_geneid[gene_dict[gene_id]['gene_name']].append(gene_id)
    
GSE70499_links = []
for link in RN_link_list:
    if link[0] in GSE70499_genename_to_geneid and link[1] in GSE70499_genename_to_geneid:
        for gene_id_pair in product(GSE70499_genename_to_geneid[link[0]], GSE70499_genename_to_geneid[link[1]]):
            GSE70499_links.append(gene_id_pair)

GSE70499 = ad.AnnData(dt_array, dtype=int)
GSE70499.var_names = np.array(dt.columns, dtype=str)
GSE70499.obs_names = dt_meta[2].to_numpy()
GSE70499.obs['genotype'] = pd.Categorical(dt_meta[0])
GSE70499.obs['timepoint'] = pd.Categorical(
    dt_meta[1].str.replace('ZT', '').to_numpy(dtype=int)
)
GSE70499.uns['ground_truth'] = GSE70499_links

GSE70499.write_h5ad('../data/expression_processed/GSE70499.h5ad')

In [25]:
dt_70499

ValueError: Index contains duplicate entries, cannot reshape

In [13]:
pd.(dt_70499[['experiment_id', 'genotype', 'timepoint']])

ValueError: could not broadcast input array from shape (779544,3) into shape (779544,)

In [12]:
dt_70499

Unnamed: 0,geneSymbol,value,genotype,timepoint,experiment_id
0,Gnai3,1226,KO,ZT00,13016GP18
1,Cdc45,29,KO,ZT00,13016GP18
2,H19,2,KO,ZT00,13016GP18
3,Scml2,0,KO,ZT00,13016GP18
4,Apoh,41786,KO,ZT00,13016GP18
...,...,...,...,...,...
779539,AL627182.1,0,WT,ZT20,13016GP23
779540,AL645963.1,0,WT,ZT20,13016GP23
779541,AC110235.1,0,WT,ZT20,13016GP23
779542,AC120136.3,1,WT,ZT20,13016GP23


In [11]:
dt_70499.groupbypivot('experiment_id', 'geneSymbol', 'value')

ValueError: Index contains duplicate entries, cannot reshape

In [26]:
dt_70499_mean = dt_70499.groupby(['genotype', 'timepoint', 'geneSymbol']).aggregate(['min', 'mean', 'max'])

  dt_70499_mean = dt_70499.groupby(['genotype', 'timepoint', 'geneSymbol']).aggregate(['min', 'mean', 'max'])


In [28]:
dt_70499_mean.to_parquet('aaa.parquet')

In [32]:
dt_70499_mean

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,min,mean,max
genotype,timepoint,geneSymbol,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
KO,ZT00,0610005C13Rik,2463,2526.000000,2588
KO,ZT00,0610007C21Rik,1040,1055.000000,1082
KO,ZT00,0610007L01Rik,997,1144.666667,1339
KO,ZT00,0610007P08Rik,181,188.666667,196
KO,ZT00,0610007P14Rik,670,967.666667,1264
...,...,...,...,...,...
WT,ZT20,l7Rn6,273,298.666667,337
WT,ZT20,snoU6-77,0,1.000000,2
WT,ZT20,snoU89,0,0.333333,1
WT,ZT20,snoU90,6,13.000000,25


In [41]:
dt_70499_mean.droplevel(axis=1, level=0).reset_index().pivot(['genotype', 'timepoint'], 'geneSymbol', 'mean').reset_index()

geneSymbol,genotype,timepoint,0610005C13Rik,0610007C21Rik,0610007L01Rik,0610007P08Rik,0610007P14Rik,0610007P22Rik,0610008F07Rik,0610009B14Rik,...,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,snoU6-77,snoU89,snoU90,snoU97
0,KO,ZT00,2526.0,1055.0,1144.666667,188.666667,967.666667,261.333333,104.0,0.0,...,738.0,340.0,563.333333,436.0,0.0,199.666667,1.333333,0.0,6.333333,0.333333
1,KO,ZT04,2687.666667,1089.0,831.666667,173.666667,1183.333333,255.333333,107.333333,0.0,...,711.0,300.0,543.0,381.666667,0.333333,206.0,0.333333,0.333333,2.666667,0.0
2,KO,ZT08,2620.333333,1040.333333,1007.333333,196.0,687.333333,242.666667,92.333333,0.0,...,751.666667,282.666667,617.666667,470.333333,0.0,177.666667,0.666667,1.0,5.333333,0.0
3,KO,ZT12,2849.333333,1009.0,947.666667,222.333333,702.333333,225.0,122.333333,0.333333,...,825.666667,316.333333,635.0,456.0,0.0,189.0,2.333333,0.666667,5.666667,0.666667
4,KO,ZT16,2203.333333,895.0,764.0,322.333333,649.666667,275.666667,95.666667,0.0,...,893.333333,70.666667,271.0,577.666667,0.0,298.666667,0.333333,0.0,12.333333,1.0
5,KO,ZT20,2266.0,1315.333333,809.333333,268.666667,889.333333,242.0,97.666667,0.0,...,801.0,113.666667,397.333333,563.333333,0.0,277.333333,0.333333,0.666667,7.333333,0.333333
6,WT,ZT00,2182.333333,995.0,1166.666667,206.0,1235.666667,246.0,103.333333,0.333333,...,775.0,479.666667,620.333333,406.666667,0.0,225.666667,2.0,0.0,8.333333,0.0
7,WT,ZT04,2322.666667,1097.0,966.666667,195.666667,1447.0,242.666667,83.0,0.0,...,776.333333,533.0,639.0,474.666667,0.0,198.0,1.0,1.333333,3.0,1.0
8,WT,ZT08,3075.333333,1055.0,942.333333,211.333333,1270.666667,229.0,114.666667,0.0,...,846.333333,470.0,597.666667,427.333333,0.0,169.666667,0.0,1.0,4.333333,1.0
9,WT,ZT12,2295.0,981.0,967.0,235.333333,1188.666667,179.0,90.333333,0.333333,...,760.333333,540.333333,586.666667,462.0,0.0,176.333333,1.666667,0.666667,11.333333,0.333333


In [11]:
raw_70499[['geneSymbol', 

Unnamed: 0,id,KO_ZT00_13016GP18,KO_ZT00_13016GP26,KO_ZT00_13016GP30,KO_ZT04_13016GP03,KO_ZT04_13016GP10,KO_ZT04_13016GP20,KO_ZT08_13016GP12,KO_ZT08_13016GP27,KO_ZT08_13016GP33,...,WT_ZT12_13016GP07,WT_ZT12_13016GP11,WT_ZT16_13016GP06,WT_ZT16_13016GP22,WT_ZT16_13016GP29,WT_ZT20_13016GP02,WT_ZT20_13016GP16,WT_ZT20_13016GP23,geneCoordinate,geneSymbol
0,gene:ENSMUSG00000000001,1226,1278,1301,1214,1252,1110,1316,1327,1329,...,1507,1535,2328,1689,1923,1798,1785,2015,chr3:107910198-107949064,Gnai3
1,gene:ENSMUSG00000000028,29,18,27,35,15,25,14,18,20,...,29,12,21,14,17,9,13,20,chr16:18780540-18812080,Cdc45
2,gene:ENSMUSG00000000031,2,5,5,0,4,6,2,2,6,...,21,7,7,1,0,2,1,4,chr7:149761434-149764048,H19
3,gene:ENSMUSG00000000037,0,0,1,2,0,2,0,0,0,...,0,0,0,0,0,3,0,3,chrX:157555125-157696145,Scml2
4,gene:ENSMUSG00000000049,41786,44855,41793,42850,49405,43384,38928,48644,42329,...,26476,30069,14662,35177,13556,48475,32005,30908,chr11:108204668-108275710,Apoh
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21649,gene:ENSMUSG00000093220,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,chr4:115193323-115193575,AL627182.1
21650,gene:ENSMUSG00000093244,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,chr11:108010923-108011153,AL645963.1
21651,gene:ENSMUSG00000093316,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,chr9:65181085-65181318,AC110235.1
21652,gene:ENSMUSG00000093333,0,3,0,0,2,0,0,1,0,...,0,0,3,0,1,0,1,1,chr1:72301582-72301772,AC120136.3
