In [2]:
import numpy as np
import pandas as pd
import torch
import pickle
from tqdm import tqdm
import time
import scipy.sparse as sp
from preProcessing.gen_adj import generate_adj
from preProcessing.get_BA import preprocess_bilinear

# Tạo mạng Gene-Outlying Gene cho LUAD

In [None]:
# Load the inputs: LUAD outlying-gene names, pan-cancer outlying-gene names,
# the global gene list, and the PPI edge list (tab-separated, no header).
luad_outlying_gene = pd.read_csv('data/luad/LUAD_outlying gene_names.csv', index_col=0)
pan_outlying_gene = pd.read_csv('data/outlying gene_names.csv')
gene_name = pd.read_csv('data/gene_names.txt', header=None)
pp = pd.read_csv('data/luad/PP.txt', delimiter='\t', header=None)   # PPI network

### Build the PO (gene -> outlying gene) edge list
# Find the row position(s) of every LUAD outlying gene inside gene_name
# (column 1), so they can be compared with the indices used in PP.txt.
# One pass builds a name -> positions table instead of rescanning the whole
# gene list for each outlying gene.
positions_by_name = {}
for pos, name in enumerate(gene_name.iloc[:, 1]):
    if pd.notna(name):   # a missing name can never equal anything
        positions_by_name.setdefault(name, []).append(pos)

# Order follows luad_outlying_gene; a name occurring several times in
# gene_name contributes all of its positions, a name that is absent none.
o_index = [
    pos
    for o in luad_outlying_gene.iloc[:, 0]
    if pd.notna(o)
    for pos in positions_by_name.get(o, ())
]

# Genes mutated in LUAD: rows whose value in column 6 of the pan-cancer gene
# feature table is non-zero.
P_feature = pd.read_csv('data/pan-cancer/P.feat-final.csv', index_col=0).iloc[:,6].to_numpy()
m_index = [idx for idx, val in enumerate(P_feature) if val != 0]

# Vectorised membership tests over the PPI edge list (hash-based, not a list
# scan per edge).
mask0 = pp.iloc[:, 0].isin(m_index).to_numpy()   # source endpoint is a mutated gene
mask1 = pp.iloc[:, 1].isin(o_index).to_numpy()   # target endpoint is an outlying gene

# A gene-outlying gene edge is kept when (1) the gene is mutated and (2) it is
# linked to an outlying gene in the PPI network. Element-wise AND on boolean
# arrays avoids the `==` / `&` precedence trap of a scalar expression.
mm = mask0 & mask1
po = pp[mm].copy()   # explicit copy: the relabelling below must not touch pp

# Relabel the outlying-gene endpoint by its position in o_index. If the same
# gene index occurs more than once in o_index the last position wins.
outlying_position = {g: idx for idx, g in enumerate(o_index)}
po[po.columns[1]] = po.iloc[:, 1].map(outlying_position)

# Save the outlying-gene index file: one "i<TAB>i+1" line per LUAD outlying gene.
# NOTE(review): the meaning of the second column is not visible here - confirm
# against the consumer of O.txt.
with open('data/luad/O.txt', 'w') as file:
    for i in range(luad_outlying_gene.shape[0]):
        file.write(str(i) + '\t' + str(i + 1) + '\n')

po.to_csv('data/luad/PO.txt', sep='\t', header=None, index=None)   # gene / LUAD-outlying-gene edge list
print('Done: Generate PO.txt')
print('So canh: ', len(po))

# Project helper; the original note describes this step as normalising the
# gene / LUAD-outlying-gene matrix.
generate_adj('PO', 'data/luad/', 'data/luad/')
print('Done: Normalize PO')

### Features of the LUAD outlying genes
# For each LUAD outlying gene (in file order), collect every row position at
# which the same name appears in column 1 of the pan-cancer outlying-gene list.
pan_names = list(pan_outlying_gene.iloc[:, 1])
oo_index = [
    pan_pos
    for luad_name in luad_outlying_gene.iloc[:, 0]
    for pan_pos, pan_name in enumerate(pan_names)
    if luad_name == pan_name
]

# Take columns 6 and 22 of the pan-cancer outlying-gene feature table for
# those rows, relabel them '0' and '1', and renumber the rows from zero.
pan_features = pd.read_csv('data/pan-cancer/O.feat-final.csv', index_col=0)
selected = pan_features.iloc[oo_index, [6,22]]
relabelled = selected.rename(columns={'6':'0', '22':'1'})
O_feature = relabelled.reset_index(drop=True)
O_feature.to_csv('data/luad/O.feat-final.csv')
print('Done: Generate the Feature of Outlying Gene')

### Pre-processing: prepare the inputs of the Bilinear model
# Load the stored PO adjacency and binarise it: positive entries become 1,
# everything else 0.
adj1 = sp.load_npz("data/luad/PO.adj.npz").toarray()
adj1 = (adj1 > 0).astype(int)

# Zero blocks for the two diagonal positions, and the transpose for the
# mirrored off-diagonal position.
n_rows, n_cols = adj1.shape
adj2 = np.zeros((n_rows, n_rows))
adj3 = np.zeros((n_cols, n_cols))
adj4 = adj1.T

# Assemble the square block matrix
#     [[adj2, adj1],
#      [adj4, adj3]]
# one block-row at a time.
top_rows = np.hstack((adj2, adj1))
bottom_rows = np.hstack((adj4, adj3))
adj = torch.Tensor(np.vstack((top_rows, bottom_rows)))

x,y = preprocess_bilinear(adj)
# Per the original author's notes (preprocess_bilinear is not visible here):
#   x is adj plus a diagonal matrix whose diagonal entries are 1;
#   y holds the degree of each row of adj, with main-diagonal values
#   2 / (row_sum * (row_sum - 1)).
sp.save_npz("data/luad/O.adj_loop.npz", x)
sp.save_npz("data/luad/O.N_all.npz", y)
print('Done: Preprocessing for Bilinear model')

# Tạo thuộc tính cho Gene, miRNA của LUAD

In [5]:
# Build the LUAD feature tables by slicing columns out of the pan-cancer
# feature tables (presumably the LUAD-related columns - TODO confirm the
# column layout against whatever produces the pan-cancer files).
P = pd.read_csv('data/pan-cancer/P.feat-final.csv', index_col=0)
R = pd.read_csv('data/pan-cancer/R.feat-final.csv', index_col=0)

# P features: columns 6, 22, 38 followed by columns 48..63; ignore_index=True
# renumbers the resulting columns from 0.
p = pd.concat([P.iloc[:, [6,22,38]],P.iloc[:, 48:64]] , axis=1, ignore_index=True)
p.to_csv('data/luad/P.feat-final.csv')

# R features: columns 6, 22, 38 and 48, original column labels kept.
r = R.iloc[:,[6,22,38,48]]
r.to_csv('data/luad/R.feat-final.csv')

# pretrain
# check=True raises CalledProcessError when pretrain.py exits with a non-zero
# status, so a failed pre-training run cannot pass unnoticed.
import subprocess
subprocess.run(['python3', 'pretrain.py'], check=True)

100%|██████████| 2000/2000 [01:57<00:00, 16.96it/s]


CompletedProcess(args=['python3', 'pretrain.py'], returncode=0)

# Tạo Similarity Gene-Gene Graph

In [None]:
# Similarity cut-offs; one edge-list file is written per value.
threshold = [0.7, 0.8, 0.9]

# Load the gene-gene similarity matrix and relabel rows and columns
# positionally (0..n-1). The column count comes from the file itself rather
# than from a hard-coded gene count.
GG = pd.read_csv('data/Similarity_matrix.csv', index_col=0) 
GG.reset_index(drop=True, inplace=True)
GG.columns = list(range(GG.shape[1]))

# Work on the underlying array: thresholding it directly avoids building a
# stacked (row, column) Series of n*n entries for every threshold.
similarity = GG.to_numpy()

for t in threshold:
    # Positions with similarity >= t, in row-major order; NaN compares False.
    rows, cols = np.nonzero(similarity >= t)
    off_diagonal = rows != cols   # drop self-similarity entries
    rows = rows[off_diagonal].tolist()
    cols = cols[off_diagonal].tolist()

    # One "row<TAB>col" line per retained pair.
    with open(f'data/luad/GG_{t}.txt', 'w') as file:
        file.writelines(f'{row}\t{col}\n' for row, col in zip(rows, cols))

In [5]:
# Normalize the thresholded gene-gene similarity graphs, one per value in
# `threshold` (defined in the previous cell).
# NOTE(review): generate_adj is a project helper not visible here; it is called
# with the graph name and 'data/luad/' as both directory arguments - confirm
# which files it reads and writes.
for t in threshold: 
    generate_adj(f'GG_{t}', 'data/luad/', 'data/luad/')  # process the GG_{t} gene-gene graph
    print(f'Done: Normalize GG_{t}.txt')


Done: Normalize GG_0.7
Done: Normalize GG_0.8
Done: Normalize GG_0.9
