In [1]:
import numpy as np
import scipy.sparse
import scipy.io

import pandas as pd
from datasets_dev import RNA_Dataset, ATAC_Dataset

In [2]:
# Root directory of the Paired-seq data. Input and output paths below are built by
# string concatenation onto this value, so most of them rely on the trailing slash.
path = '../data/Paired-seq/'

In [9]:
#RNA
# Load the raw fetal and adult RNA count matrices from Matrix Market files.
# As the shape/type cells below show, mmread returns COO sparse matrices whose
# rows match genes.tsv (genes x cells).
fetal_rna = scipy.io.mmread(path+'Fetal_Forebrain/FB_RNA/matrix.mtx')
adult_rna = scipy.io.mmread(path+'Adult_Cerebrail_Cortex/Adult_CTX_RNA/matrix.mtx')

In [10]:
# Headerless, tab-separated gene tables for each dataset. Per the printed output
# below, column 0 is the Ensembl gene ID and column 1 the gene symbol.
fetal_gene = pd.read_table(path + 'Fetal_Forebrain/FB_RNA/genes.tsv', header=None)
adult_gene = pd.read_table(path + 'Adult_Cerebrail_Cortex/Adult_CTX_RNA/genes.tsv', header=None)

In [11]:
# Show both gene tables (fetal first, then adult) to compare their sizes and ordering.
print(fetal_gene, adult_gene, sep='\n')

                        0           1
0      ENSMUSG00000000001       Gnai3
1      ENSMUSG00000000003        Pbsn
2      ENSMUSG00000000028       Cdc45
3      ENSMUSG00000000031         H19
4      ENSMUSG00000000037       Scml2
...                   ...         ...
29831  ENSMUSG00000116984  CT030713.2
29832  ENSMUSG00000116987  AC150035.3
29833  ENSMUSG00000116988  AC164314.2
29834  ENSMUSG00000116989  AC131339.4
29835  ENSMUSG00000116993  AC135964.2

[29836 rows x 2 columns]
                        0           1
0      ENSMUSG00000000001       Gnai3
1      ENSMUSG00000000003        Pbsn
2      ENSMUSG00000000028       Cdc45
3      ENSMUSG00000000031         H19
4      ENSMUSG00000000037       Scml2
...                   ...         ...
29619  ENSMUSG00000116984  CT030713.2
29620  ENSMUSG00000116987  AC150035.3
29621  ENSMUSG00000116988  AC164314.2
29622  ENSMUSG00000116989  AC131339.4
29623  ENSMUSG00000116993  AC135964.2

[29624 rows x 2 columns]


In [12]:
# Reduce each gene table to a 1-D array holding only its first column (the gene ID).
# The names are rebound from DataFrame to ndarray, so this cell cannot be re-run
# without reloading the tables first.
fetal_gene = fetal_gene.iloc[:, 0].values
adult_gene = adult_gene.iloc[:, 0].values

In [14]:
# Gene IDs present in both datasets. Only membership tests are made against this
# set in the index-building cells, so its iteration order is irrelevant.
co = set(fetal_gene).intersection(adult_gene)
print(len(co))

29589


In [15]:
# Positions of the shared genes within the fetal gene list, in fetal order.
# These become the column selector for the (cells x genes) fetal matrix.
f_id = [pos for pos, name in enumerate(fetal_gene) if name in co]
print(len(f_id))

29589


In [16]:
# Positions of the shared genes within the adult gene list, emitted in the SAME
# gene order as the fetal selection (fetal order of the shared genes).
# The fetal and adult matrices are later stacked column-by-column, so column k
# must be the same gene in both. Walking the adult list in its own order only
# guarantees that when both genes.tsv files happen to be sorted identically.
adult_pos = {}
for pos, name in enumerate(adult_gene):
    # First occurrence wins if an ID were ever repeated in the adult table.
    adult_pos.setdefault(name, pos)
a_id = [adult_pos[name] for name in fetal_gene if name in co]
print(len(a_id))

29589


In [17]:
# Report the dimensions and the sparse container type of the fetal RNA matrix.
print(fetal_rna.shape, type(fetal_rna), sep='\n')

(29836, 25845)
<class 'scipy.sparse.coo.coo_matrix'>


In [18]:
# Report the dimensions and the sparse container type of the adult RNA matrix.
print(adult_rna.shape, type(adult_rna), sep='\n')

(29624, 15191)
<class 'scipy.sparse.coo.coo_matrix'>


In [19]:
# Flip both matrices to cells x genes and convert to CSR so they can be indexed
# by column position. The names are rebound, so running this cell twice would
# transpose the matrices back again.
fetal_rna = fetal_rna.T.tocsr()
adult_rna = adult_rna.T.tocsr()

In [20]:
# Confirm the orientation after the transpose: (cells, genes) for fetal, then adult.
print(fetal_rna.shape, adult_rna.shape, sep='\n')

(25845, 29836)
(15191, 29624)


In [21]:
# Keep only the shared-gene columns of the fetal matrix (cells x genes at this point).
# fetal_rna is overwritten, so re-running this cell would index the already-filtered matrix.
fetal_rna = fetal_rna[:,f_id]

In [22]:
# Keep only the shared-gene columns of the adult matrix (cells x genes at this point).
# adult_rna is overwritten, so re-running this cell would index the already-filtered matrix.
adult_rna = adult_rna[:,a_id]

In [23]:
#adult_rna = np.hstack([adult_rna, zero])

In [24]:
# After filtering, both matrices should report the same number of gene columns.
print(fetal_rna.shape, adult_rna.shape, sep='\n')

(25845, 29589)
(15191, 29589)


In [27]:
# Write the gene-filtered matrices back out per dataset, restored to genes x cells
# orientation with integer counts.
# `path` already ends with '/', so the sub-directory is appended without a leading
# slash (avoids the '//' the previous form produced; consistent with the other paths).
scipy.io.mmwrite(path + 'processed_fetal/RNA-seq/matrix.mtx', fetal_rna.transpose().astype('int'))
scipy.io.mmwrite(path + 'processed_adult/RNA-seq/matrix.mtx', adult_rna.transpose().astype('int'))

In [25]:
# Stack fetal cells above adult cells (both cells x genes), then flip the result
# to genes x cells and cast the counts to integers.
rna = scipy.sparse.vstack((fetal_rna, adult_rna)).T.astype('int')

In [26]:
# Sanity check: expect (number of shared genes, fetal cells + adult cells).
rna.shape

(29589, 41036)

In [None]:
# Output directory for the combined (fetal + adult) RNA files.
# NOTE: the ATAC section further down rebinds `savepath` to its own directory.
savepath = path + 'combined/RNA-seq'

In [None]:
# Persist the combined genes x cells RNA matrix in Matrix Market format.
scipy.io.mmwrite(savepath + '/matrix.mtx', rna)

In [None]:
#save genes
# Re-read the fetal gene table and keep the rows selected by f_id, so the saved
# genes.tsv lists the shared genes in the fetal order.
# NOTE: f_id and savepath are rebound in the ATAC section; run this cell only
# while they still hold the RNA values.
gene = pd.read_table(path + 'Fetal_Forebrain/FB_RNA/genes.tsv', header=None).iloc[f_id, :]
print(gene)
gene.to_csv(savepath + '/genes.tsv', sep='\t', header=False, index=False)

In [None]:
#process barcode
# Read the headerless barcode tables that accompany each RNA matrix.
fetal_barcode = pd.read_table(path+'Fetal_Forebrain/FB_RNA/barcodes.tsv', header=None)
adult_barcode = pd.read_table(path+'Adult_Cerebrail_Cortex/Adult_CTX_RNA/barcodes.tsv', header=None)

In [None]:
# Fetal barcodes first, then adult: the same order in which the count matrices are stacked.
barcode = pd.concat([fetal_barcode, adult_barcode], axis=0)

In [None]:
# Display the concatenated barcode table for a visual check.
barcode

In [None]:
# Write the combined barcodes next to the combined matrix (tab-separated, no header, no index column).
barcode.to_csv(savepath + '/barcodes.tsv', sep='\t' ,header=False, index=False)

In [28]:
#ATAC
# Load the raw fetal and adult ATAC (FB_DNA / Adult_CTX_DNA) matrices from Matrix Market files.
fetal_atac = scipy.io.mmread(path+'Fetal_Forebrain/FB_DNA/matrix.mtx')
adult_atac = scipy.io.mmread(path+'Adult_Cerebrail_Cortex/Adult_CTX_DNA/matrix.mtx')

In [29]:
# Feature tables for the ATAC matrices. The files are still called genes.tsv, but per
# the printed output below the rows are genomic regions ("chr:start-end").
# NOTE: this rebinds fetal_gene / adult_gene, which held the RNA gene IDs earlier.
fetal_gene = pd.read_table(path + 'Fetal_Forebrain/FB_DNA/genes.tsv', header=None)
adult_gene = pd.read_table(path + 'Adult_Cerebrail_Cortex/Adult_CTX_DNA/genes.tsv', header=None)

In [30]:
# Show both region tables (fetal first, then adult).
print(fetal_gene, adult_gene, sep='\n')

                                 0                          1
0          chr10:10000000-10001000    chr10:10000000-10001000
1        chr10:100000000-100001000  chr10:100000000-100001000
2        chr10:100001000-100002000  chr10:100001000-100002000
3        chr10:100002000-100003000  chr10:100002000-100003000
4        chr10:100003000-100004000  chr10:100003000-100004000
...                            ...                        ...
2637310       chrY:9995000-9996000       chrY:9995000-9996000
2637311       chrY:9996000-9997000       chrY:9996000-9997000
2637312       chrY:9997000-9998000       chrY:9997000-9998000
2637313       chrY:9998000-9999000       chrY:9998000-9999000
2637314      chrY:9999000-10000000      chrY:9999000-10000000

[2637315 rows x 2 columns]
                                 0                          1
0          chr10:10000000-10001000    chr10:10000000-10001000
1        chr10:100000000-100001000  chr10:100000000-100001000
2        chr10:100002000-100003000  chr10:

In [31]:
# Count the fetal rows whose two columns hold the same region string; a count equal
# to the table length means column 1 is redundant.
# The comparison is summed vectorised (the old builtin sum() iterated every element
# in Python) and wrapped in int() so the cell still displays a plain integer.
int((fetal_gene.iloc[:, 0] == fetal_gene.iloc[:, 1]).sum())

2637315

In [32]:
# Count the adult rows whose two columns hold the same region string.
# Summed vectorised instead of with the builtin sum(); int() keeps the displayed
# value a plain integer.
int((adult_gene.iloc[:, 0] == adult_gene.iloc[:, 1]).sum())

2614863

In [33]:
# Reduce each region table to a 1-D array holding only its first column.
# The names are rebound from DataFrame to ndarray, so this cell cannot be re-run
# without reloading the tables first.
fetal_gene = fetal_gene.iloc[:, 0].values
adult_gene = adult_gene.iloc[:, 0].values

In [34]:
# Regions present in both datasets. Only membership tests are made against this
# set in the index-building cells, so its iteration order is irrelevant.
co = set(fetal_gene).intersection(adult_gene)
print(len(co))

2613805


In [35]:
# Positions of the shared regions within the fetal region list, in fetal order.
# NOTE: rebinds f_id, which held the RNA gene positions earlier.
f_id = [pos for pos, name in enumerate(fetal_gene) if name in co]
print(len(f_id))

2613805


In [36]:
# Positions of the shared regions within the adult region list, emitted in the SAME
# region order as the fetal selection (fetal order of the shared regions).
# The fetal and adult matrices are later stacked column-by-column, so column k
# must be the same region in both. Walking the adult list in its own order only
# guarantees that when both feature files happen to be sorted identically.
adult_pos = {}
for pos, name in enumerate(adult_gene):
    # First occurrence wins if a region were ever repeated in the adult table.
    adult_pos.setdefault(name, pos)
a_id = [adult_pos[name] for name in fetal_gene if name in co]
print(len(a_id))

2613805


In [37]:
# Flip both matrices to cells x regions and convert to CSR so they can be indexed
# by column position. The names are rebound, so running this cell twice would
# transpose the matrices back again.
fetal_atac = fetal_atac.T.tocsr()
adult_atac = adult_atac.T.tocsr()

In [38]:
# Confirm the orientation after the transpose: (cells, regions) for fetal, then adult.
print(fetal_atac.shape, adult_atac.shape, sep='\n')

(25845, 2637315)
(15191, 2614863)


In [39]:
# Keep only the shared-region columns of each matrix (cells x regions at this point).
# Both names are overwritten, so re-running this cell would index the already-filtered matrices.
fetal_atac = fetal_atac[:,f_id]
adult_atac = adult_atac[:,a_id]

In [40]:
# Write the region-filtered matrices back out per dataset, restored to regions x cells
# orientation with integer counts.
# `path` already ends with '/', so the sub-directory is appended without a leading
# slash (avoids the '//' the previous form produced; consistent with the other paths).
scipy.io.mmwrite(path + 'processed_fetal/ATAC-seq/matrix.mtx', fetal_atac.transpose().astype('int'))
scipy.io.mmwrite(path + 'processed_adult/ATAC-seq/matrix.mtx', adult_atac.transpose().astype('int'))

In [None]:
# Stack fetal cells above adult cells (both cells x regions), then flip the result
# to regions x cells and cast the counts to integers.
atac = scipy.sparse.vstack((fetal_atac, adult_atac)).T.astype('int')

In [None]:
# Sanity check: expect (number of shared regions, fetal cells + adult cells).
atac.shape

In [None]:
# Output directory for the combined (fetal + adult) ATAC files; rebinds the
# `savepath` used by the RNA section. The combined regions x cells matrix is
# then written there in Matrix Market format.
savepath = path + 'combined/ATAC-seq'
scipy.io.mmwrite(savepath + '/matrix.mtx', atac)

In [None]:
#save genes
# Re-read the fetal region table and keep the rows selected by f_id, so the saved
# peaks.tsv lists the shared regions in the fetal order.
# NOTE: relies on f_id and savepath currently holding the ATAC values.
gene = pd.read_table(path + 'Fetal_Forebrain/FB_DNA/genes.tsv', header=None).iloc[f_id, :]
print(gene)
gene.to_csv(savepath + '/peaks.tsv', sep='\t', header=False, index=False)

In [None]:
#process barcode
# Read the headerless barcode tables that accompany each ATAC matrix.
fetal_barcode = pd.read_table(path+'Fetal_Forebrain/FB_DNA/barcodes.tsv', header=None)
adult_barcode = pd.read_table(path+'Adult_Cerebrail_Cortex/Adult_CTX_DNA/barcodes.tsv', header=None)

# Fetal barcodes first, then adult: the same order in which the matrices are stacked.
barcode = pd.concat([fetal_barcode, adult_barcode], axis=0)

In [None]:
# Display the concatenated barcode table for a visual check.
barcode

In [None]:
# Write the combined barcodes next to the combined ATAC matrix (tab-separated, no header, no index column).
barcode.to_csv(savepath + '/barcodes.tsv', sep='\t', header=False, index=False)

In [3]:
#Create toy dataset
import torch

In [138]:
# Input directories for the dataset objects built below. The active pair points at the
# processed adult data; the commented-out pair switches to the combined data instead.
#rna_path = '../data/Paired-seq/combined/RNA-seq'
#atac_path = '../data/Paired-seq/combined/ATAC-seq'
rna_path = '../data/Paired-seq/processed_adult/RNA-seq'
atac_path = '../data/Paired-seq/processed_adult/ATAC-seq'

In [139]:
# Build the RNA dataset from rna_path. The min_reads / min_cells thresholds are
# interpreted by RNA_Dataset in datasets_dev.
r_dataset = RNA_Dataset(
    rna_path,
    min_reads=150,
    min_cells=50,
)

Loading  data ...
Original data contains 15191 cells x 29589 peaks
Finished loading takes 0.11 min


In [None]:
# Build the ATAC dataset from atac_path. The low / high / min_peaks thresholds are
# interpreted by ATAC_Dataset in datasets_dev.
a_dataset = ATAC_Dataset(
    atac_path,
    low=0.002,
    high=1,
    min_peaks=200,
)

Loading  data ...


In [None]:
# Print, one per line: RNA data shape, ATAC data shape, RNA indices, ATAC indices.
print(
    r_dataset.data.shape,
    a_dataset.data.shape,
    r_dataset.indices,
    a_dataset.indices,
    sep='\n',
)

In [None]:
# Entries of `.indices` that appear in BOTH datasets (presumably the cells each
# dataset kept after its own filtering; confirm in datasets_dev).
# NOTE(review): the order of list(set(...)) is not specified, yet it fixes the row
# order of both datasets in the cells below. The positions in keep_id.csv presumably
# refer to that order, so do not reorder here without regenerating that file.
co_indice = set(r_dataset.indices) & set(a_dataset.indices)
co_indice = list(co_indice)

In [None]:
import torch
from torch.utils.data import Subset

In [None]:
# Row position inside r_dataset.data for every shared index, in co_indice order.
# A lookup table is built once instead of calling list(...).index(i) per element,
# which rebuilt and scanned the whole list each time (quadratic in the cell count).
# setdefault keeps the FIRST position of a value, matching list.index semantics.
r_row_of = {}
for row, cell_index in enumerate(r_dataset.indices):
    r_row_of.setdefault(cell_index, row)
r_indice = [r_row_of[i] for i in co_indice]

In [None]:
# Reorder/subset the RNA rows to the shared cells, in co_indice order.
# Only `.data` is replaced; `r_dataset.indices` is left untouched, so this cell
# is not safe to run twice.
r_dataset.data = r_dataset.data[r_indice,:]

In [None]:
# Row position inside a_dataset.data for every shared index, in co_indice order.
# A lookup table is built once instead of calling list(...).index(i) per element,
# which rebuilt and scanned the whole list each time (quadratic in the cell count).
# setdefault keeps the FIRST position of a value, matching list.index semantics.
a_row_of = {}
for row, cell_index in enumerate(a_dataset.indices):
    a_row_of.setdefault(cell_index, row)
a_indice = [a_row_of[i] for i in co_indice]

In [None]:
# Reorder/subset the ATAC rows to the shared cells, in co_indice order, so row k
# of both datasets refers to the same cell.
# Only `.data` is replaced; `a_dataset.indices` is left untouched, so this cell
# is not safe to run twice.
a_dataset.data = a_dataset.data[a_indice,:]

In [None]:
# Checkpoint the paired RNA dataset before the outlier-removal step below.
# torch.save pickles the object; the ".rar" suffix is only a file name, not an archive format.
torch.save(r_dataset,  '../data/Paired-seq/processed_adult/r_dataset_.rar') 

In [None]:
# Checkpoint the paired ATAC dataset before the outlier-removal step below.
# torch.save pickles the object; the ".rar" suffix is only a file name, not an archive format.
torch.save(a_dataset,  '../data/Paired-seq/processed_adult/a_dataset_.rar') 

In [None]:
# Shape of the RNA data after pairing; the row count should match the ATAC data.
r_dataset.data.shape

In [None]:
# Shape of the ATAC data after pairing; the row count should match the RNA data.
a_dataset.data.shape

In [None]:
#Remove outliers
# keep_id.csv is read without header or index column and flattened to a 1-D array;
# the values are used below as row selectors on both datasets.
keep_table = pd.read_csv(
    '../data/Paired-seq/processed_adult/keep_id.csv',
    header=None,
    index_col=None,
)
keep = np.squeeze(keep_table.values)
len(keep)

In [None]:
# Keep only the rows listed in `keep`. This mutates r_dataset in place, so
# re-running the cell would apply the selection a second time.
r_dataset.data = r_dataset.data[keep,:]
r_dataset.data.shape

In [None]:
# Apply the same row selection to the ATAC data so both datasets stay paired row-for-row.
# This mutates a_dataset in place, so re-running the cell would apply the selection a second time.
a_dataset.data = a_dataset.data[keep,:]
a_dataset.data.shape

In [None]:
# Save the outlier-filtered datasets for the processed adult data
# (no trailing underscore in the file names, unlike the pre-filter checkpoints).
torch.save(r_dataset,  '../data/Paired-seq/processed_adult/r_dataset.rar') 
torch.save(a_dataset,  '../data/Paired-seq/processed_adult/a_dataset.rar') 

In [None]:
# Output directory for the dataset objects of the combined data.
# Distinct from `savepath` (no underscore) used in the matrix-export sections.
save_path = '../data/Paired-seq/combined/'

In [None]:
# NOTE(review): this saves whatever r_dataset currently holds. As written above it is
# built from the processed_adult paths; presumably the path cell is switched to the
# commented-out combined paths before running this. Confirm before use.
torch.save(r_dataset, save_path + 'r_dataset.rar') #combined

In [None]:
# Reload the file just written as a round-trip check; the result is only displayed,
# not assigned. torch.load unpickles the object, so only use it on trusted files.
torch.load(save_path + 'r_dataset.rar') #combined

In [None]:
# NOTE(review): this saves whatever a_dataset currently holds. As written above it is
# built from the processed_adult paths; presumably the path cell is switched to the
# commented-out combined paths before running this. Confirm before use.
torch.save(a_dataset, save_path+'a_dataset.rar') #combined

In [None]:
# Reload the file just written as a round-trip check; the result is only displayed,
# not assigned. torch.load unpickles the object, so only use it on trusted files.
torch.load(save_path + 'a_dataset.rar') #combined