In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.decomposition
import sklearn.manifold
import scprep
import time
import os
import re

In [2]:
np.random.seed(888)
random_state = 888

mtx_dir = '/Users/6j9/projects/mouse/data/human_ad'
mtx_paths = os.listdir(mtx_dir)
mtx_paths = [os.path.join(mtx_dir, mtx_path) for mtx_path in mtx_paths if 'clean' in mtx_path]

# Check Matrix Shapes

Turns out matrices are genes x cells (rows x columns) and the gene sets are different in each matrix.

In [14]:
for mtx_path in mtx_paths:
    matrix = pd.read_csv(mtx_path, header=0, index_col=0)
    print(matrix.shape)

(11465, 339)
(16193, 4740)
(15383, 2578)
(15931, 2981)
(14778, 4541)
(15106, 2302)
(16165, 4147)
(9462, 119)
(14814, 1737)
(15812, 1971)
(14814, 854)
(14862, 3125)
(14706, 1605)
(16717, 7539)
(14635, 1359)
(11307, 635)
(16962, 4760)


# Create Union of Genes List
Union of genes is length 18524.

In [21]:
for i, mtx_path in enumerate(mtx_paths):
    if i == 0:
        gene_union = pd.read_csv(mtx_path, header=0, index_col=None, usecols=[0])
    else:
        genes = pd.read_csv(mtx_path, header=0, index_col=None, usecols=[0])
        gene_union = np.union1d(gene_union, genes)

# Fill in the missing genes in each matrix

In [75]:
out_dir = '/Users/6j9/projects/mouse/data/human_ad/zero-filled_gene_union_matrices'
for i, mtx_path in enumerate(mtx_paths):
    print(i)
    _, tail = os.path.split(mtx_path)
    out_path = os.path.join(out_dir, 'gene-union_' + tail[:-3] + 'tsv')
    mtx = pd.read_csv(mtx_path, header=0, index_col=0)
    mtx = mtx.reindex(gene_union, fill_value=0)
    mtx.to_csv(out_path, sep='\t')

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


# De log-transform and de-scale by 10000

In [15]:
for i, mtx_path in enumerate(mtx_paths):
    print(i)
    mtx = pd.read_csv(mtx_path, header=0, index_col=0)
    #print(mtx)
    #mtx = mtx ** 2
    #mtx = np.exp(mtx) - 1
    #mtx = mtx * 10000
    #print(mtx)
    break

0


In [16]:
mtx

Unnamed: 0,S13_CATGGCGTCGGGAGTA,S13_AAGGAGCAGACCCACC,S13_CTTTGCGGTTGTCTTT,S13_TACCTATGTTCGGCAC,S13_CCTATTATCGTACCGG,S13_ATGTGTGAGACGACGT,S13_GTACTTTCAGCCTATA,S13_CTGGTCTTCTCGTTTA,S13_CGTTAGATCGCCAGCA,S13_CCCATACGTCGCCATG,...,S13_TTGTAGGTCTCCTATA,S13_AGCGTCGAGCCGTCGT,S13_AACTCCATCACAGGCC,S13_GGACGTCCAGTCTTCC,S13_AAGTCTGTCATCTGCC,S13_GTGTGTGAGACGACGT,S13_TGGGTGAAGCTAACAA,S13_AACTCATCAAGGTGTG,S13_TGCAGCCCATGCCACG,S13_GGGGTGAAGCTAACAA
A1BG,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
A2M,0.000000,0.0,0.000000,0.000000,2.210128,3.149373,1.368123,3.079100,0.00000,3.461755,...,4.394449,0.0,4.635125,0.0,0.0,0.000000,0.0,0.00000,3.956653,0.0
A4GALT,0.000000,0.0,0.000000,0.000000,0.000000,1.173145,1.590044,0.831245,0.00000,1.336743,...,0.000000,0.0,0.000000,0.0,0.0,4.378773,0.0,4.12076,0.000000,0.0
AAAS,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.63507,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
AACS,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZXDC,0.000000,0.0,0.000000,0.609029,0.769902,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
ZYG11B,0.000000,0.0,0.715044,0.000000,0.769902,0.749561,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
ZYX,0.000000,0.0,0.000000,0.000000,0.000000,0.749561,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0
ZZEF1,0.000000,0.0,0.000000,0.000000,0.769902,0.000000,0.000000,0.000000,0.00000,0.000000,...,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.00000,0.000000,0.0


# Generating Fibroblast and Pericyte Barcodes

In [56]:
# Let's just try PRRX1+ cells first
out_path = '/Users/6j9/projects/mouse/data/human_ad/prrx1_bcs.txt'
all_bcs = []
for i, mtx_path in enumerate(mtx_paths):
    print(i)
    mtx = pd.read_csv(mtx_path, header=0, index_col=0)
    bcs = mtx.loc['PRRX1', :]
    bcs = bcs[bcs > 0].index.values
    for bc in bcs:
        all_bcs.append(bc)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [86]:
# COL1A2+ COL3A1+ cells for mesenchymal cells
out_path = '/Users/6j9/projects/mouse/data/human_ad/mesenchymal_bcs.txt'
all_bcs = []
for i, mtx_path in enumerate(mtx_paths):
    print(i)
    mtx = pd.read_csv(mtx_path, header=0, index_col=0)
    bcs = mtx.loc[['COL1A2', 'COL3A1'], :]
    bcs.loc['sum', :] = np.sum(bcs, axis=0)
    #print(bcs)
    bcs = bcs[bcs.loc['sum'] > 0]#.index.values
    #for bc in bcs:
        #all_bcs.append(bc)
    break

0


  # Remove the CWD from sys.path while we load stuff.


IndexingError: Unalignable boolean Series provided as indexer (index of the boolean Series and of the indexed object do not match).

In [82]:
bcs.loc['sum'] > 0

S13_CATGGCGTCGGGAGTA    False
S13_AAGGAGCAGACCCACC    False
S13_CTTTGCGGTTGTCTTT    False
S13_TACCTATGTTCGGCAC    False
S13_CCTATTATCGTACCGG    False
                        ...  
S13_GTGTGTGAGACGACGT    False
S13_TGGGTGAAGCTAACAA    False
S13_AACTCATCAAGGTGTG    False
S13_TGCAGCCCATGCCACG    False
S13_GGGGTGAAGCTAACAA    False
Name: sum, Length: 339, dtype: bool

In [62]:
# save the list of prrx1+ bcs
print(len(all_bcs))
pd.Series(all_bcs).to_csv(out_path, sep='\n', index=False)

5929


# some other stuff

In [63]:
pd.Series(all_bcs)

0       S13_GTGTTAGTCTTCATGT
1       S13_GCATACAGTCGGCTCA
2       S13_TGAGAGGCAGCTGGCT
3       S13_CCTTCCCTCCAGAGGA
4       S13_CTCTACGTCATTCACT
                ...         
5924     S4_CACATTTTCTCCTATA
5925     S4_GCACTCTAGTGGACGT
5926     S4_TGGCGCAAGGTTACCT
5927     S4_AGGCCGTGTATAATGG
5928     S4_CTAGAGTTCGCCTGTT
Length: 5929, dtype: object

In [74]:
bcs_in_matrix = np.intersect1d(mtx.columns.values, all_bcs)
mtx.loc[:, bcs_in_matrix]

Unnamed: 0,S4_AAACCTGCAAAGGCGT,S4_AAACCTGCAGCTTAAC,S4_AAACCTGGTCAGTGGA,S4_AAACGGGCATTGGCGC,S4_AAACGGGTCTCCGGTT,S4_AAACGGGTCTCTAAGG,S4_AAACGGGTCTTGCCGT,S4_AAAGATGAGATGTCGG,S4_AAAGATGGTAATCGTC,S4_AAAGCAACACTGTTAG,...,S4_TTGGAACTCGTGGTCG,S4_TTGGAACTCTGTCCGT,S4_TTGGCAACACATCCAA,S4_TTTACTGCATGCTGGC,S4_TTTACTGGTTTACTCT,S4_TTTATGCAGAATGTGT,S4_TTTATGCCACACCGCA,S4_TTTGCGCAGCCAGAAC,S4_TTTGCGCCATCGATGT,S4_TTTGCGCTCATGCTCC
A1BG,0.0,0.000000,0.000000,2.553761,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
A1BG-AS1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.000000,1.993222,2.553761,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,1.880313,0.0,0.0,0.0,0.0,0.0,0.0
A2M-AS1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
A2ML1,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11A,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
ZYG11B,0.0,2.139625,1.427813,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
ZYX,0.0,1.557750,1.427813,1.935525,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
ZZEF1,0.0,1.557750,1.427813,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,2.059829,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
mtx.index.values

array(['A1BG', 'A1BG-AS1', 'A2M', ..., 'ZYX', 'ZZEF1', 'ZZZ3'],
      dtype=object)