# Code for sctype class (10-15-2020)
- initializing the sctype class imports 10Xv2 data with genes.tsv, barcodes.tsv, and matrix.mtx files
- much provide path to the directory with these three files.
- Need to import cell type annotations manually with pd.read_csv()


In [41]:
import pandas as pd
import scipy.io as io
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# path = path to the 10Xv2 data directory containing genes.tsv, barcodes.tsv, and matrix.mtx files
# Note: need to import cell type annotations file manually since annotation files do not 
# have any set format between datasets. After initializing the sctype object and splitting 
# train/test data, split the annotation file based on the barcodes from the train/test data.

class sctype:
    def __init__(self, path):
        genes = pd.read_csv(path + 'genes.tsv', sep = '\t', header = None)
        barcodes = pd.read_csv(path + 'barcodes.tsv', sep = '\t', header = None)
        expression = io.mmread(path + 'matrix.mtx') 
        self.data = pd.DataFrame.sparse.from_spmatrix(data = expression, index = genes, columns = barcodes)

    def split(self, test_size, random_state):
        self.train_data, self.test_data = train_test_split(
            self.data, test_size=test_size, random_state=random_state)
        
    def svm(self, annotations, kern='rbf'):
        self.svmfit = SVC(kernel=kern).fit(self.data, annotations)
        return self.svmfit
    

Define paths for Zheng 68k PBMC dataset/annotations, and import annotations:

In [56]:
path_data = '/Users/leealj/py_projects/biof509_final/zheng68k/filtered_matrices_mex/hg19/'
path_anno = '/Users/leealj/py_projects/biof509_final/zheng68k/'
labels = pd.read_csv(path_anno + '68k_pbmc_barcodes_annotation.tsv', sep = '\t').loc[:,["barcodes","celltype"]]
labels.set_index("barcodes", inplace = True)
labels

Unnamed: 0_level_0,celltype
barcodes,Unnamed: 1_level_1
AAACATACACCCAA-1,CD8+ Cytotoxic T
AAACATACCCCTCA-1,CD8+/CD45RA+ Naive Cytotoxic
AAACATACCGGAGA-1,CD4+/CD45RO+ Memory
AAACATACTAACCG-1,CD19+ B
AAACATACTCTTCA-1,CD4+/CD25 T Reg
...,...
TTTGCATGAGCCTA-8,CD8+ Cytotoxic T
TTTGCATGCTAGCA-8,CD8+/CD45RA+ Naive Cytotoxic
TTTGCATGCTGCAA-8,CD8+ Cytotoxic T
TTTGCATGGCTCCT-8,CD8+ Cytotoxic T


Create the sctype class and check data

In [44]:
zheng = sctype(path_data)
zheng.data.head()

Unnamed: 0,"(AAACATACACCCAA-1,)","(AAACATACCCCTCA-1,)","(AAACATACCGGAGA-1,)","(AAACATACTAACCG-1,)","(AAACATACTCTTCA-1,)","(AAACATACTGGATC-1,)","(AAACATACTGTCTT-1,)","(AAACATACTTCTAC-1,)","(AAACATTGCTGCTC-1,)","(AAACATTGCTTCGC-1,)",...,"(TTTGACTGCTTTAC-8,)","(TTTGACTGTATCGG-8,)","(TTTGACTGTCGTTT-8,)","(TTTGACTGTGCTAG-8,)","(TTTGCATGACACCA-8,)","(TTTGCATGAGCCTA-8,)","(TTTGCATGCTAGCA-8,)","(TTTGCATGCTGCAA-8,)","(TTTGCATGGCTCCT-8,)","(TTTGCATGTGGTAC-8,)"
"(ENSG00000243485, MIR1302-10)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(ENSG00000237613, FAM138A)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(ENSG00000186092, OR4F5)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(ENSG00000238009, RP11-34P13.7)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(ENSG00000239945, RP11-34P13.8)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
zheng.split(test_size = 0.25, random_state = 40)

In [59]:
train_cells = zheng.train_data.columns
train_cells

Index([('AAACATACACCCAA-1',), ('AAACATACCCCTCA-1',), ('AAACATACCGGAGA-1',),
       ('AAACATACTAACCG-1',), ('AAACATACTCTTCA-1',), ('AAACATACTGGATC-1',),
       ('AAACATACTGTCTT-1',), ('AAACATACTTCTAC-1',), ('AAACATTGCTGCTC-1',),
       ('AAACATTGCTTCGC-1',),
       ...
       ('TTTGACTGCTTTAC-8',), ('TTTGACTGTATCGG-8',), ('TTTGACTGTCGTTT-8',),
       ('TTTGACTGTGCTAG-8',), ('TTTGCATGACACCA-8',), ('TTTGCATGAGCCTA-8',),
       ('TTTGCATGCTAGCA-8',), ('TTTGCATGCTGCAA-8',), ('TTTGCATGGCTCCT-8',),
       ('TTTGCATGTGGTAC-8',)],
      dtype='object', length=68579)

In [66]:
zheng.data.columns[1]

('AAACATACCCCTCA-1',)