# Code for sctype class (10-15-2020)
- initializing the sctype class imports 10Xv2 data with genes.tsv, barcodes.tsv, and matrix.mtx files
- Must provide the path to the directory containing these three files.
- Need to import cell type annotations manually with pd.read_csv()


In [1]:
import os

import numpy as np
import pandas as pd
import scipy.io as io
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# path = path to the 10Xv2 data directory containing genes.tsv, barcodes.tsv, and matrix.mtx files
# Note: need to import cell type annotations file manually since annotation files do not 
# have any set format between datasets. After initializing the sctype object and splitting 
# train/test data, split the annotation file based on the barcodes from the train/test data.

class sctype:
    """Single-cell expression dataset loaded from a 10x-style matrix directory.

    The directory must contain ``genes.tsv``, ``barcodes.tsv`` and
    ``matrix.mtx``. Cell type annotations are not loaded here; read them
    separately and pass them to :meth:`svm`.

    Attributes
    ----------
    genes : pandas.Series
        Second column of ``genes.tsv`` (used as the column labels of ``data``).
    barcodes : pandas.Series
        First column of ``barcodes.tsv`` (used as the row labels of ``data``).
    data : pandas.DataFrame
        Sparse expression frame with one row per barcode and one column per gene.
    train_data, test_data : pandas.DataFrame
        Row subsets of ``data``; only present after :meth:`split` has been called.
    svmfit : sklearn.svm.SVC
        Fitted classifier; only present after :meth:`svm` has been called.
    """

    def __init__(self, path):
        """Read the three matrix files found in the directory ``path``.

        Parameters
        ----------
        path : str or os.PathLike
            Directory holding ``genes.tsv``, ``barcodes.tsv`` and ``matrix.mtx``.
            A trailing path separator is optional.
        """
        # Join path components instead of concatenating strings, so that a
        # directory given without a trailing separator still resolves correctly.
        path = os.fspath(path)
        self.genes = pd.read_csv(
            os.path.join(path, 'genes.tsv'), sep='\t', header=None).iloc[:, 1]
        self.barcodes = pd.read_csv(
            os.path.join(path, 'barcodes.tsv'), sep='\t', header=None).iloc[:, 0]
        expression = io.mmread(os.path.join(path, 'matrix.mtx'))
        # The matrix file is genes x cells. Transpose the SciPy sparse matrix
        # before building the frame: this yields the same cells x genes frame as
        # transposing the DataFrame afterwards, without the cost of a pandas
        # transpose on a very wide sparse frame.
        self.data = pd.DataFrame.sparse.from_spmatrix(
            expression.T, index=self.barcodes, columns=self.genes)

    def split(self, test_size, random_state):
        """Split ``data`` row-wise into ``train_data`` and ``test_data``.

        Parameters
        ----------
        test_size : float or int
            Passed through to ``sklearn.model_selection.train_test_split``.
        random_state : int or None
            Passed through to ``train_test_split``.
        """
        self.train_data, self.test_data = train_test_split(
            self.data, test_size=test_size, random_state=random_state)

    def svm(self, annotations, kern='rbf'):
        """Fit a support vector classifier on the expression data.

        Parameters
        ----------
        annotations : array-like, pandas.Series or pandas.DataFrame
            Cell type labels. When given as a pandas object whose index
            consists of barcodes present in ``data``, the classifier is fitted
            on exactly those rows, in the order of the annotation index. This
            allows fitting on a subset such as the labels of the training
            split. Any other input is matched positionally against all rows
            of ``data``.
        kern : str, default 'rbf'
            Kernel name passed to ``sklearn.svm.SVC``.

        Returns
        -------
        sklearn.svm.SVC
            The fitted classifier, also stored as ``self.svmfit``.
        """
        features = self.data
        labels = annotations
        if isinstance(annotations, (pd.Series, pd.DataFrame)):
            label_index = annotations.index
            # Align rows to the labels by barcode. Skipped when the indexes
            # already agree, when the data index has duplicates (label lookup
            # would be ambiguous), or when the labels are not barcode-indexed.
            if (not label_index.equals(features.index)
                    and features.index.is_unique
                    and label_index.isin(features.index).all()):
                features = features.loc[label_index]
            # A one-column frame is a label vector; hand SVC a 1-D object.
            if isinstance(annotations, pd.DataFrame) and annotations.shape[1] == 1:
                labels = annotations.iloc[:, 0]
        self.svmfit = SVC(kernel=kern).fit(features, labels)
        return self.svmfit
    

Define paths for Zheng 68k PBMC dataset/annotations, and import annotations:

In [2]:
# Locations of the Zheng 68k PBMC expression matrices and of the annotation table.
# NOTE: these are absolute, machine-specific paths.
path_data = '/Users/leealj/py_projects/biof509_final/zheng68k/filtered_matrices_mex/hg19/'
path_anno = '/Users/leealj/py_projects/biof509_final/zheng68k/'

# Keep only the barcode and cell type columns, keyed by barcode.
labels = (
    pd.read_csv(path_anno + '68k_pbmc_barcodes_annotation.tsv', sep='\t')
    [["barcodes", "celltype"]]
    .set_index("barcodes")
)
labels

Unnamed: 0_level_0,celltype
barcodes,Unnamed: 1_level_1
AAACATACACCCAA-1,CD8+ Cytotoxic T
AAACATACCCCTCA-1,CD8+/CD45RA+ Naive Cytotoxic
AAACATACCGGAGA-1,CD4+/CD45RO+ Memory
AAACATACTAACCG-1,CD19+ B
AAACATACTCTTCA-1,CD4+/CD25 T Reg
...,...
TTTGCATGAGCCTA-8,CD8+ Cytotoxic T
TTTGCATGCTAGCA-8,CD8+/CD45RA+ Naive Cytotoxic
TTTGCATGCTGCAA-8,CD8+ Cytotoxic T
TTTGCATGGCTCCT-8,CD8+ Cytotoxic T


Instantiate the sctype class on the Zheng 68k data and inspect the first rows of the expression matrix

In [3]:
# Load the expression matrix from path_data, then preview the first rows
# (one row per cell barcode, one column per gene).
zheng = sctype(path_data)
zheng.data.head()

Unnamed: 0_level_0,MIR1302-10,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,AL627309.1,RP11-34P13.14,RP11-34P13.9,AP006222.2,RP4-669L17.10,...,BX072566.1,AL354822.1,KIR2DL2,AL590523.1,CT476828.1,AC145205.1,BAGE5,CU459201.1,AC002321.2,AC002321.1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCCAA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACCCCTCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACCGGAGA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACTAACCG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACTCTTCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Hold out 25% of the cells as test data; random_state is fixed so the split
# can be reproduced.
zheng.split(test_size = 0.25, random_state = 40)

In [5]:
# Barcodes of the cells assigned to the training split (the index of train_data).
train_cells = zheng.train_data.index
train_cells

Index(['GTCGCACTCTGATG-6', 'CAAGCCCTTCTACT-2', 'CATGGATGTCTGGA-6',
       'TTCGAGGAATTTCC-6', 'ATGTTCACCTACGA-6', 'TAGGCTGATGCCCT-1',
       'TAAAGACTACGCAT-4', 'AAAGTTTGCTAGCA-8', 'ATTGATGATACTGG-6',
       'GGACAGGATTTACC-4',
       ...
       'AAATCCCTTGTGAC-8', 'GTTCAACTAAGAAC-6', 'ACTTGGGATCTTCA-2',
       'TCAGCGCTCCTTAT-1', 'GCCTGACTCAGCTA-1', 'ATTGCACTATTTCC-2',
       'CACGATGATGTAGC-7', 'GACGCCGAGGGACA-7', 'GATATATGCAGAAA-4',
       'CAAAGCTGTGAAGA-2'],
      dtype='object', name=0, length=51434)

In [6]:
# Select the annotation rows for the training barcodes, in the order of train_cells.
train_labels = labels.loc[train_cells]
train_labels

Unnamed: 0_level_0,celltype
0,Unnamed: 1_level_1
GTCGCACTCTGATG-6,CD8+ Cytotoxic T
CAAGCCCTTCTACT-2,CD56+ NK
CATGGATGTCTGGA-6,CD4+/CD25 T Reg
TTCGAGGAATTTCC-6,CD14+ Monocyte
ATGTTCACCTACGA-6,CD56+ NK
...,...
ATTGCACTATTTCC-2,CD8+ Cytotoxic T
CACGATGATGTAGC-7,CD8+ Cytotoxic T
GACGCCGAGGGACA-7,CD56+ NK
GATATATGCAGAAA-4,CD56+ NK
