# Code for sctype class 
### (10-15-2020)
- Initializing the sctype class imports 10X v2 data from the genes.tsv, barcodes.tsv, and matrix.mtx files.
- You must provide the path to the directory containing these three files.
- Cell type annotations need to be imported manually with pd.read_csv().

### (10-19-2020)
- SVM implemented with LinearSVC 
    - SVC was giving me "AttributeError: var not found"
    - Not sure how to resolve this error, but the documentation recommends LinearSVC for large datasets anyway
- changed the output of train_test_split() to split all train/test data as well as labels (4 outputs)

### (10-20-2020)
- Added CalibratedClassifierCV() wrapper to model to calibrate probabilities
- Use predict_proba() method to get probabilities for each class

In [6]:
import os

import numpy as np
import pandas as pd
import scipy.io as io
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score


# path = path to the 10Xv2 data directory containing genes.tsv, barcodes.tsv, and matrix.mtx files
# Note: need to import cell type annotations file manually since annotation files do not 
# have any set format between datasets. Set index names to cell barcode IDs in the annotation dataframe
# and have the celltype annotations as the first column.

class sctype:
    """Cell-type classification helper for 10X v2 single-cell expression data.

    Loads a 10X v2 matrix directory into a cells-by-genes sparse DataFrame and
    offers helpers to split it into train/test partitions and to fit a
    probability-calibrated linear SVM on the training partition.

    Attributes:
        genes: second column of genes.tsv; used as the column labels of ``data``.
        barcodes: first column of barcodes.tsv; used as the row labels of ``data``.
        data: sparse DataFrame with one row per barcode and one column per gene.
    """

    def __init__(self, path):
        """Read genes.tsv, barcodes.tsv and matrix.mtx from ``path``.

        Args:
            path: directory holding the three 10X v2 files. A trailing path
                separator is optional, and path-like objects are accepted.
        """
        # os.path.join keeps working whether or not ``path`` ends with a
        # separator; plain string concatenation silently built a wrong file
        # name when the separator was missing.
        self.genes = pd.read_csv(
            os.path.join(path, 'genes.tsv'), sep='\t', header=None).iloc[:, 1]
        self.barcodes = pd.read_csv(
            os.path.join(path, 'barcodes.tsv'), sep='\t', header=None).iloc[:, 0]
        expression = io.mmread(os.path.join(path, 'matrix.mtx'))
        # matrix.mtx is laid out genes x cells; build it that way and then
        # transpose so that every row of ``self.data`` is one cell.
        data = pd.DataFrame.sparse.from_spmatrix(
            data=expression, index=self.genes, columns=self.barcodes)
        self.data = data.transpose()

    def split(self, labels, test_size, random_state):
        """Split ``self.data`` and ``labels`` into train and test partitions.

        The result is stored on the instance as ``train_data``, ``test_data``,
        ``train_labels`` and ``test_labels``; nothing is returned.

        Args:
            labels: DataFrame with cell barcode IDs as its index and the
                cell-type annotations in its first column. Rows are paired
                with ``self.data`` by position, so they must be in the same
                order as the barcodes.
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.
        """
        self.train_data, self.test_data, self.train_labels, self.test_labels = train_test_split(
            self.data, labels, test_size=test_size, random_state=random_state)

    def svm(self, iterations=1000, cv=4, method='sigmoid'):
        """Fit a calibrated linear SVM on the training partition.

        ``split`` must have been called first so that ``train_data`` and
        ``train_labels`` exist.

        Args:
            iterations: ``max_iter`` passed to ``LinearSVC``.
            cv: ``cv`` passed to ``CalibratedClassifierCV``.
            method: calibration ``method`` passed to ``CalibratedClassifierCV``.

        Returns:
            The fitted ``CalibratedClassifierCV``, also stored as ``self.svmfit``.
        """
        clf = LinearSVC(max_iter=iterations)
        # Wrapping the SVM in a calibrator is what makes predict_proba() available.
        clf = CalibratedClassifierCV(clf, cv=cv, method=method)
        # The annotations live in the first column of the labels DataFrame.
        self.svmfit = clf.fit(self.train_data, self.train_labels.iloc[:, 0])
        return self.svmfit

Define paths for Zheng 68k PBMC dataset/annotations, and import annotations:

In [2]:
# Locations of the Zheng 68k PBMC expression matrix and of its annotation file.
path_data = '/Users/leealj/py_projects/biof509_final/zheng68k/filtered_matrices_mex/hg19/'
path_anno = '/Users/leealj/py_projects/biof509_final/zheng68k/'

# Read the annotation table, keep only the barcode and cell-type columns,
# and index the labels by barcode.
labels = pd.read_csv(path_anno + '68k_pbmc_barcodes_annotation.tsv', sep='\t')
labels = labels[["barcodes", "celltype"]].set_index("barcodes")

Create the sctype instance and check the data:

In [3]:
# Load the dataset; the constructor reads genes.tsv, barcodes.tsv and
# matrix.mtx from path_data.
zheng = sctype(path_data)
# Preview the first rows of the cells-by-genes table.
zheng.data.head()

Unnamed: 0_level_0,MIR1302-10,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,AL627309.1,RP11-34P13.14,RP11-34P13.9,AP006222.2,RP4-669L17.10,...,BX072566.1,AL354822.1,KIR2DL2,AL590523.1,CT476828.1,AC145205.1,BAGE5,CU459201.1,AC002321.2,AC002321.1
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACATACACCCAA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACCCCTCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACCGGAGA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACTAACCG-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAACATACTCTTCA-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Hold out 33% of the cells as the test partition; random_state is fixed so
# the same split is produced on every run.
zheng.split(labels = labels, test_size = 0.33, random_state = 40)

In [7]:
# Fit the calibrated linear SVM with the method's default settings.
model = zheng.svm()



In [8]:
# Predict cell types for both partitions with the fitted model.
predictions_train, predictions_test = (
    model.predict(cells) for cells in (zheng.train_data, zheng.test_data)
)

In [9]:
# Accuracy is the fraction of cells whose predicted label equals the
# annotation stored in the first column of the label frame.
print("The prediction accuracy on the train set is: {}".format(
    (predictions_train == zheng.train_labels.iloc[:, 0]).mean()))
print("The prediction accuracy on the test set is: {}".format(
    (predictions_test == zheng.test_labels.iloc[:, 0]).mean()))

The prediction accuracy on the train set is: 0.9136178640607656
The prediction accuracy on the test set is: 0.7101891127606929


In [None]:
# Use cross validation
# cv_scores = []
# for model, params in cv_svms:
#     cv_scores.append(cv_demo.cross_val(model, cv_demo.data, 5))

In [12]:
# Earlier approach, kept for reference: min-max scale the raw decision values.
# probs = model.decision_function(zheng.test_data)
# probs = (probs-probs.min())/(probs.max()-probs.min())
# probs
# Current approach: per-class probabilities from the calibrated model.
probs = model.predict_proba(zheng.test_data)

In [30]:
# List the distinct cell-type labels present in the test partition.
np.unique(zheng.test_labels)

array(['CD14+ Monocyte', 'CD19+ B', 'CD34+', 'CD4+ T Helper2',
       'CD4+/CD25 T Reg', 'CD4+/CD45RA+/CD25- Naive T',
       'CD4+/CD45RO+ Memory', 'CD56+ NK', 'CD8+ Cytotoxic T',
       'CD8+/CD45RA+ Naive Cytotoxic', 'Dendritic'], dtype=object)