# Code to implement ANN for Zheng 68k dataset

(10-22-2020)
- implementing ANN with Tensorflow/Keras instead of PyTorch

In [4]:
import numpy as np
import pandas as pd
import scipy.io as io
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# path = path to the 10Xv2 data directory containing genes.tsv, barcodes.tsv, and matrix.mtx files
# Note: need to import cell type annotations file manually since annotation files do not 
# have any set format between datasets. Set index names to cell barcode IDs in the annotation dataframe
# and have the celltype annotations as the first column.

class sctype:
    def __init__(self, path):
        self.genes = pd.read_csv(path + 'genes.tsv', sep = '\t', header = None).iloc[:,1]
        self.barcodes = pd.read_csv(path + 'barcodes.tsv', sep = '\t', header = None).iloc[:,0]
        expression = io.mmread(path + 'matrix.mtx') 
        data = pd.DataFrame.sparse.from_spmatrix(data = expression, index = self.genes, columns = self.barcodes)
        self.data = data.transpose()
    
#     labels input should be a dataframe with cell barcode IDs as indices
#     and celltype annotations in the first column.
    def split(self, labels, test_size, random_state):
        train_data, test_data, self.train_labels, self.test_labels = train_test_split(
            self.data, labels, test_size=test_size, random_state=random_state)
        self.train_data = train_data.fillna(0)
        self.test_data = test_data.fillna(0)
    
    def ann_preprocess(self, data=None, target=None):
        ann_data = data
        factored_labels = pd.factorize(target.iloc[:,0])
        ann_target = to_categorical(factored_labels[0], len(factored_labels[1]))
        return ann_data, ann_target
    
    def ann(self, epochs, batch_size):
        ann_data, ann_target = self.ann_preprocess(data=self.train_data, target=self.train_labels)
        
        model = keras.Sequential()
        model.add(layers.Dense(200, activation='relu', name = 'layer1'))
        model.add(layers.Dense(11, activation='softmax', name = 'outputlayer'))
        
        model.compile(optimizer='adam',
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])
        
        model.fit(ann_data, ann_target, epochs=epochs, batch_size=batch_size)
        self.ann = model
        return self.ann

In [5]:
path_data = '/Users/leealj/py_projects/biof509_final/zheng68k/filtered_matrices_mex/hg19/'
path_anno = '/Users/leealj/py_projects/biof509_final/zheng68k/'
labels = pd.read_csv(path_anno + '68k_pbmc_barcodes_annotation.tsv', sep = '\t').loc[:,["barcodes","celltype"]]
labels.set_index("barcodes", inplace = True)

In [6]:
zheng = sctype(path_data)

In [7]:
zheng.split(labels, 0.25, random_state = 40)

In [14]:
# train the NN with the training dataset from split
zheng.ann(5, 500) 

Epoch 1/5


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.engine.sequential.Sequential at 0x7fe4fd401df0>

In [15]:
# Use test dataset from split to test the NN
test_data, test_labels = zheng.ann_preprocess(data=zheng.test_data, target=zheng.test_labels)
score = zheng.ann.evaluate(test_data, test_labels, verbose=1)
print('test loss=', score[0]) 
print('test accuracy=', score[1]) 

test loss= 5.436718463897705
test accuracy= 0.23108777403831482


In [16]:
# Use train dataset from split to test the NN
train_data, train_labels = zheng.ann_preprocess(data=zheng.train_data, target=zheng.train_labels)
score_train = zheng.ann.evaluate(train_data, train_labels, verbose=1)
print('training loss=', score_train[0]) 
print('training accuracy=', score_train[1]) 

training loss= 0.21806104481220245
training accuracy= 0.9465528726577759
