In [1]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
from google.colab import drive 
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
pip install scanpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scanpy
  Downloading scanpy-1.9.1-py3-none-any.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 5.5 MB/s 
Collecting session-info
  Downloading session_info-1.0.0.tar.gz (24 kB)
Collecting umap-learn>=0.3.10
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 7.8 MB/s 
Collecting anndata>=0.7.4
  Downloading anndata-0.8.0-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 5.8 MB/s 
Collecting matplotlib>=3.4
  Downloading matplotlib-3.5.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (11.2 MB)
[K     |████████████████████████████████| 11.2 MB 45.0 MB/s 
Collecting fonttools>=4.22.0
  Downloading fonttools-4.37.4-py3-none-any.whl (960 kB)
[K     |████████████████████████████████| 960 kB 51.3 MB/s 
Collecting pynndescent>=0.5
  Downloading pynndescent-0.5.7.tar.gz (1.1 MB)
[K     |█████████

In [3]:
import os
import numpy as np
import pandas as pd
import scipy
from scipy.sparse import issparse
import scanpy as sc


from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

In [4]:
# Read data
# Location of the Leukemia .h5ad file on the mounted Google Drive.
LEUKEMIA_H5AD = '/content/drive/MyDrive/USYD/2021 S1/INFO4001/2022-re/Leukemia.h5ad'
adata = sc.read_h5ad(LEUKEMIA_H5AD)

# Keep the data matrix in CSR sparse form; convert only when it was loaded dense.
if not issparse(adata.X):
    adata.X = scipy.sparse.csr_matrix(adata.X)

# Make the variable (column) names unique.
adata.var_names_make_unique()


In [5]:
# Preprocessing: binarize the matrix, then filter cells and genes of `adata` in place.

print('Raw dataset shape: {}'.format(adata.shape))

# Defensive re-check so this cell also works when X is still dense at this point.
if not issparse(adata.X):
    adata.X = scipy.sparse.csr_matrix(adata.X)

# Binarize: every entry larger than 1 is set to 1.
adata.X[adata.X > 1] = 1

# Filtering parameters
#   min_cells  - filter low quality peaks by valid cells number, default 0.01
#                (a value below 1 is treated as a fraction of the cell count)
#   min_genes  - filter low quality cells by valid peaks number, default 100
#   n_features - filter peaks by selecting highly variable features, default 100,000
min_cells = 0.01
min_genes = 100
n_features = 100000

# Filtering cells
sc.pp.filter_cells(adata, min_genes=min_genes)

# Filtering genes
# A fractional threshold is resolved against the number of cells that remain
# after the cell filter above.
if min_cells < 1:
    min_cells = min_cells * adata.shape[0]
sc.pp.filter_genes(adata, min_cells=min_cells)

# Keep the top `n_features` highly variable features. `subset=True` needs the
# in-place mode, so `inplace` is left at its default instead of being passed as
# False, which contradicts `subset=True`.
# NOTE(review): n_features may exceed the number of columns (7537 in the recorded
# run), in which case this step presumably keeps every feature - confirm.
sc.pp.highly_variable_genes(adata, n_top_genes=n_features, subset=True)

print('Processed dataset shape: {}'.format(adata.shape))

Raw dataset shape: (391, 7602)
Processed dataset shape: (352, 7537)


In [6]:
# Export adata's matrix via to_df(); later cells read the row labels from
# df.index and the values from df.to_numpy().
df = adata.to_df()

In [7]:
# processing y
# Each row label becomes its class label by dropping everything from the last
# underscore onward (a label without an underscore becomes the empty string).
# A fresh array is built rather than editing index.to_numpy() in place: that
# array can share memory with the index, so in-place edits would rename the
# DataFrame's rows and make this cell give different results when re-run.
index = df.index
mt_y = np.array(["_".join(name.split("_")[:-1]) for name in index], dtype=object)

In [8]:
# split data to train and test
mt = df.to_numpy()

# Shuffled 80/20 split, stratified on the class labels, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    mt,
    mt_y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=mt_y,
)

# Report the split shapes, then the distinct class labels and how many there are.
unique_classes = np.unique(mt_y)
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))
print(unique_classes)
print(unique_classes.size)

(281, 7537) (71, 7537) (281,) (71,)
['BM1077_LMPP_Frozen_160107' 'singles_PB1022_mono_160128'
 'singles_SU070_140806_Leuk' 'singles_SU070_LSC_141210'
 'singles_SU353_Blast_160205' 'singles_SU353_LSC_160209']
6


In [9]:
label = unique_classes.tolist()

# Map every class name to its position in `label`, giving one dictionary lookup
# per sample. A name missing from `label` raises KeyError rather than being
# skipped, since a skipped sample would leave the encoded labels shorter than,
# and misaligned with, the feature rows.
label_to_index = {name: position for position, name in enumerate(label)}

# Encoded targets for both splits. float64 is kept so later cells see the same
# dtype as before.
y_new_train = np.array([label_to_index[name] for name in y_train], dtype='float64')
y_new_test = np.array([label_to_index[name] for name in y_test], dtype='float64')

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with trees capped at depth 2 and a fixed seed.
clf = RandomForestClassifier(random_state=0, max_depth=2)
# Fit on the training split; as the cell's last expression, the result is displayed.
clf.fit(X=x_train, y=y_new_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [11]:
# Predict labels for the test features with the fitted classifier.
y_pred = clf.predict(x_test)

In [12]:
from sklearn.metrics import accuracy_score, f1_score

# Accuracy of the predictions against the encoded test labels.
out_acc = accuracy_score(y_new_test, y_pred)
print("Raw data Accuracy: {}".format(round(out_acc, 4)))

# F1 under each averaging scheme, printed in the order micro, macro, weighted.
f1_reports = (
    ('micro', "Raw data F1 Score(micro): {}"),
    ('macro', "Raw data F1 Score(macro): {}"),
    ('weighted', "Raw data F1 Score(weighted): {}"),
)
for averaging, template in f1_reports:
    out_f1 = f1_score(y_new_test, y_pred, average=averaging)
    print(template.format(round(out_f1, 4)))

Raw data Accuracy: 0.4789
Raw data F1 Score(micro): 0.4789
Raw data F1 Score(macro): 0.2433
Raw data F1 Score(weighted): 0.3349
