In [107]:
### Clustering Data only for Scanorama

In [108]:
### TBD check out clusters + differences on complete data

# Load Libraries

## External libraries

In [109]:
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scanorama
import os
import multiprocessing
import random
import time
import git
import sys
from datetime import date

## Own functions

In [110]:
import MS1_Clustering

In [111]:
from MS1_Clustering import *

# Load Data

## Configurations

### Technical configurations

In [112]:
multiprocessing.cpu_count()  # total number of CPUs in the machine (not only those this process may use)

48

In [113]:
len(os.sched_getaffinity(0)) # number of CPUs the current process is allowed to run on

24

In [114]:
sc.settings.n_jobs   # default number of CPUs to use for parallel computing

1

In [115]:
sc.settings.max_memory  # maximum memory to use in GB

15

In [116]:
# Seed Python's built-in `random` module only. NumPy is not seeded here;
# the clustering calls further down receive `random_state_var` instead.
random.seed(0)

In [117]:
# Thread count for the numerical backends. Kept as a string because it is
# assigned directly into os.environ; 24 matches the CPU affinity reported above.
ncore = '24'

In [118]:
random_state_var = 0

In [119]:
# Apply the same thread limit through every backend-specific environment variable.
# NOTE(review): such variables are usually read when the library is first loaded,
# and numpy/scanpy are already imported above — confirm the limit takes effect.
for thread_env_name in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[thread_env_name] = ncore

In [120]:
# Print the versions of scanpy and the loaded packages (provenance of this run).
sc.logging.print_versions()
# White figure background and an 8 x 8 default figure size for scanpy plots.
sc.set_figure_params(facecolor="white", figsize=(8, 8))
# Lowest verbosity level — presumably errors only; TODO confirm against the scanpy docs.
sc.settings.verbosity = 0

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
anndata     0.7.6
scanpy      1.8.1
sinfo       0.3.4
-----
MS1_Clustering      NA
PIL                 8.3.1
annoy               NA
anyio               NA
attr                21.2.0
babel               2.9.1
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
certifi             2021.05.30
cffi                1.14.6
chardet             4.0.0
charset_normalizer  2.0.0
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
decorator           

In [121]:
file_name = 'B2_Clustering_Scanorama.ipynb'

In [122]:
# Open the git repository at the relative path 'stark-stemi'. The save cell
# uses it to commit this notebook before the result files are written.
repo = git.Repo('stark-stemi')

### Parameters

In [123]:
data_path = '../data/current'

In [124]:
data_path

'../data/current'

In [125]:
variants = ['Scanorama']

## Meta-Data File

In [126]:
# Read the meta-data table (Mapping_Hashtag_Sample.csv) and print the file's
# modification time so the output documents which version was used.
data_set_path = data_path + "/preprocessed-data/meta-data/Mapping_Hashtag_Sample.csv"
library_hashtag_mapping = pd.read_csv(data_set_path)
meta_mtime_text = time.ctime(os.path.getmtime(data_set_path))
print('Last modified' + meta_mtime_text)

Last modifiedThu Sep  2 13:17:03 2021


In [127]:
#library_hashtag_mapping

## Combined data from B1

### Scanorama Data

In [128]:
# File name of the B1 integration result, relative to `data_path`
# (note that it already starts with a '/').
# The save cell at the end of the notebook reassigns `data_name`.
data_name = '/analysis/B/B1_Integrated_Scanorama_processed_rna.h5ad'
# presumably the file-name suffix of a backup copy — verify before use:
#_backup_23_12_2022

In [129]:
# One (still empty) slot per variant; filled by the load loop.
anndata_dict = {variant_name: None for variant_name in variants}

In [130]:
# Load the integrated AnnData object for every variant and record where it came from.
# Note: the path does not depend on `key`, so every variant reads the same file.
for key in anndata_dict:
    # `data_name` already starts with '/'; strip it before joining so the printed
    # and stored path has no doubled separator ('../data/current//analysis/...').
    dataset_path = data_path + '/' + data_name.lstrip('/')
    # Read the modification time once so the printed and the stored value agree.
    dataset_mtime = time.ctime(os.path.getmtime(dataset_path))

    print(dataset_path)
    print('Last modified' + dataset_mtime)

    adata_orig = sc.read_h5ad(dataset_path)

    anndata_dict[key] = adata_orig  # save anndata in dictionary
    anndata_dict[key].uns['data_load_time'] = dataset_mtime  # last-modified timestamp of the loaded file
    anndata_dict[key].uns['data_load_name'] = dataset_path  # path the data was loaded from
    

../data/current//analysis/B/B1_Integrated_Scanorama_processed_rna.h5ad
Last modifiedSat Jan 21 00:19:44 2023


In [131]:
# anndata_dict  # cells have been filtered based on QC metrics; only the 2000 highly variable genes that have been integrated

# Prepare data for clustering (different variants)

## Complete

In [132]:
# One (still empty) slot per variant for the complete data.
anndata_dict_all = {variant_name: None for variant_name in variants}

In [133]:
# The "complete" variant is the loaded object itself (same object, no copy).
for key, adata_full in anndata_dict.items():
    anndata_dict_all[key] = adata_full

In [134]:
anndata_dict_all

{'Scanorama': AnnData object with n_obs × n_vars = 289075 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA_50_10_n

## Only singlets

In [135]:
# One (still empty) slot per variant for the singlet-only data.
anndata_dict_singlet = {variant_name: None for variant_name in variants}

In [136]:
# Keep only the cells whose 'HTO_classification.global' is 'Singlet'.
# The result is a view of the full object, not a copy.
for key in anndata_dict:
    print(key)
    adata_full = anndata_dict[key]
    is_singlet = adata_full.obs['HTO_classification.global'] == 'Singlet'
    anndata_dict_singlet[key] = adata_full[is_singlet]

Scanorama


In [137]:
anndata_dict_singlet

{'Scanorama': View of AnnData object with n_obs × n_vars = 148275 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA

In [138]:
# Row-wise (per-cell) sum of X for the last variant.
# The variant is looked up explicitly instead of through the `key` variable
# leaked from an earlier for-loop, so the cell no longer depends on which loop ran last.
anndata_dict_singlet[list(anndata_dict_singlet)[-1]].X.sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

## Only singlets - without ribosomal / mitochondrial RNA

In [139]:
# One (still empty) slot per variant for the singlet data without rb/mt genes.
anndata_dict_singlet_rb_mt = {variant_name: None for variant_name in variants}

In [140]:
# Singlets only, restricted to genes flagged neither 'rb' nor 'mt' in .var.
# NOTE(review): in the recorded run the result still has 2000 genes, i.e. the
# gene mask removed nothing — confirm this is expected.
for key in anndata_dict:
    print(key)
    adata_full = anndata_dict[key]
    # Row filter: singlet cells.
    is_singlet = adata_full.obs['HTO_classification.global'] == 'Singlet'
    singlet_view = adata_full[is_singlet]
    # Column filter, built from the .var table of the full object.
    keep_gene = np.logical_and(adata_full.var['rb'] == False, adata_full.var['mt'] == False)
    anndata_dict_singlet_rb_mt[key] = singlet_view[:, keep_gene]

Scanorama


In [141]:
anndata_dict_singlet_rb_mt

{'Scanorama': View of AnnData object with n_obs × n_vars = 148275 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA

In [142]:
# Row-wise (per-cell) sum of X for the last variant.
# The variant is looked up explicitly instead of through the `key` variable
# leaked from an earlier for-loop, so the cell no longer depends on which loop ran last.
anndata_dict_singlet_rb_mt[list(anndata_dict_singlet_rb_mt)[-1]].X.sum(axis = 1)

matrix([[21.065617],
        [25.584118],
        [21.224699],
        ...,
        [22.754984],
        [29.788113],
        [23.377083]], dtype=float32)

## Only singlets - without ribosomal / mitochondrial RNA - stronger QC

In [143]:
# One (still empty) slot per variant for the stronger-QC singlet data.
anndata_dict_singlet_qc_rb_mt = {variant_name: None for variant_name in variants}

In [144]:
# Singlets with 'pct_counts_mt' below 10, restricted to genes flagged
# neither 'rb' nor 'mt' in .var.
for key in anndata_dict:
    print(key)
    adata_full = anndata_dict[key]
    # Step 1: singlet cells only.
    singlets = adata_full[adata_full.obs['HTO_classification.global'] == 'Singlet']
    # Step 2: stronger QC on the mitochondrial fraction.
    low_mt = singlets[singlets.obs['pct_counts_mt'] < 10]
    # Step 3: gene filter.
    keep_gene = np.logical_and(low_mt.var['rb'] == False, low_mt.var['mt'] == False)
    anndata_dict_singlet_qc_rb_mt[key] = low_mt[:, keep_gene]

Scanorama


In [145]:
anndata_dict_singlet_qc_rb_mt

{'Scanorama': View of AnnData object with n_obs × n_vars = 143033 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA

# Clustering and UMAP

## Complete data

In [146]:
# Neighbourhood graph and clustering on the complete data, using the
# 'X_scanorama' representation.
# The first argument is presumably the prefix of the resulting cluster column — verify in MS1_Clustering.
anndata_result = MS1_Clustering.neighbors_and_cluster(
    'B2_Scanorama',
    anndata_dict,
    anndata_dict_all,
    use_rep_var="X_scanorama",
    random_state_var=random_state_var,
    n_neighbors_var=10,
    n_pcs_var=50,
)

2023-01-21 09:22:42.617006: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-21 09:22:42.617062: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Calculated neighborhood graph
Clustered cells
ScanoramaB2_Scanorama variant finished


In [147]:
### Store the result in anndata_dict

In [148]:
# Keep element 0 of the returned result as the new working dictionary
# (it is used below as a dict of AnnData objects keyed by variant).
anndata_dict = anndata_result[0]

## Only singlets

In [149]:
# Neighbourhood graph and clustering on the singlet-only data, using the
# 'X_scanorama' representation.
anndata_result = MS1_Clustering.neighbors_and_cluster(
    'B2_Scanorama_Singlet',
    anndata_dict_singlet,
    anndata_dict_singlet,
    use_rep_var="X_scanorama",
    random_state_var=random_state_var,
    n_neighbors_var=10,
    n_pcs_var=50,
)

Calculated neighborhood graph
Clustered cells
ScanoramaB2_Scanorama_Singlet variant finished


In [150]:
### Store the result in anndata_dict_singlet

In [151]:
# Keep element 0 of the returned result as the new singlet dictionary.
anndata_dict_singlet = anndata_result[0]

## Singlets - without ribosomal / mitochondrial RNA

In [152]:
# Neighbourhood graph and clustering for the singlets without rb/mt genes,
# using the 'X_scanorama_rb_mt' representation.
# Unlike the other calls, two different dictionaries are passed here; how the
# helper combines them is not visible from this notebook — see MS1_Clustering.
anndata_result = MS1_Clustering.neighbors_and_cluster(
    'B2_Scanorama_Singlet_rb_mt',
    anndata_dict_singlet,
    anndata_dict_singlet_rb_mt,
    use_rep_var='X_scanorama_rb_mt',
    random_state_var=random_state_var,
    n_neighbors_var=10,
    n_pcs_var=50,
)

Calculated neighborhood graph
Clustered cells
ScanoramaB2_Scanorama_Singlet_rb_mt variant finished


In [153]:
### Store the result in anndata_dict_singlet

In [154]:
# Overwrite the singlet dictionary with element 0 of the rb/mt run. The merge
# cell later reads both 'B2_Scanorama_Singlet_cluster' and
# 'B2_Scanorama_Singlet_rb_mt_cluster' from this dictionary.
anndata_dict_singlet = anndata_result[0]

## Singlets - without ribosomal / mitochondrial RNA - stronger QC

In [155]:
# Neighbourhood graph and clustering for the stronger-QC singlets without
# rb/mt genes, using the 'X_scanorama_rb_mt' representation.
anndata_result = MS1_Clustering.neighbors_and_cluster(
    'B2_Scanorama_Singlet_qc_rb_mt',
    anndata_dict_singlet_qc_rb_mt,
    anndata_dict_singlet_qc_rb_mt,
    use_rep_var='X_scanorama_rb_mt',
    random_state_var=random_state_var,
    n_neighbors_var=10,
    n_pcs_var=50,
)

Calculated neighborhood graph
Clustered cells
ScanoramaB2_Scanorama_Singlet_qc_rb_mt variant finished


In [156]:
### Store the result in anndata_dict_singlet_qc_rb_mt

In [157]:
# Keep element 0 of the returned result as the new stronger-QC singlet dictionary.
anndata_dict_singlet_qc_rb_mt = anndata_result[0]

# Saving

In [158]:
anndata_dict_singlet

{'Scanorama': AnnData object with n_obs × n_vars = 148275 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA_50_10_n

## Combine both datasets

In [159]:
# Copy the singlet-only cluster assignments onto the complete data set.
# Left join on the cell index: cells that are not singlets get missing values
# in the added columns.
singlet_cluster_cols = ['B2_Scanorama_Singlet_cluster', 'B2_Scanorama_Singlet_rb_mt_cluster']
for key in anndata_dict:
    print(key)
    # Drop the columns first in case an earlier run of this cell already added
    # them; otherwise pd.merge would produce '_x'/'_y' duplicates on re-run.
    obs_all = anndata_dict[key].obs.drop(columns=singlet_cluster_cols, errors='ignore')
    anndata_dict[key].obs = pd.merge(obs_all, anndata_dict_singlet[key].obs[singlet_cluster_cols], how='left', left_index = True, right_index = True)
## TBD: add keys of added clusters

Scanorama


In [160]:
anndata_dict

{'Scanorama': AnnData object with n_obs × n_vars = 289075 × 2000
     obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet', 'A7_PCA_50_10_neighbors_cluster', 'A7_PCA_100_10_neighbors_cluster', 'A7_PCA_100_50_neighbors_cluster', 'A7_PCA_500_50_neighbors_cluster', 'A7_Singlet_PCA_50_10_n

## Save data

In [161]:
# Write the three result objects to disk and commit the notebook that produced them.
# Note: the output paths do not depend on `key`; with more than one variant the
# files would overwrite each other.
# NOTE: this cell reassigns `data_name`, which the load cell uses as the input
# file name; re-run the cell defining `data_name` before re-running the load cell.
for key in anndata_dict:
    # One timestamp for the whole save step. `date.today()` carries no
    # time-of-day, so its '%H:%M:%S' part was always '00:00:00';
    # time.strftime() formats the current local time instead.
    save_timestamp = time.strftime("%m/%d/%Y, %H:%M:%S")

    ##### Complete data

    data_name = data_path +  '/analysis/B/B2_Integrated_Scanorama_processed_rna.h5ad'

    print(key)
    print(data_name)
    print('Last modified' + save_timestamp)

    anndata_dict[key].uns['data_save_time'] = save_timestamp  # timestamp of this save
    anndata_dict[key].uns['data_save_name'] = data_name  # path the data is saved to

    # Commit the notebook itself so the saved data can be traced to a code version.
    working_directory = os.getcwd()
    repo.index.add([working_directory + '/' + file_name])
    commit = repo.index.commit('Save data '+ data_name + ' '+ save_timestamp )
    print(commit)

    anndata_dict[key].write(data_name)

    ##### Singlet data (stronger QC) — gets the same provenance entries as the other two

    qc_data_name = data_path + '/analysis/B/B2_Integrated_Scanorama_Singlet_QC_processed_rna.h5ad'

    anndata_dict_singlet_qc_rb_mt[key].uns['data_save_time'] = save_timestamp  # timestamp of this save
    anndata_dict_singlet_qc_rb_mt[key].uns['data_save_name'] = qc_data_name  # path the data is saved to

    ##### Singlet data

    data_name = data_path + '/analysis/B/B2_Integrated_Scanorama_Singlet_processed_rna.h5ad'

    anndata_dict_singlet[key].uns['data_save_time'] = save_timestamp  # timestamp of this save
    anndata_dict_singlet[key].uns['data_save_name'] = data_name  # path the data is saved to

    anndata_dict_singlet[key].write(data_name)

    anndata_dict_singlet_qc_rb_mt[key].write(qc_data_name)

Scanorama
../data/current/analysis/B/B2_Integrated_Scanorama_processed_rna.h5ad
Last modified01/21/2023, 00:00:00
701facdcaf664e5a5b0599ffcf4d1d129f2b6264
