In [1]:
## Script for integrating the separate processed datasets with Scanorama


# Load Libraries

In [None]:
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scanorama
import os
import multiprocessing
import random
import time
import git
import sys
from datetime import date
from datetime import datetime

In [None]:
# Smoke test for multi-threaded linear algebra: multiply two large random
# square matrices. Nothing is asserted here; CPU usage has to be watched
# from outside the notebook while the cell runs.
size = 10000
a, b = (np.random.random_sample((size, size)) for _ in range(2))
n = a @ b
# Author's observation: this test works --> multiprocessing for numpy does not seem to be a problem

# Load Data

## Configurations

### Technical configurations

In [None]:
multiprocessing.cpu_count()  # total number of CPUs reported for this machine

In [None]:
len(os.sched_getaffinity(0)) # number of CPUs the current process is allowed to run on

In [None]:
sc.settings.n_jobs   # scanpy setting: default number of CPUs to use for parallel computing

In [None]:
sc.settings.max_memory  # scanpy setting: maximum memory to use in GB

In [None]:
# Seed every random number generator the notebook draws from, not only the
# standard-library one: numpy's global generator is used in this notebook and
# is not affected by `random.seed`.
random.seed(7)
np.random.seed(7)

In [None]:
# Thread count for the numerical back ends. Kept as a string because it is
# written into environment variables further down.
ncore = '24'

In [None]:
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
random_state_var = 0

In [None]:
# Set the thread-count variables of the common numerical back ends to `ncore`.
# NOTE(review): numpy has already been imported and used before this cell
# runs; libraries that read these variables only when they are loaded may
# ignore the values set here -- confirm, or move this cell above the imports.
for _thread_env_var in (
    "OMP_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "MKL_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
    "NUMEXPR_NUM_THREADS",
):
    os.environ[_thread_env_var] = ncore

In [None]:
# Record the package versions in the notebook output, then configure scanpy.
sc.logging.print_versions()
sc.set_figure_params(facecolor="white", figsize=(8, 8))  # white figure background, 8 x 8 figure size
sc.settings.verbosity = 0  # NOTE(review): 0 is presumably the quietest level (errors only) -- confirm

In [None]:
# Name of this notebook file; it is staged and committed in the save cell.
file_name = 'B1_Data_Integration_Scanorama.ipynb'

In [None]:
# Handle on the git repository used for that commit.
# NOTE(review): 'stark-stemi' is a relative path and is therefore resolved
# against the current working directory -- confirm where the notebook is run.
repo = git.Repo('stark-stemi')

### Parameters

In [None]:
# Root directory of the data; the input and output paths are built from it.
data_path = '../data/current'

In [None]:
data_path  # display the configured path

In [None]:
# Library IDs (all 14 libraries).
# NOTE(review): `libraries` is not referenced in the visible part of the notebook.
# Reduced two-library variant for quick tests:
# libraries =['0001', '0002']
libraries =['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008', '0009', '0010', '0011', '0012', '0013', '0014']

In [None]:
# Short library labels; they key the per-library dictionaries and are part of
# the input file names.
# Reduced two-library variant for quick tests:
# libraries_text = ['L1', 'L2']
libraries_text = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10', 'L11', 'L12', 'L13', 'L14']

In [None]:
# Mapping from library ID to short label.
# NOTE(review): not referenced in the visible part of the notebook.
libraries_dict = {'0001': 'L1', '0002': 'L2', '0003': 'L3', '0004': 'L4', '0005': 'L5',  '0006': 'L6', '0007': 'L7', '0008': 'L8', '0009': 'L9', '0010': 'L10', '0011': 'L11', '0012': 'L12', '0013': 'L13', '0014': 'L14'}

In [None]:
# Reduced two-library variant for quick tests:
#libraries_dict = {'0001': 'L1', '0002': 'L2'}

In [None]:
# Run identifier. Later cells compare it against a 'B6_..._MOFA' name to
# switch code paths; the output file name in the save cell is not derived from it.
save_name = 'B1_DE_Integrated_Singlet_processed'

## RNA Data from A7

In [None]:
# One (initially empty) slot per library label; filled by the loading loop.
anndata_dict = {library: None for library in libraries_text}

In [None]:
# Same layout as the processed dictionary, but meant to hold the objects built
# from the `.raw` slot (the original raw data).
anndata_dict_raw = {library: None for library in libraries_text}

In [None]:
# Load the processed A7 AnnData object of every library and keep two versions:
#   anndata_dict[key]     -- the object as read from disk
#   anndata_dict_raw[key] -- a stand-alone AnnData built from its `.raw` slot
# NOTE: `key` stays bound to the last library after the loop; the inspection
# cells that follow rely on that.
for key in anndata_dict:
    dataset_path = data_path + "/analysis/A/A7_Processed_" + key + '_rna_Final.h5ad'
    # Look the modification time up once so the printed and stored values agree.
    dataset_mtime = time.ctime(os.path.getmtime(dataset_path))

    print(dataset_path)
    print('Last modified: ' + dataset_mtime)

    adata_orig = sc.read_h5ad(dataset_path)
    if adata_orig.raw is None:
        # Fail with a clear message instead of an AttributeError on `None`.
        raise ValueError('No .raw data stored in ' + dataset_path)

    anndata_dict[key] = adata_orig  # save anndata in dictionary
    anndata_dict_raw[key] = adata_orig.raw.to_adata()

    # Provenance: last-modified timestamp and path of the file that was loaded.
    adata_orig.uns['data_load_time'] = dataset_mtime
    adata_orig.uns['data_load_name'] = dataset_path
    

In [None]:
### Short data check

In [None]:
# NOTE: `key` is the loop variable left over from the loading cell, so these
# checks only look at the last library that was loaded.
anndata_dict[key]  # author's note: cells have been filtered based on QC metrics

In [None]:
anndata_dict[key].X.sum(axis=1) # per-cell sums; author's note: data was normalized and log-transformed


In [None]:
anndata_dict_raw[key].X.sum(axis=1) # per-cell sums of the object built from `.raw`; author's note: original raw count data

# Data Integration

## Put all in one anndata list

In [None]:
# Flatten the per-library dictionary into a list, keeping the dictionary order.
anndata = [anndata_dict[library] for library in anndata_dict]

## Correct counts with Scanorama on complete data

In [None]:
#### Use list containing all libraries
# anndata

In [None]:
print(datetime.now())  # timestamp before the correction step, to check its duration

In [None]:
# Run Scanorama's batch correction on the list of all libraries.
adatas_cor = scanorama.correct_scanpy(
    anndata,
    return_dimred=True,
    batch_size=2000,
    hvg=2000,
)


In [None]:
print(datetime.now())  # timestamp after the correction step

In [None]:
adatas_cor  # display the corrected objects returned by Scanorama

In [None]:
adatas_cor[0].uns  # unstructured metadata of the first corrected object

In [None]:
# Stack the corrected per-library objects into a single AnnData; the library
# of every cell is recorded in `obs['library_id']`.
# All remaining objects are passed (`[1:]`) instead of a fixed `[1:14]` slice,
# so no library is silently dropped when the number of libraries changes, and
# the labels are handed over as a plain list rather than a dict view.
adata_complete_cor = adatas_cor[0].concatenate(
    *adatas_cor[1:],
    batch_key="library_id",
    batch_categories=list(anndata_dict.keys()),
    uns_merge="unique",
)

In [None]:
#adata_complete_cor

In [None]:
#adata_complete_cor.X.sum(axis=1)

## Correct counts with Scanorama on complete data (without rb + mt genes)

In [None]:
# One (initially empty) slot per library label for the gene-filtered objects.
anndata_dict_rb_mt = {library: None for library in libraries_text}

In [None]:
# For every library keep only the genes whose `.var['rb']` and `.var['mt']`
# flags are both False (presumably ribosomal / mitochondrial genes -- the
# section title speaks of "rb + mt genes").
for key in anndata_dict:
    print(key)
    # anndata_dict_singlet_rb_mt[key] = anndata_dict[key][anndata_dict[key].obs['HTO_classification.global']=='Singlet']

    adata_lib = anndata_dict[key]
    not_rb = adata_lib.var['rb'] == False
    not_mt = adata_lib.var['mt'] == False
    anndata_dict_rb_mt[key] = adata_lib[:, not_rb & not_mt]

    # Only for the comparison run against MOFA: additionally restrict to a
    # selected gene set.
    # NOTE(review): `genes_filter` is not defined in the visible part of the
    # notebook; it has to exist before this branch can run.
    if save_name == 'B6_DE_Integrated_Singlet_processed_rna_MOFA':
        anndata_dict_rb_mt[key] = anndata_dict_rb_mt[key][:, genes_filter]

In [None]:
anndata_dict_rb_mt[key]  # filtered object of the last library (`key` is left over from the loop above)

In [None]:
# Flatten the gene-filtered dictionary into a list, keeping the dictionary order.
anndata_rb_mt = [anndata_dict_rb_mt[library] for library in anndata_dict_rb_mt]

In [None]:
anndata_rb_mt[1]  # spot check: second entry of the filtered list

In [None]:
print(datetime.now())  # timestamp before the correction step, to check its duration

In [None]:
save_name != 'B6_DE_Integrated_Singlet_processed_rna_MOFA'  # shows whether the correction cell below will run

In [None]:
# Run Scanorama's batch correction on the gene-filtered libraries, except for
# the MOFA comparison run.
# NOTE(review): `adatas_cor_rb_mt` is used unconditionally further down, so it
# is undefined whenever this condition is False.
if save_name != 'B6_DE_Integrated_Singlet_processed_rna_MOFA':
    adatas_cor_rb_mt = scanorama.correct_scanpy(
        anndata_rb_mt,
        return_dimred=True,
        batch_size=2000,
        hvg=2000,
    )


In [None]:
 print(datetime.now())

In [None]:
# adatas_cor

In [None]:
# adatas_cor[0].uns

In [None]:
# Stack the corrected, gene-filtered per-library objects into a single
# AnnData; the library of every cell is recorded in `obs['library_id']`.
# All remaining objects are passed (`[1:]`) instead of a fixed `[1:14]` slice,
# so no library is silently dropped when the number of libraries changes, and
# the labels are handed over as a plain list rather than a dict view.
adata_complete_cor_rb_mt = adatas_cor_rb_mt[0].concatenate(
    *adatas_cor_rb_mt[1:],
    batch_key="library_id",
    batch_categories=list(anndata_dict.keys()),
    uns_merge="unique",
)

In [None]:
adata_complete_cor_rb_mt  # overview of the concatenated object

In [None]:
# Per-cell sums of X, restricted to cells whose HTO classification is 'Singlet'.
adata_complete_cor_rb_mt[adata_complete_cor_rb_mt.obs['HTO_classification.global']=='Singlet'].X.sum(axis=1)

In [None]:
# Scanorama embedding of the 'Singlet' cells only.
adata_complete_cor_rb_mt[adata_complete_cor_rb_mt.obs['HTO_classification.global']=='Singlet'].obsm['X_scanorama']  # author's note: alignment confirmed with B3

In [None]:
adata_complete_cor_rb_mt.obsm['X_scanorama']  # Scanorama embedding of all cells

In [None]:
adata_complete_cor_rb_mt.X.sum(axis=1)  # per-cell sums of X, all cells

In [None]:
adata_complete_cor_rb_mt.obsm['X_scanorama'].sum(axis=1)  # per-cell sums of the embedding

In [None]:
np.shape(adata_complete_cor_rb_mt.X)  # shape of X

In [None]:
np.shape(adata_complete_cor_rb_mt.obsm['X_scanorama'])  # shape of the embedding

# Save the result

## Combine different variants

In [None]:
#adata_complete_cor.obsm['X_scanorama_rb_mt'] = adata_complete_cor_rb_mt.obsm['X_scanorama']

In [None]:
#adata_complete_cor

In [None]:
### only for NEW:
# Make the embedding of the gene-filtered run available under the key
# 'X_scanorama_rb_mt' as well; the 'X_scanorama' entry is left in place.
adata_complete_cor_rb_mt.obsm['X_scanorama_rb_mt'] = adata_complete_cor_rb_mt.obsm['X_scanorama']

## Do some data format adaptions

In [None]:
### convert column to right format

In [None]:
#adata_complete_cor.obs['A5_scrublet_predicted_doublet_lib0.2'] =adata_complete_cor.obs['A5_scrublet_predicted_doublet_lib0.2'].apply(str)

In [None]:
### only for NEW:

# Replace every value of the scrublet column by its string form.
# NOTE(review): presumably needed so the column can be written to the h5ad file -- confirm.
adata_complete_cor_rb_mt.obs['A5_scrublet_predicted_doublet_lib0.2'] =adata_complete_cor_rb_mt.obs['A5_scrublet_predicted_doublet_lib0.2'].apply(str)

## Save

In [None]:
# Save the integrated, gene-filtered object and commit this notebook file.
data_name = data_path + '/analysis/B/' +  'B1_Integrated_Scanorama_processed_rna.h5ad'

# Take the timestamp once, from `datetime.now()`. `date.today()` carries no
# time of day, so the "%H:%M:%S" part always came out as 00:00:00, and the
# printed, stored and committed values are now guaranteed to be identical.
save_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")

print(data_name)
print('Last modified: ' + save_time)

# Provenance stored inside the object: when and where it was saved.
adata_complete_cor_rb_mt.uns['data_save_time'] = save_time
adata_complete_cor_rb_mt.uns['data_save_name'] = data_name

# Stage and commit the notebook file from the current working directory.
working_directory = os.getcwd()
repo.index.add([os.path.join(working_directory, file_name)])
commit = repo.index.commit('Save data ' + data_name + ' ' + save_time)
print(commit)

adata_complete_cor_rb_mt.write(data_name)