In [1]:
#############################
## Prepare data for AZIMUTH annotation
## Annotates cells for each library seperately with automated azimuth pipeline

# Load Libraries

In [2]:
import scanpy as sc
import anndata as an
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import scanorama
import os
import multiprocessing
import random
import time
import git
import sys
from datetime import date

# Load Data

## Configurations

### Technical configurations

In [3]:
multiprocessing.cpu_count()  ### total amount of cpu on the used core

96

In [4]:
len(os.sched_getaffinity(0)) ### amount of available cpus

24

In [5]:
sc.settings.n_jobs   # default number of CPUs to use for parallel computing

1

In [6]:
sc.settings.max_memory  # maximum memory to use in GB

15

In [7]:
random.seed(7)

In [8]:
ncore = '16'

In [9]:
random_state_var = 0

In [10]:
os.environ["OMP_NUM_THREADS"] = ncore
os.environ["OPENBLAS_NUM_THREADS"] = ncore
os.environ["MKL_NUM_THREADS"] = ncore
os.environ["VECLIB_MAXIMUM_THREADS"] = ncore
os.environ["NUMEXPR_NUM_THREADS"] = ncore

In [11]:
sc.logging.print_versions()
sc.set_figure_params(facecolor="white", figsize=(8, 8))
sc.settings.verbosity = 0

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
anndata     0.7.6
scanpy      1.8.1
sinfo       0.3.4
-----
PIL                 8.3.1
annoy               NA
anyio               NA
attr                21.2.0
babel               2.9.1
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
brotli              NA
certifi             2021.05.30
cffi                1.14.6
chardet             4.0.0
charset_normalizer  2.0.0
cycler              0.10.0
cython_runtime      NA
dateutil            2.8.2
decorator           5.0.9
defusedxml       

### Parameters

In [18]:
data_path = '../data/current'

In [19]:
data_path

'../data/current'

In [20]:
result_path = '../results/current'

In [21]:
libraries =['0001', '0002', '0003', '0004', '0005', '0006', '0007', '0008', '0009', '0010', '0011', '0012', '0013', '0014'] # reduced to only 4 libraries for testing

In [22]:
libraries_text = ['L1', 'L2', 'L3', 'L4', 'L5', 'L6', 'L7', 'L8', 'L9', 'L10', 'L11', 'L12', 'L13', 'L14']

In [23]:
libraries_dict = {'0001': 'L1', '0002': 'L2', '0003': 'L3', '0004': 'L4', '0005': 'L5',  '0006': 'L6', '0007': 'L7', '0008': 'L8', '0009': 'L9', '0010': 'L10', '0011': 'L11', '0012': 'L12', '0013': 'L13', '0014': 'L14'}

## Meta-Data File

In [24]:
path = data_path + "/preprocessed-data/meta-data/Mapping_Hashtag_Sample.csv"
library_hashtag_mapping = pd.read_csv(path) 
print('Last modified' + time.ctime(os.path.getmtime(path)))

Last modifiedThu Sep  2 13:17:03 2021


In [25]:
# library_hashtag_mapping

## RNA data from A6

In [29]:
### use data from A6

In [30]:
anndata_dict = dict.fromkeys(libraries_text)

In [31]:
#anndata_dict

In [170]:
for key in anndata_dict:
    dataset_path = data_path + "/analysis/A/A6_Processed_" + key + '_rna_Final.h5ad'
    
    print(dataset_path)
    print('Last modified' + time.ctime(os.path.getmtime(dataset_path)))
    
    adata_orig = sc.read_h5ad(dataset_path)
    adata_orig.var_names_make_unique()  # make variable names unique
    sc.pp.filter_genes(adata_orig, min_cells=1)  # filter genes
    
    anndata_dict[key]= adata_orig # save anndata in dictionary
    anndata_dict[key].uns['data_load_time'] = time.ctime(os.path.getmtime(dataset_path))  # save the last modified timestamp of the data loaded
    anndata_dict[key].uns['data_load_name'] = dataset_path # save the data path of the data loaded 
    

../data/current/analysis/A/A6_Processed_L1_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:11 2023
../data/current/analysis/A/A6_Processed_L2_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:14 2023
../data/current/analysis/A/A6_Processed_L3_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:18 2023
../data/current/analysis/A/A6_Processed_L4_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:22 2023
../data/current/analysis/A/A6_Processed_L5_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:27 2023
../data/current/analysis/A/A6_Processed_L6_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:31 2023
../data/current/analysis/A/A6_Processed_L7_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:35 2023
../data/current/analysis/A/A6_Processed_L8_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:39 2023
../data/current/analysis/A/A6_Processed_L9_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:43 2023
../data/current/analysis/A/A6_Processed_L10_rna_Final.h5ad
Last modifiedFri Jan 20 11:51:47 2023
../data/current/analysis/A/A6_Processed

In [171]:
### Short data check

In [172]:
anndata_dict['L1']

AnnData object with n_obs × n_vars = 18549 × 21407
    obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'd

In [173]:
anndata_dict['L1'].X.sum(axis=1)

matrix([[2192.7395],
        [1646.0859],
        [2101.9817],
        ...,
        [2003.8291],
        [1930.2568],
        [2271.1406]], dtype=float32)

In [174]:
anndata_dict['L1'].raw.X.sum(axis=1)

matrix([[6685.],
        [3167.],
        [7907.],
        ...,
        [4105.],
        [4064.],
        [7343.]], dtype=float32)

# Investigation and Tests

In [175]:
pd.unique(anndata_dict['L1'].obs['HTO_classification.global'])

['Doublet', 'Negative', 'Singlet']
Categories (3, object): ['Doublet', 'Negative', 'Singlet']

# Generate AZIMUTH Input Data

In [176]:
### Filter on Singlets

In [177]:
for key in anndata_dict:
    anndata_dict[key] = anndata_dict[key][anndata_dict[key].obs['HTO_classification.global']=='Singlet']
    del anndata_dict[key].uns
    
    

In [178]:
pd.unique(anndata_dict['L1'].obs['HTO_classification.global'])

['Singlet']
Categories (1, object): ['Singlet']

In [179]:
anndata_dict['L1']

AnnData object with n_obs × n_vars = 12113 × 21407
    obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'd

In [180]:
### Remove rb and mitochondrial genes (only do this when using already filtered data for rb %)

In [181]:
for key in anndata_dict:
        anndata_dict[key] = anndata_dict[key][:,np.logical_and(anndata_dict[key].var['rb'] == False , anndata_dict[key].var['mt'] == False)]

In [182]:
anndata_dict['L1']

View of AnnData object with n_obs × n_vars = 12113 × 21294
    obs: 'nCount_HTO', 'nFeature_HTO', 'nCount_RNA', 'nFeature_RNA', 'percent_mt', 'HTO_maxID', 'HTO_secondID', 'HTO_margin', 'HTO_classification', 'HTO_classification.global', 'hash.ID', 'HTO_classification_final', 'name', 'library', 'id', 'read', 'pattern', 'sequence', 'feature_type', 'Unnamed: 7', 'in_sample', 'display_name', 'group', 'HTO_Doublet_Classification', 'doublet_score', 'predicted_doublet', 'A5_scrublet_predicted_doublet_lib0.1', 'A5_scrublet_doublet_score_lib_0.1', 'A5_scrublet_predicted_doublet_lib0.2', 'A5_scrublet_doublet_score_lib_0.2', 'A5_scrublet_predicted_doublet_lib', 'A5_scrublet_doublet_score_lib', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'valid_cell_filter_dying', 'valid_cell_filter_doublet'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'me

In [183]:
### Further data adaptions

In [184]:
for key in anndata_dict:
    for col in anndata_dict[key].obs.columns:
        if pd.api.types.is_categorical_dtype(anndata_dict[key].obs[col]):
            anndata_dict[key].obs[col]=anndata_dict[key].obs[col].cat.add_categories("missing").fillna('missing')

Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.
Trying to set attribute `.obs` of view, copying.


In [188]:
#for key in anndata_dict:
    #del anndata_dict[key].obs['Unnamed: 7']
    #del anndata_dict[key].obs['A5_scrublet_predicted_doublet_lib0.1']
    #del anndata_dict[key].obs['A5_scrublet_predicted_doublet_lib0.2']
    #del anndata_dict[key].obs['A5_scrublet_doublet_score_lib_0.1']

In [189]:
for key in anndata_dict:
    del anndata_dict[key].obs['HTO_classification.global']

In [190]:
for key in anndata_dict:
    del anndata_dict[key].obs['hash.ID']

In [191]:
for key in anndata_dict:
    del anndata_dict[key].layers

In [192]:
for key in anndata_dict:
    del anndata_dict[key].obsm

In [206]:
anndata_dict[key].obs

Unnamed: 0,nCount_HTO,nFeature_HTO,nCount_RNA,nFeature_RNA,percent_mt,HTO_maxID,HTO_secondID,HTO_margin,HTO_classification,HTO_classification_final,...,A5_scrublet_doublet_score_lib_0.2,A5_scrublet_predicted_doublet_lib,A5_scrublet_doublet_score_lib,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt,valid_cell_filter_dying,valid_cell_filter_doublet
AAACCCACATACAGGG-1,1574.0,7,3947.0,1232,7.980745,9.1,1.1,2.253048,9.1,9.1,...,0.100684,False,0.100684,1232,1232,3947.0,315.0,7.980745,True,True
AAACCCACATGACTTG-1,379.0,7,5871.0,1890,7.256004,9.1,7.1,0.943193,9.1,9.1,...,0.015934,False,0.015934,1890,1890,5871.0,426.0,7.256004,True,True
AAACCCAGTCATCAGT-1,421.0,7,3889.0,1297,4.551299,7.1,1.1,2.265289,7.1,7.1,...,0.104063,False,0.104063,1297,1297,3889.0,177.0,4.551299,True,True
AAACCCAGTGGTAATA-1,478.0,7,5094.0,1327,6.340793,9.1,8.1,1.319735,9.1,9.1,...,0.403509,False,0.403509,1327,1327,5094.0,323.0,6.340793,True,True
AAACCCATCATCACAG-1,851.0,7,17480.0,2363,2.665904,9.1,5.1,1.629594,9.1,9.1,...,0.124464,False,0.124464,2363,2359,17326.0,466.0,2.689599,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGGTACCACGC-1,582.0,7,6647.0,2297,9.658493,1.1,9.1,1.017658,1.1,1.1,...,0.051482,False,0.051482,2297,2297,6647.0,642.0,9.658493,True,True
TTTGTTGGTGTCCAAT-1,379.0,7,2965.0,823,6.610455,8.1,5.1,1.425397,8.1,8.1,...,0.037681,False,0.037681,823,823,2965.0,196.0,6.610455,True,True
TTTGTTGTCAGACTGT-1,353.0,7,4259.0,1261,5.541207,5.1,8.1,1.482897,5.1,5.1,...,0.155875,False,0.155875,1261,1261,4259.0,236.0,5.541207,True,True
TTTGTTGTCCGTAGGC-1,250.0,7,4105.0,1250,2.752741,7.1,5.1,1.618696,7.1,7.1,...,0.168961,False,0.168961,1250,1250,4105.0,113.0,2.752741,True,True


In [208]:
anndata_dict[key].obs[['nCount_RNA' , 'nFeature_RNA']]

Unnamed: 0,nCount_RNA,nFeature_RNA
AAACCCACATACAGGG-1,3947.0,1232
AAACCCACATGACTTG-1,5871.0,1890
AAACCCAGTCATCAGT-1,3889.0,1297
AAACCCAGTGGTAATA-1,5094.0,1327
AAACCCATCATCACAG-1,17480.0,2363
...,...,...
TTTGTTGGTACCACGC-1,6647.0,2297
TTTGTTGGTGTCCAAT-1,2965.0,823
TTTGTTGTCAGACTGT-1,4259.0,1261
TTTGTTGTCCGTAGGC-1,4105.0,1250


In [209]:
for key in anndata_dict:
    anndata_dict[key].obs = anndata_dict[key].obs[['nCount_RNA' , 'nFeature_RNA']]

In [210]:
### Get raw data
for key in anndata_dict:
    anndata_dict[key] = anndata_dict[key].raw.to_adata()
    anndata_dict[key] = anndata_dict[key].copy()

AttributeError: 'NoneType' object has no attribute 'to_adata'

In [211]:
anndata_dict['L1'].X.sum(axis=1)

matrix([[3947.],
        [5871.],
        [3889.],
        ...,
        [4259.],
        [4105.],
        [4064.]], dtype=float32)

In [212]:
anndata_dict['L1'].obs

Unnamed: 0,nCount_RNA,nFeature_RNA
AAACCCACATACAGGG-1,3947.0,1232
AAACCCACATGACTTG-1,5871.0,1890
AAACCCAGTCATCAGT-1,3889.0,1297
AAACCCAGTGGTAATA-1,5094.0,1327
AAACCCATCATCACAG-1,17480.0,2363
...,...,...
TTTGTTGGTACCACGC-1,6647.0,2297
TTTGTTGGTGTCCAAT-1,2965.0,823
TTTGTTGTCAGACTGT-1,4259.0,1261
TTTGTTGTCCGTAGGC-1,4105.0,1250


In [214]:
anndata_dict['L1']

AnnData object with n_obs × n_vars = 12113 × 21415
    obs: 'nCount_RNA', 'nFeature_RNA'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells', 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'

In [217]:
anndata_dict[key].var[['gene_ids', 'n_cells']]

Unnamed: 0,gene_ids,n_cells
AL627309.1,ENSG00000238009,70
AL627309.3,ENSG00000239945,6
AL627309.4,ENSG00000241599,19
AL669831.2,ENSG00000229905,8
AL669831.5,ENSG00000237491,2237
...,...,...
AL354822.1,ENSG00000278384,330
AC004556.1,ENSG00000276345,1871
AC233755.2,ENSG00000277856,31
AC233755.1,ENSG00000275063,27


In [218]:
for key in anndata_dict:
    anndata_dict[key].var  = anndata_dict[key].var[['gene_ids', 'n_cells']]

In [219]:
anndata_dict['L1'].var

Unnamed: 0,gene_ids,n_cells
AL627309.1,ENSG00000238009,56
AL627309.3,ENSG00000239945,3
AL627309.4,ENSG00000241599,15
AL669831.2,ENSG00000229905,6
AL669831.5,ENSG00000237491,1310
...,...,...
AC007325.4,ENSG00000278817,94
AL354822.1,ENSG00000278384,130
AC004556.1,ENSG00000276345,470
AC233755.2,ENSG00000277856,3


In [220]:
anndata_dict['L1']

AnnData object with n_obs × n_vars = 12113 × 21415
    obs: 'nCount_RNA', 'nFeature_RNA'
    var: 'gene_ids', 'n_cells'

# Save Azimuth Input Data

In [None]:
for key in anndata_dict:
    print(key)
    data_name = data_path + '/analysis/G/' +  'G2_Processed_Singlet' + key + '_rna.h5ad'
    
    print(data_name)
    print('Last modified' + date.today().strftime("%m/%d/%Y, %H:%M:%S"))

    anndata_dict[key].write(data_name)

L1
../data/current/analysis/G/G2_Processed_SingletL1_rna.h5ad
Last modified01/27/2023, 00:00:00
L2
../data/current/analysis/G/G2_Processed_SingletL2_rna.h5ad
Last modified01/27/2023, 00:00:00
L3
../data/current/analysis/G/G2_Processed_SingletL3_rna.h5ad
Last modified01/27/2023, 00:00:00
L4
../data/current/analysis/G/G2_Processed_SingletL4_rna.h5ad
Last modified01/27/2023, 00:00:00
L5
../data/current/analysis/G/G2_Processed_SingletL5_rna.h5ad
Last modified01/27/2023, 00:00:00
L6
../data/current/analysis/G/G2_Processed_SingletL6_rna.h5ad
Last modified01/27/2023, 00:00:00
L7
../data/current/analysis/G/G2_Processed_SingletL7_rna.h5ad
Last modified01/27/2023, 00:00:00
L8
../data/current/analysis/G/G2_Processed_SingletL8_rna.h5ad
Last modified01/27/2023, 00:00:00
