1. Download
* 

In [None]:
!mkdir ./DataDir

In [None]:
import os
import sys

import numpy as np
import pandas as pd
import anndata as ad
import scanpy as sc
from scipy.sparse import csr_matrix, isspmatrix

import matplotlib.pyplot as plt
import scanpy.external as sce

from datetime import datetime
import ipynbname

# Report the library versions used for this run (same text as a two-argument print).
print(f"Scanpy version:  {sc.__version__}")
print(f"Pandas version:  {pd.__version__}")

In [None]:
# Custom functions: extend the module search path with the sibling
# HelperFunctions directory so the project-local Helper module can be
# imported under the alias `fn`.
sys.path.append('../HelperFunctions')
import Helper as fn

In [None]:
# Session-wide scanpy settings: logging verbosity level 3 and figures at 120 dpi.
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=120)

In [None]:
# Root data directory and dataset identifier; all file locations derive from them.
path = './DataDir/'
Id = 'Id0003'

# Inputs (count table and cell metadata) and the assembled output object.
input_file = f'{path}{Id}/0_Starting/GSE162170_rna_counts.tsv'
meta_file = f'{path}{Id}/0_Starting/GSE162170_rna_cell_metadata.txt'
output_file = f'{path}{Id}/1_AssembledAdata.h5ad'

In [None]:
# Timestamp at the start of the run; a matching one is printed in the last cell.
print(datetime.now())

In [None]:
# Load the tab-separated count table as float32. The object is transposed
# in a later cell.
adata = sc.read_csv(
    input_file,
    delimiter='\t',
    first_column_names=None,
    dtype='float32',
)

In [None]:
# Store the expression matrix as a sparse CSR matrix if it is not sparse yet.
if not isspmatrix(adata.X):
    adata.X = csr_matrix(adata.X)
    print('Converted adata.X to', type(adata.X))

In [None]:
# Swap the two axes of the AnnData object, then display its summary.
adata = adata.T
adata

In [None]:
# Read the cell metadata table (indexed by 'Cell.ID') and prefix every column
# with 'Auth_' in a single chained expression.
meta = pd.read_csv(meta_file, sep='\t', index_col='Cell.ID').add_prefix('Auth_')
print(meta.shape)
meta.head()

In [None]:
# Check that adata.X contains integers.
# Only the explicitly stored entries of a sparse matrix are inspected: its
# implicit zeros are integers anyway, so no dense copy of the whole matrix
# (.toarray()) has to be allocated for the test.
max_cells = 250000

if adata.shape[0] <= max_cells:
    print('Checking complete adata')
    checked = adata.X
else:
    # Above the threshold the check runs on a reproducible random subsample.
    print ('Checking a subsample of adata')
    checked = sc.pp.subsample(adata, n_obs=max_cells, random_state=0, copy=True).X

# Fall back to the full array when X is dense.
stored = checked.data if isspmatrix(checked) else np.asarray(checked)
if not np.equal(np.mod(stored, 1), 0).all():
    print('CAREFUL: non-integer matrix loaded!')

# Drop the temporary references so a subsampled copy can be freed.
del checked, stored

In [None]:
# Print a small 5x5 window of the matrix (rows/columns 45-49) before the dtype change.
print(adata.X[45:50, 45:50])

In [None]:
# The largest stored value decides whether X is recast to uint16.
maxCount = adata.X.max()
print(maxCount)

if maxCount < 32000:
    # Below the threshold: store the matrix as unsigned 16-bit integers.
    print('Change X type to integer')
    adata.X = adata.X.astype('uint16')
else:
    print('X type not changed')

In [None]:
# Show the same 5x5 window again and confirm the maximum is unchanged
# relative to the value recorded in maxCount.
print(adata.X[45:50, 45:50])
max_after_cast = adata.X.max()
print(max_after_cast)

if max_after_cast != maxCount:
    print('CAREFUL: max count value has changed!')

In [None]:
# Warn when the metadata table and adata.obs do not have the same number of rows.
if len(meta) != len(adata.obs):
    print('CAREFUL: expression matrix and metadata size are not coherent!')
    print(f'Metadata rows: {len(meta)}')

In [None]:
# Left-join the metadata onto adata.obs by index, so every row of adata.obs is
# kept; validate='one_to_one' makes pandas raise if the join keys are not
# unique on both sides.
adata.obs = adata.obs.join(meta, how='left', validate='one_to_one')
# The standalone metadata table is no longer needed once merged.
del meta

In [None]:
# Preview the first three rows of adata.obs after the metadata join.
adata.obs.head(3)

In [None]:
# Query Biomart for the listed human gene attributes, then index the
# resulting table by Ensembl gene id.
biomart_fields = [
    "ensembl_gene_id",
    "external_gene_name",
    "start_position",
    "end_position",
    "chromosome_name",
]
annot = sc.queries.biomart_annotations("hsapiens", biomart_fields)
annot = annot.set_index("ensembl_gene_id")

In [None]:
# Attach the annotation columns to adata.var (left join on the index), then
# print how many rows ended up without an external gene name.
adata.var = adata.var.join(annot, how='left', validate='one_to_one')
print(adata.var['external_gene_name'].isna().sum())
del annot

In [None]:
# Keep the current var index values in an 'ensg' column and use them as a
# fallback wherever the external gene name is missing.
adata.var['ensg'] = adata.var_names.tolist()
adata.var['external_gene_name'] = adata.var['external_gene_name'].fillna(adata.var['ensg'])

In [None]:
# Replace the var index with the external gene name.
adata.var.index = adata.var['external_gene_name']
# The column is now redundant with the index, so drop it in place.
adata.var.drop('external_gene_name', axis = 1, inplace = True)
# Distinct rows can share a gene name; make the variable names unique.
adata.var_names_make_unique()

In [None]:
# Preview the first three rows of adata.var after re-indexing.
adata.var.head(3)

In [None]:
# Standardised annotation columns, either constant for this dataset or
# derived from the prefixed 'Auth_*' metadata columns.
# Reuse the identifier that also builds the file paths instead of repeating
# the literal, so the two cannot drift apart.
adata.obs['dataset_id'] = Id
adata.obs['sample_id'] = adata.obs['Auth_Sample.ID']
adata.obs['brain_region'] = 'cerebral_cortex'
# Both substitutions are literal; regex=False states that explicitly so the
# result does not depend on the pandas default for `regex`.
adata.obs['age'] = adata.obs['Auth_Age'].str.replace('pcw', 'PCW_', regex=False)
adata.obs['stage'] = 'prenatal'
# Batch key = batch + '_' + assay name with spaces removed.
adata.obs['batch_key'] = adata.obs['Auth_Batch'] + '_' + adata.obs['Auth_Assay'].str.replace(' ', '', regex=False)

In [None]:
# Preview adata.obs again, now including the newly added columns.
adata.obs.head(3)

In [None]:
# List every column currently present in adata.obs.
print('Available metadata for each cell: ', adata.obs.columns)

In [None]:
# Number of cells per 'Auth_seurat_clusters' value. The 'cell_label' column is
# only created in the following cell, so reading it here would raise a
# KeyError on a fresh run.
adata.obs['Auth_seurat_clusters'].value_counts()

In [None]:
# Lookup table from cluster id to cell-type label.
Dict = {
    'c0': 'ExN_N5',
    'c1': 'In_CGE',
    'c2': 'ExN_N1',
    'c3': 'In_MGE',
    'c4': 'ExN_N4',
    'c5': 'ExN_N2',
    'c6': 'RG_early',
    'c7': 'ExN_N7',
    'c8': 'CycProg',
    'c9': 'ExN_N3',
    'c10': 'RG_late',
    'c11': 'GliaPg',
    'c12': 'ExN_N6',
    'c13': 'SubPlate',
    'c14': 'IPC',
    'c15': 'ExN_N8',
    'c16': 'Microglia',
    'c17': 'OPC_Oligo',
    'c18': 'tRG',
    'c19': 'Pericytes',
    'c20': 'Endo',
    'c21': 'RBC',
    'c22': 'VLMC',
}

# Translate each cluster id through the table; values without an entry are
# left unchanged by replace().
adata.obs['cell_label'] = adata.obs['Auth_seurat_clusters'].replace(Dict)

In [None]:
# Set the default figure size to 7 x 3.5 inches; this stays in effect for
# later figures too.
plt.rcParams['figure.figsize'] = [7, 3.5]
# Project helper, called on the 'cell_label' column.
# NOTE(review): presumably draws a bar plot of cells per category; confirm in Helper.
fn.metaBarplot(adata, 'cell_label')

In [None]:
# Cell counts for each combination of sample_id and Auth_Tissue.ID.
pd.crosstab(adata.obs['sample_id'], adata.obs['Auth_Tissue.ID'], dropna=False)

In [None]:
# Cell counts for each combination of sample_id and age.
pd.crosstab(adata.obs['sample_id'], adata.obs['age'], dropna=False)

In [None]:
# Project helper, called on the 'sample_id' column.
# NOTE(review): presumably a bar plot of cells per sample; confirm in Helper.
fn.metaBarplot(adata, 'sample_id')

In [None]:
# Cell counts for each combination of Auth_Sample.Type and sample_id.
pd.crosstab(adata.obs['Auth_Sample.Type'], adata.obs['sample_id'], dropna=False)

In [None]:
# Cell counts for each combination of Auth_Assay and sample_id.
pd.crosstab(adata.obs['Auth_Assay'], adata.obs['sample_id'], dropna=False)

In [None]:
# Cell counts for each combination of Auth_Batch and sample_id.
pd.crosstab(adata.obs['Auth_Batch'], adata.obs['sample_id'], dropna=False)

In [None]:
# Cell counts for each combination of Auth_Batch and Auth_Assay, the two
# columns combined into batch_key.
pd.crosstab(adata.obs['Auth_Batch'], adata.obs['Auth_Assay'], dropna=False)

In [None]:
# Project helper, called on the 'batch_key' column.
# NOTE(review): presumably a bar plot of cells per batch key; confirm in Helper.
fn.metaBarplot(adata, 'batch_key')

In [None]:
# Display the summary of the assembled AnnData object before saving.
adata

In [None]:
# Make sure the matrix is sparse before it is written to disk.
if not isspmatrix(adata.X):
    adata.X = csr_matrix(adata.X)
    print('Converted adata.X to', type(adata.X))

In [None]:
# Save the assembled object to output_file with gzip compression.
adata.write(output_file, compression='gzip')

In [None]:
# Timestamp at the end of the run; compare with the one printed at the start.
print(datetime.now())

### References
* https://github.com/BrainOmicsCourse/BrainOmics2024/blob/main/1_Day1/Resources.md
* https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE162170
* https://github.com/BrainOmicsCourse/BrainOmics2024/blob/main/1_Day1/Compiled/0_AssembleAdata.html