## Notebook for the Khaliq 2022 anndata file creation
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Helmholtz Munich

### 12 October 2022

#### Load required packages

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an

#### Setup Cells

In [3]:
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [4]:
# Scanpy logging level - 0: errors, 1: warnings, 2: info, 3: hints
sc.settings.verbosity = 3
# Print the versions of scanpy and its key dependencies (recorded in the cell output)
sc.logging.print_header()
# Default figure appearance: white background at 80 dpi
sc.settings.set_figure_params(facecolor='white', dpi=80)

scanpy==1.6.0 anndata==0.7.4 umap==0.3.10 numpy==1.18.2 scipy==1.4.1 pandas==1.0.4 scikit-learn==0.22.2.post1 statsmodels==0.13.5 python-igraph==0.10.2 louvain==0.7.0 leidenalg==0.8.0


#### Load Data

In [5]:
# Load the raw UMI count matrix from the gzip-compressed CSV.
# The first CSV column is used as the row index.
umi_counts_path = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Khaliq-2022/Raw data/GSE200997_GEO_processed_CRC_10X_raw_UMI_count_matrix.csv.gz'
UMI_counts = pd.read_csv(umi_counts_path, index_col=[0])

In [6]:
# Preview the loaded count matrix: at this point rows are genes and columns are cell barcodes
UMI_counts

Unnamed: 0,B_cac10_AAACCTGAGTCAATAG,B_cac10_AAACCTGCACAGCCCA,B_cac10_AAACCTGCACTTCGAA,B_cac10_AAACCTGGTAATTGGA,B_cac10_AAACCTGGTACGAAAT,B_cac10_AAACCTGGTGAAAGAG,B_cac10_AAACCTGTCACGATGT,B_cac10_AAACCTGTCTGCGACG,B_cac10_AAACGGGAGTTGTCGT,B_cac10_AAACGGGCAGCGTTCG,...,T_cac9_TTTCCTCTCCGCATCT,T_cac9_TTTCCTCTCTATGTGG,T_cac9_TTTGCGCAGGCTAGCA,T_cac9_TTTGCGCTCGCTAGCG,T_cac9_TTTGGTTCAAATACAG,T_cac9_TTTGGTTCAACACGCC,T_cac9_TTTGGTTCATAGACTC,T_cac9_TTTGGTTCATCTATGG,T_cac9_TTTGGTTTCAATCTCT,T_cac9_TTTGTCAGTCCAACTA
AL627309.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AL669831.5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM87B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LINC00115,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM41C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AL354822.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC004556.1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
AC233755.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
AC233755.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Creating the anndata file

In [7]:
# AnnData expects observations (cell barcodes) as rows and variables (genes) as columns,
# so the genes-by-barcodes table is flipped here.
# NOTE: this reassigns UMI_counts to its own transpose, so running the cell a second
# time restores the original genes-by-barcodes orientation - run it exactly once.
UMI_counts = UMI_counts.transpose()

In [8]:
# Preview the transposed matrix: rows are now cell barcodes and columns are genes
UMI_counts

Unnamed: 0,AL627309.1,AL669831.5,FAM87B,LINC00115,FAM41C,AL645608.3,AL645608.1,SAMD11,NOC2L,KLHL17,...,MAFIP,AC011043.1,AL592183.1,AC007325.4,AC007325.2,AL354822.1,AC004556.1,AC233755.2,AC233755.1,AC240274.1
B_cac10_AAACCTGAGTCAATAG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B_cac10_AAACCTGCACAGCCCA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B_cac10_AAACCTGCACTTCGAA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B_cac10_AAACCTGGTAATTGGA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
B_cac10_AAACCTGGTACGAAAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
T_cac9_TTTGGTTCAACACGCC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
T_cac9_TTTGGTTCATAGACTC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
T_cac9_TTTGGTTCATCTATGG,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
T_cac9_TTTGGTTTCAATCTCT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Load the per-cell annotation table from the gzip-compressed CSV.
# The first CSV column (the cell barcode) is used as the row index.
cell_annotation_path = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Khaliq-2022/Raw data/GSE200997_GEO_processed_CRC_10X_cell_annotation.csv.gz'
meta_data = pd.read_csv(cell_annotation_path, index_col=[0])

In [10]:
# Preview the cell annotation table: one row per cell barcode, columns hold sample-level labels
meta_data

Unnamed: 0,samples,Condition,Location,MSI_Status,bulk_prediction,prediction
B_cac10_AAACCTGAGTCAATAG,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGCACAGCCCA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGCACTTCGAA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGGTAATTGGA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGGTACGAAAT,B_cac10,Normal,Left,MSS,,
...,...,...,...,...,...,...
T_cac9_TTTGGTTCAACACGCC,T_cac9,Tumor,Left,MSS,CMS2,CMS4
T_cac9_TTTGGTTCATAGACTC,T_cac9,Tumor,Left,MSS,CMS2,CMS4
T_cac9_TTTGGTTCATCTATGG,T_cac9,Tumor,Left,MSS,CMS2,CMS1
T_cac9_TTTGGTTTCAATCTCT,T_cac9,Tumor,Left,MSS,CMS2,CMS1


In [15]:
# Create the AnnData object: cell barcodes are observations, genes are variables.
# - obs: metadata rows are selected by the barcode index of the count matrix, so the
#   annotation table is guaranteed to be in the same order as the matrix rows; a barcode
#   with no metadata row raises a KeyError here instead of yielding a mismatched object.
# - var: an empty table indexed by gene name, built directly from the column labels
#   instead of transposing the whole count matrix a second time.
khaliq_2022_anndata = an.AnnData(X=UMI_counts,
                        obs=meta_data.loc[UMI_counts.index],
                        var=pd.DataFrame(index=UMI_counts.columns))

In [16]:
# Inspect the expression matrix stored in the AnnData object
khaliq_2022_anndata.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [17]:
# Inspect the per-gene (variable) table; it was built with no columns, so only the gene-name index is shown
khaliq_2022_anndata.var

AL627309.1
AL669831.5
FAM87B
LINC00115
FAM41C
...
AL354822.1
AC004556.1
AC233755.2
AC233755.1
AC240274.1


In [18]:
# Inspect the per-cell (observation) table, populated from meta_data
khaliq_2022_anndata.obs

Unnamed: 0,samples,Condition,Location,MSI_Status,bulk_prediction,prediction
B_cac10_AAACCTGAGTCAATAG,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGCACAGCCCA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGCACTTCGAA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGGTAATTGGA,B_cac10,Normal,Left,MSS,,
B_cac10_AAACCTGGTACGAAAT,B_cac10,Normal,Left,MSS,,
...,...,...,...,...,...,...
T_cac9_TTTGGTTCAACACGCC,T_cac9,Tumor,Left,MSS,CMS2,CMS4
T_cac9_TTTGGTTCATAGACTC,T_cac9,Tumor,Left,MSS,CMS2,CMS4
T_cac9_TTTGGTTCATCTATGG,T_cac9,Tumor,Left,MSS,CMS2,CMS1
T_cac9_TTTGGTTTCAATCTCT,T_cac9,Tumor,Left,MSS,CMS2,CMS1


In [19]:
# Write the assembled AnnData object to disk in .h5ad format
anndata_output_path = '/lustre/groups/talaveralopez/datasets/Colorectal_cancer/Raw_data/Khaliq-2022/Anndata raw/Khaliq-2022_anndata_raw_2_for_scnym.h5ad'
khaliq_2022_anndata.write_h5ad(anndata_output_path)

... storing 'samples' as categorical
... storing 'Condition' as categorical
... storing 'Location' as categorical
... storing 'MSI_Status' as categorical
... storing 'bulk_prediction' as categorical
... storing 'prediction' as categorical
