# Import modules and notebook settings

In [None]:
# Keep modules updated if they change within the session
%load_ext autoreload
%autoreload 2

# Load modules
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import squidpy as sq
import anndata as ad
import scanpy as sc

# Settings
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from numpy/anndata/squidpy.
warnings.filterwarnings('ignore')
#sc.settings.set_figure_params(dpi=80)

# Print versions
sc.logging.print_header()
# squidpy's version is printed separately, next to the scanpy header
print(f"squidpy=={sq.__version__}")

# Create AnnData object

### Current required structure for seamless functionality:
 
 - **Root folder** (e.g. Hartmann-2021)
     - **metadata.xlsx**
     - **experiment_folder/**        *(e.g. scMEP_MIBI_singlecell)*
         - **main_matrix.csv**        nested matrix with columns = ['point', 'cell_id', $markers ... , 'center_rowcoord', 'center_colcoord', 'cell_size', 'donor', 'category', 'Cluster', 'library_id']         (e.g. scMEP_MIBI_singlecell.csv)
         - **image_folder/**        *(e.g. scMEP_MIBI_colon_images)*
             - **donor_folders/**        1 folder per donor/sample; ==obs['donor'] !!!
                 - **point/FOV folders**        1 folder per point/FOV; == 'library_id'; names must not repeat across donors!; requires obs['cell_id', 'center_rowcoord', 'center_colcoord', 'point', 'donor', 'library_id'] *(e.g. Point21, ..., Point32)*
                     - image_files.tif        == markers/features/channels; names can repeat across donors; *(e.g. 147_vimentin.tif)*
         - **segmentation_folder/**        *(e.g. segmentation)*
              - **segmentation_files.tif**        requires obs['cell_id', 'point', 'donor', 'library_id']; *(e.g. Point1_Label_Map.tif)*

### Prepare the data

In [None]:
# Load the single-cell table (all annotations live in this one matrix)
# and discard every row that has a missing value
data_unformatted = pd.read_csv(
    "input-data/raw-data/Hartmann-2021/scMEP_MIBI_singlecell/scMEP_MIBI_singlecell.csv"
).dropna(axis=0)

# Derive the library id from the point number and index each cell
# by "<library_id>_<cell_id>"
data_unformatted['library_id'] = 'Point' + data_unformatted['point'].astype(str)
cell_labels = (
    data_unformatted['library_id'].astype(str)
    + '_'
    + data_unformatted['cell_id'].astype(str)
)
data_unformatted = data_unformatted.set_index(pd.Index(cell_labels.to_numpy()))

# Columns 3..38 (by position) are the values that go into adata.X
counts = data_unformatted.iloc[:, 3:39]

## Due to AnnData bugs (2022-12-05) X has to be a numpy.array or a freshly built pandas.DataFrame
X = pd.DataFrame(
    data=counts.to_numpy(),
    index=list(counts.index),
    columns=list(counts.columns),
)

# Every column outside the count block becomes a per-cell annotation
n_columns = len(data_unformatted.columns)
obs = data_unformatted.iloc[:, np.r_[0:3, 39:n_columns]]

# adata.obsm: spatial location of each cell, ordered as (column, row)
spatial_coords = data_unformatted[['center_colcoord', 'center_rowcoord']]
obsm = {'spatial': spatial_coords.to_numpy()}

# adata.uns: sample metadata, stored as {column name: list of values}
sample_table = pd.read_excel(
    "input-data/raw-data/Hartmann-2021/scMEP_sample_description.xlsx",
)
sample_info = sample_table.to_dict(orient='list')

uns = {
    'sample_info': sample_info,
    # Colors used when plotting obs['Cluster']
    'Cluster_colors': [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728',
        '#9467bd', '#8c564b', '#e377c2', '#7f7f7f',
    ],
}


### Create the object

In [None]:
# Assemble the AnnData object from the pieces prepared above:
# expression matrix (X), per-cell annotations (obs),
# spatial coordinates (obsm) and unstructured metadata (uns)
adata = ad.AnnData(X=X, obs=obs, obsm=obsm, uns=uns)

### View

In [None]:
# Show the expression matrix as a DataFrame
adata.to_df()

In [None]:
# Show the per-cell annotation table
adata.obs

In [None]:
# Show the multi-dimensional annotations (holds the 'spatial' coordinates)
adata.obsm

In [None]:
# List the entries stored as unstructured metadata
adata.uns.keys()

# Integrate image and segmentation mask data

### User input

*Previous to **commit e03e83d38317f2ca379851badd8b0f6896474f94**, the input was based on a single sample folder, containing one folder per point/FOV, containing all the image files (.tiff) of each channel*

In [None]:
# The user must choose exactly 3 marker channels to use as RGB raster format.
# Each entry is an image file name without the ".tif" extension; the order
# given here is the order of the R, G and B planes.
user_input_channels = ['145_CD45', '174_CK', '113_vimentin']

# Root image folder: contains one folder per donor, each of which contains
# one folder per point/FOV. (It used to point at a single donor folder,
# which the donor-level loop below cannot consume.)
image_dir = "input-data/raw-data/Hartmann-2021/scMEP_MIBI_singlecell/scMEP_MIBI_colon_images/"

# Folder holding one segmentation mask (.tif) per point/FOV
segmentation_dir = "input-data/raw-data/Hartmann-2021/scMEP_MIBI_singlecell/segmentation/"

### Algorithm initiation and adding images

 - This algorithm processes both images and segmentation masks separately
 - It also checks that the file names are correctly attributed based on `obs['library_id']`

In [None]:
# Root image folder: one sub-folder per donor, each holding one sub-folder per point/FOV
image_dir = 'input-data/raw-data/Hartmann-2021/scMEP_MIBI_singlecell/scMEP_MIBI_colon_images/'

# An RGB raster needs exactly three planes
if len(user_input_channels) != 3:
    raise ValueError('`user_input_channels` must name exactly 3 channels to build an RGB raster.')

# Initialize the spatial dictionary:
# {library_id: {'images': {'hires': ...}, 'scalefactors': {...}}}
spatial_dict = {}

# Some variables; the sets are built once so membership tests inside the loop are cheap
library_ids = adata.obs['library_id'].unique()
known_library_ids = set(library_ids)
known_donors = set(adata.obs['donor'].to_list())

# Sorted so that processing order (and the first error reported) is deterministic
for donor in sorted(os.listdir(image_dir)):

    donor_path = os.path.join(image_dir, donor)

    # Check if folder structure correct: layer 1
    if not os.path.isdir(donor_path):
        raise ValueError('`image_dir` directory must only contain directories named after the donors.')

    # Check if folder names coincide with donor value in obs matrix
    if donor not in known_donors:
        raise ValueError('First layer folders are not named as in obs[\'donor\']')

    # Define points/FOV
    points = sorted(os.listdir(donor_path))

    # Check that point names are found in the `obs['library_id']`
    if not all(point in known_library_ids for point in points):
        raise ValueError("Image file names must be named equal to library_ids")

    # Add images
    for point in points:

        # Point/FOV names are the dictionary keys, so a name that repeats
        # across donors would silently overwrite the earlier entry
        if point in spatial_dict:
            raise ValueError(
                f"Point/FOV folder '{point}' occurs under more than one donor; "
                "names must not repeat across donors."
            )

        point_path = os.path.join(donor_path, point)

        # Read the three chosen channel images (.tif) and stack them so that
        # the channel axis comes last: (3, rows, cols) -> (rows, cols, 3).
        # Can also be done with scipy or python image library PIL
        channel_arr = np.array(
            [
                plt.imread(os.path.join(point_path, channel + ".tif"))
                for channel in user_input_channels
            ]
        ).transpose(1, 2, 0)

        # Layer 1: 'library_id' = {images, scalefactors}
        # Layer 2: 'images' = {hires, segmentation}; the segmentation is added in a later step
        spatial_dict[point] = {
            'images': {'hires': channel_arr},
            'scalefactors': {
                'spot_diameter_fullres': 15,
                'tissue_hires_scalef': 1,
            },
        }

### Add segmentation data

- This algorithm processes both images and segmentation masks separately
- Checks the validity of the segmentation files by trying to attribute their names to the corresponding image file names by string content.

- In the future maybe integrate two alternative paths:
    - argument indicating a dictionary of file links
    - argument indicating the file name changes/presuffixes that apply to the image file name to arrive at the segmentation file name

In [None]:
# Check if segmentation file names start with a library_id
seg_files = sorted(os.listdir(segmentation_dir))

# Candidate ids, longest first, so that the most specific id wins when
# several ids are prefixes of the same file name
ids_longest_first = sorted(set(library_ids), key=len, reverse=True)

## Initiate segmentation-file-name to library_id dict
dict_seg_lib_id = {}
assigned_ids = set()

for seg_file in seg_files:

    stem = os.path.splitext(seg_file)[0]

    # A library_id matches when the file name starts with it and the next
    # character (if any) is not alphanumeric, so 'Point11_...' can never be
    # attributed to 'Point1'. This replaces a fixed 15-character suffix cut,
    # which mis-assigned files whose suffix had a different length.
    seg_id = next(
        (
            lib_id
            for lib_id in ids_longest_first
            if stem.startswith(lib_id)
            and not stem[len(lib_id):len(lib_id) + 1].isalnum()
        ),
        None,
    )

    if seg_id is None:
        raise Exception(
            f"Segmentation file '{seg_file}' does not start with any library_id "
            "from obs['library_id']"
        )

    # Two masks for the same point/FOV would silently overwrite each other
    if seg_id in assigned_ids:
        raise Exception(
            f"More than one segmentation file found for library_id '{seg_id}'"
        )

    assigned_ids.add(seg_id)
    dict_seg_lib_id[seg_file] = seg_id


# Add segmentation information
for seg_file, lib_id in dict_seg_lib_id.items():

    # Fill 'segmentation' key in layer 2
    spatial_dict[lib_id]['images']['segmentation'] = plt.imread(
        os.path.join(segmentation_dir, seg_file)
    )

    # Add 'tissue_segmentation_scalef' key in layer 1
    spatial_dict[lib_id]['scalefactors']['tissue_segmentation_scalef'] = 1

### Final integration

In [None]:
# Integrate spatial dictionary into adata
# (stored under the 'spatial' key of adata.uns, keyed by library_id)
adata.uns['spatial'] = spatial_dict

In [None]:
# Sanity check on four FOVs; color=[None] means no obs column or marker is selected for coloring
sq.pl.spatial_scatter(adata, color=[None], library_key='library_id', library_id=['Point1', 'Point21', 'Point41', 'Point45'])

# Visualize

In [None]:
# Scatter plot of one FOV, cells colored by their 'Cluster' annotation
sq.pl.spatial_scatter(adata, color=['Cluster'], library_key='library_id', library_id=['Point8'])

### Segmentation mask

In [None]:
# Segmentation masks of two FOVs colored by 'Cluster'; obs['cell_id'] is passed as the mask label column
sq.pl.spatial_segment(adata, library_key='library_id', library_id=['Point8', 'Point16'], seg_cell_id='cell_id', color='Cluster')

### Expression under mask

In [None]:
# Segmentation masks of two FOVs colored by CD45 expression
sq.pl.spatial_segment(
    adata,
    color='CD45', # Immune cell marker (differentiated)
    library_key='library_id',
    library_id=['Point8', 'Point16'],
    seg_cell_id='cell_id',
    cmap='inferno',
    scalebar_dx=2.0, # NOTE(review): presumably the physical size of one pixel; units not visible here — confirm
    scalebar_kwargs={'scale_loc': 'bottom', 'location': 'lower right'}
)

# Expression of some specific markers

In [None]:
# One panel per entry of `color` for FOV 'Point1'; the leading None selects no marker
sq.pl.spatial_scatter(
    adata,
    color=[None,
           'CD45', # Hematopoietic cells (differentiated)
           'GLUT1', # Glucose transporter
           'CD98', # Amino acid transporter, auth: found polarized towards tumour-immune border
           'ASCT2', # -||-
           'PD1', # Immune activation marker
           'CD39', # -||-
           'CK', # Cancer marker
           ],
    library_key='library_id',
    library_id='Point1')

# Create spatial connectivity graph

In [None]:
# Build the spatial connectivity graph with 10 neighbours per cell, using generic (non-grid) coordinates.
# NOTE(review): no library_key is passed, so FOVs are presumably not separated here — confirm that
# coordinates of different FOVs cannot produce cross-FOV neighbours.
sq.gr.spatial_neighbors(adata, n_neighs=10, coord_type='generic')

# Save adata object

In [None]:
# Make sure the output folder exists before writing, so a fresh checkout
# does not fail at the very last step of the notebook
output_path = 'output-data/Hartmann-2021/hartmann.h5ad'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the AnnData object as gzip-compressed .h5ad
adata.write(output_path, compression='gzip')

# NCEM

Output:
 - Coupling analysis: DEGs
 - Coupling analysis circular: DEGs or magnitude
 - Sender effect on receiving cell
 - Sender-receiver volcano plot
 - **Sender similarity analysis**

Not functioning:
 - Grid searches over neighbourhood sizes + Length scales of dependencies for different target cell-types
 - (Squidpy CellPhoneDB)

Outlook:
- Future custom loader


In [None]:
import ncem

### 0. Loading the data with the predefined data loader

In [None]:
# Create the NCEM interaction interpreter and load the dataset through one of ncem's data loaders
interpreter = ncem.interpretation.interpreter.InterpreterInteraction()
interpreter.get_data(
    data_origin='hartmann', # Reference to specific DataLoader (see notes in Dropbox paper)
    data_path='input-data/raw-data/Hartmann-2021/',
    radius=35, # NOTE(review): neighbourhood radius; units not visible here — confirm against the ncem loader
    node_label_space_id='type',
    node_feature_space_id='standard',
)
# Print the metadata entry the loader stored on the cell data object
print('\n Celldata metadata:', interpreter.data.celldata.uns['metadata'])

In [None]:
# node degrees for different radii
interpreter.data.plot_degree_vs_dist(
    max_distances=[0, 10, 50, 200, 1000],
    lateral_resolution=400/1024, # Physical cell resolution; NOTE(review): presumably 400 physical units per 1024 pixels — confirm
)

### 2. Extracting sender-receiver effects with NCEM

In [None]:
# splitting data into test and validation sets, can be ignored for non sender-receiver focused analysis
# NOTE(review): the two 0.1 arguments are presumably the test and validation fractions — confirm against ncem
interpreter.split_data_node(0.1, 0.1)
interpreter.n_eval_nodes_per_graph = 10
# Cell type names are taken from the loader's 'node_type_names' mapping
interpreter.cell_names = list(interpreter.data.celldata.uns['node_type_names'].values())

In [None]:
# Extract the sender-receiver effects; presumably required by all analyses that follow
interpreter.get_sender_receiver_effects()

### Type coupling analysis

Heatmap color is proportional to the number of differentially expressed genes at a FDR-corrected p-value threshold of 0.05 for each pair of sender and receiver cell types. 

In [None]:
# Type coupling heatmap over all sender/receiver cell type pairs
interpreter.type_coupling_analysis(figsize=(6.5, 5.5))

Edge width is proportional to the number of differentially expressed genes at a FDR-corrected p-value threshold of 0.05 for each pair of sender and receiver cell types. Only edges with at least 24 differentially expressed genes are shown.


**Arrow thickness:** strength of directional dependencies between cell types as the magnitude of the corresponding coefficient vector

In [None]:
# Circular coupling plot; edges are drawn by coefficient magnitude and
# filtered with a differentially-expressed-genes threshold of 24
interpreter.type_coupling_analysis_circular(
    edge_attr='magnitude', # IDF: Alternatives should be: ["magnitude", 'de_genes', 'de_genes_abs']
    edge_width_scale=3.5,
    de_genes_threshold=24,
    figsize=(9,9),
    text_space=1.35
)

### IDF: edge thickness = #(DEGs) to relate to the coupling analysis heatmap

In [None]:
# Same circular plot, but edges are drawn by 'de_genes' and the threshold
# is lowered to 1 so it can be compared with the coupling heatmap
interpreter.type_coupling_analysis_circular(
    edge_attr='de_genes',
    edge_width_scale=3.5,
    de_genes_threshold=1,
    figsize=(9,9),
    text_space=1.35
)

### 3. Directional sender effects for Epithelial cells and CD8 T cells

We dissected these couplings based on the gene-wise effects of all senders on one receiver type for CD8T cells and Epithelial cells which contextualizes differential expression results of the CD8T cell–Epithelial cell axis.

NCEM correctly identifies genes and proteins that play roles in T-cell activation, regulation of T-cell antigen receptor signalling and immune response.

### a) Sender effects on CD8 T cells 

“Sender effect” analysis heatmap for CD8 T cells. Shown is the estimated log fold change that the sender cell type on the x-axis induces in the gene on the y axis in receiving CD8 T cells.

In [None]:
# Sender effect heatmap for receiving CD8 T cells, restricted to six selected genes
interpreter.sender_effect(
    receiver='CD8 T cells', 
    gene_subset=['CD8A', 'CD4', 'PTPRC', 'ENTPD1', 'PDCD1', 'CD247'], 
    figsize=(4,5)
)

### b) Sender effects on Epithelial cells

“Sender effect” analysis heatmap for Epithelial cells. Shown is the estimated log fold change that the sender cell type on the y-axis induces in the gene on the x axis in receiving Epithelial cells.

In [None]:
# Sender effect heatmap for receiving Epithelial cells, same gene subset as for CD8 T cells
interpreter.sender_effect(
    receiver='Epithelial', 
    gene_subset=['CD8A', 'CD4', 'PTPRC', 'ENTPD1', 'PDCD1', 'CD247'], 
    figsize=(4,5)
)

### c) Directional effect for sender-receiver axis for CD8 T cells on Epithelial cells 

This analysis contextualizes differential expression results of the CD8 Tcell – Epithelial cell axis.

Volcano plot of differentially expressed genes of Epithelial cells in the neighborhood of CD8 T cells.

In [None]:
# Volcano plot for the CD8 T cells (sender) -> Epithelial (receiver) axis,
# with a fold change threshold of 0.04
interpreter.sender_receiver_effect_vulcanoplot(
    sender='CD8 T cells', 
    receiver='Epithelial',
    fold_change_threshold=0.04,
    figsize=(3,5)
)

In [None]:
# low fold change: the 5 rows with the smallest 'fold change' for this sender/receiver pair
interpreter.sender_receiver_values(
    sender='CD8 T cells', 
    receiver='Epithelial'
).sort_values(by='fold change').head(5)

In [None]:
# high fold change: the 5 rows with the largest 'fold change' for this sender/receiver pair
interpreter.sender_receiver_values(
    sender='CD8 T cells', 
    receiver='Epithelial'
).sort_values(by='fold change', ascending=False).head(5)

### 4. Sender similarity analysis for Epithelial cells

“Sender similarity analysis” of all sender cell types with respect to Epithelial cell receivers. Shown is a clustered heatmap of Pearson correlation coefficients of the coefficient vectors of each sender type that corresponds to Epithelial cell receivers.

T cell clusters and other immune cells had a similar effect on Epithelial cells in a “sender similarity analysis”, in which we correlated the coefficient vectors of sender cell types that correspond to Epithelial cell receivers, which demonstrates conservation of cell type identity in the sender profile.

In [None]:
# Sender similarity analysis with Epithelial cells as the receiver
interpreter.sender_similarity_analysis(receiver='Epithelial')