# Setup

In [1]:
%%capture
import scanpy as sc
import matplotlib.pyplot as plt
import warnings
import os
import scvi
import pandas as pd
import anndata as ad
import seaborn as sns
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from matplotlib.colors import LinearSegmentedColormap

In [2]:
# Global plotting defaults: small square figures on a white background without
# axis frames, rendered at 80 dpi on screen and saved at 300 dpi.
sc.set_figure_params(
    figsize=(4, 4),
    dpi=80,
    dpi_save=300,
    facecolor="white",
    frameon=False,
)
# Apply seaborn's "paper" context after scanpy's defaults.
sns.set_context("paper")
# Turn axis grid lines off for every subsequent figure.
plt.rcParams["axes.grid"] = False
# Silence FutureWarning messages for the rest of the session.
warnings.simplefilter("ignore", category=FutureWarning)

In [3]:
# Display the working directory; the relative signature path used when
# importing data resolves against it.
os.getcwd()

'/home/joe/Repositories/M-cells/code/WT_AireKO_Fezf2_thymus'

## Setup data paths

### demeter

In [4]:
# Root directories for raw and processed data in the "demeter" setup named in
# the heading above. Both values end with a trailing slash.
rawDataPath = "/mnt/iacchus/joe/raw_data/"
processedDataPath = "/mnt/iacchus/joe/processed_data/"

### gardner-lab-computer

In [5]:
# rawDataPath = "/mnt/e/Archive/Joe/raw_data/"
# processedDataPath = "/mnt/e/Archive/Joe/processed_data/"

## Import data

In [6]:
# Load the combined dataset (the file name suggests WT, Fezf2 KO and Aire KO
# samples with all genes kept -- confirm against the preprocessing notebook).
# os.path.join is used so the result does not contain a doubled separator when
# processedDataPath already ends with "/".
adata = sc.read(
    os.path.join(
        processedDataPath,
        "M_cell",
        "h5ad_files",
        "combined_WT_fezf2KO_aireKO_all_genes.h5ad",
    )
)
# Signature table: first CSV column becomes the index, first row the header.
signatures = pd.read_csv(
    "../../analysis/cell_type_signatures/merged_epithelium_fine_skin_signatures.csv",
    index_col=0,
    header=0,
)

## Methods

### create signature matrix

In [7]:
def create_signature_matrix(adata, obs_key, save_path):
    """Build a gene x cell-type matrix of mean expression and save it as CSV.

    For every distinct value of ``adata.obs[obs_key]`` the cells carrying that
    value are selected and the per-gene mean of ``.X`` is computed.

    Parameters
    ----------
    adata
        Annotated data object; ``.X`` may be a sparse matrix or a dense array.
    obs_key
        Column of ``adata.obs`` holding the cell-type label of each cell.
    save_path
        Destination passed to ``DataFrame.to_csv``.

    Returns
    -------
    pandas.DataFrame
        Rows are ``adata.var_names``, one column per cell type, in order of
        first appearance in ``adata.obs[obs_key]``.
    """
    labels = adata.obs[obs_key]
    per_type_means = {}
    for cell_type in labels.unique():
        block = adata[labels == cell_type].X
        # Sparse matrices expose ``toarray``; dense arrays are used directly.
        if hasattr(block, "toarray"):
            block = block.toarray()
        block = np.asarray(block)
        # The mean is taken through pandas, as before, so values are unchanged.
        per_type_means[cell_type] = (
            pd.DataFrame(block).mean(axis=0).to_numpy(dtype="float64")
        )
    # Build the frame in one go instead of inserting column by column, which
    # fragments the DataFrame when there are many cell types.
    signatures = pd.DataFrame(per_type_means, index=adata.var_names)
    signatures.to_csv(save_path)
    return signatures

### cosine similarity

In [8]:
def _normalize_columns(frame):
    # Divide every column by its sum; columns summing to zero are left as
    # zeros instead of turning into NaN (0 / 0).
    totals = frame.sum(axis=0)
    return frame / totals.mask(totals == 0, 1)


# adata must have log normalized counts in its raw attribute
# signatures must be a gene (row) by cell type (column) dataframe with average log normalized counts
def cell_type_cosine_similarity(adata, signatures, scale=True, label="cosine_similarity"):
    """Score every cell against each cell-type signature by cosine similarity.

    Parameters
    ----------
    adata
        Annotated data object whose ``raw`` attribute holds log-normalized
        counts (sparse or dense). Only genes present in ``adata.var_names``
        are considered.
    signatures
        Gene (rows) x cell type (columns) DataFrame of average log-normalized
        counts. Only genes shared with ``adata`` are used.
    scale
        If True, standardize the scores of each cell across cell types (zero
        mean, unit variance per cell) before storing them.
    label
        Suffix for the new columns, which are named ``"<cell type>_<label>"``.

    Returns
    -------
    The same ``adata`` object, with one score column per signature added to
    ``adata.obs``. Columns left by an earlier call with the same ``label`` are
    replaced rather than causing a join error.

    Notes
    -----
    The number of shared genes is printed. Cells or signatures with no
    expression over the shared genes are kept as all-zero vectors, so they no
    longer produce NaN input for the similarity computation.
    """
    # Raw attribute contains the log-normalized counts
    raw_counts = adata.raw.X
    if hasattr(raw_counts, "toarray"):
        # Sparse storage: densify. Dense arrays are used as they are.
        raw_counts = raw_counts.toarray()
    # Transpose to genes (rows) x cells (columns) so it can be joined on genes.
    cell_data = pd.DataFrame(
        np.asarray(raw_counts),
        columns=adata.raw.var_names,
        index=adata.obs.index,
    ).T
    cell_data = cell_data[cell_data.index.isin(adata.var_names)]
    # Normalize per cell expression to sum to 1
    cell_data = _normalize_columns(cell_data)
    # Join with gene x cell type signature matrix so that only shared genes are kept
    all_df = cell_data.join(signatures, how="inner")
    # Renormalize over the shared genes (cells and signatures alike)
    all_df = _normalize_columns(all_df)
    print(f"Number of genes used for cosine similarity calculations: {len(all_df)}")

    # The join keeps the cell columns first, followed by the signature columns.
    n_cells = cell_data.shape[1]
    cell_columns = all_df.columns[:n_cells]
    type_columns = all_df.columns[n_cells:]

    # Calculate cosine similarity of single cells to each cell type
    sim = cosine_similarity(
        all_df[cell_columns].values.T,  # Single-cell expression data
        all_df[type_columns].values.T,  # Cell type gene signature expression data
    )

    # Per cell similarity scores for each cell type (cells x cell types)
    scores = pd.DataFrame(sim, columns=type_columns, index=cell_columns)
    if scale:
        # StandardScaler works column-wise, so transpose to put cells in
        # columns: each cell's scores are z-scored across cell types.
        standardized = StandardScaler().fit_transform(scores.T).T
        scores = pd.DataFrame(standardized, index=scores.index, columns=scores.columns)

    # Add cosine similarity scores to adata obs
    scores.columns = scores.columns + "_" + label
    # Drop columns from a previous run so re-executing the cell does not fail
    # on overlapping column names.
    adata.obs = adata.obs.drop(columns=scores.columns, errors="ignore").join(scores)

    return adata

# Cosine similarity

## epithelial signatures

In [9]:
# Score each cell against every signature column; one
# "<cell type>_epithelial_similarity" column per signature is added to
# adata.obs (standardized per cell, since scale defaults to True).
adata = cell_type_cosine_similarity(adata, signatures, label="epithelial_similarity")

Number of genes used for cosine similarity calculations: 13185


In [10]:
# List the obs columns whose name contains the label used above, i.e. the
# similarity scores that were just added.
adata.obs.columns[adata.obs.columns.str.contains("epithelial_similarity")]

Index(['IFE.C_skin.fine_epithelial_similarity',
       'IFE.B_skin.fine_epithelial_similarity',
       'IFE.SB1_skin.fine_epithelial_similarity',
       'IFE.SB2_skin.fine_epithelial_similarity',
       'uHF.B_skin.fine_epithelial_similarity',
       'uHF.SB_skin.fine_epithelial_similarity',
       'SG_skin.fine_epithelial_similarity',
       'OB_skin.fine_epithelial_similarity',
       'HG_skin.fine_epithelial_similarity',
       'ORS.SB_skin.fine_epithelial_similarity',
       'mCP_skin.fine_epithelial_similarity',
       'uCP_skin.fine_epithelial_similarity',
       'ORS.B1_skin.fine_epithelial_similarity',
       'ORS.B2_skin.fine_epithelial_similarity',
       'LPC_skin.fine_epithelial_similarity',
       'GL4_skin.fine_epithelial_similarity',
       'GL3_skin.fine_epithelial_similarity',
       'GL2_skin.fine_epithelial_similarity',
       'GL1_skin.fine_epithelial_similarity',
       'MED1_skin.fine_epithelial_similarity',
       'MED2_skin.fine_epithelial_similarity',
       'M