# Following [SpaGE Tutorial](https://github.com/tabdelaal/SpaGE/blob/master/SpaGE_Tutorial.ipynb)

Integration of *osmFISH* spatial data with the *AllenSSp* scRNA-seq data

In [1]:
import numpy as np
import pandas as pd
import loompy
import matplotlib.pyplot as plt

import scipy
import scipy.stats as st
from scipy import linalg
from scipy import sparse as sp
import pickle
from sklearn.metrics.pairwise import euclidean_distances

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Root folder of the SpaGE datasets. Other cells build file names with
# `path + '<relative name>'`, so the trailing slash must be kept.
path = '/Volumes/LaCie/school/combine_lab/SpaGE/SpaGE_Datasets/'

Load and preprocess the AllenSSp dataset

In [None]:
# Read the AllenSSp exon count matrix; the first CSV column supplies the row
# labels and the first line supplies the column labels.
RNA_data = pd.read_csv(
    path + 'scRNAseq/Allen_SSp/SSp_exons_matrix.csv',
    sep=',',
    header=0,
    index_col=0,
)

# Filter lowly expressed genes: count, for every row (gene), the number of
# columns (cells) with a non-zero value and keep rows seen in at least 10 cells.
Genes_count = (RNA_data > 0).sum(axis=1)
RNA_data = RNA_data[Genes_count >= 10]
del Genes_count

Randomly select one gene to hold out for testing at the very end (cross-validation will be performed on the training set)

In [None]:
# Draw a single row (gene) with a fixed seed so the hold-out is reproducible.
test_gene = RNA_data.sample(n=1, axis=0, random_state=42)
# Everything except the held-out row forms the training matrix.
RNA_data_train = RNA_data.drop(index=test_gene.index)

# TODO: k-fold CV and LOOCV
see: https://www.thepythoncode.com/article/kfold-cross-validation-using-sklearn-in-python

In [None]:
# Normalization
def Log_Norm_cpm(x):
    """Return log(CPM + 1) for one vector of counts.

    The counts in ``x`` are divided by their total, scaled by 10^6
    (counts per million) and transformed with log(value + 1).
    """
    total = np.sum(x)
    cpm = (x / total) * 1000000
    return np.log(cpm + 1)

# Normalize the training matrix column by column: with axis=0 each column
# (one cell) is passed to Log_Norm_cpm as a Series.
RNA_ref_norm = RNA_data_train.apply(Log_Norm_cpm,axis=0)

In [None]:
# standardize gene expression
# st.zscore works along axis 0 by default and uses the population standard
# deviation (ddof=0).
# NOTE(review): RNA_ref_norm has genes as rows here, so axis 0 standardizes
# within each column (cell) across genes -- confirm this is intended rather
# than a per-gene z-score.
RNA_ref = st.zscore(RNA_ref_norm)

# Training statistics computed with the same estimator as st.zscore (ddof=0)
# so the held-out gene is put on exactly the same scale as RNA_ref.
# (The previous st.tstd call used ddof=1, and scipy.mean is a deprecated
# NumPy alias that is not available in current SciPy releases.)
training_mean = RNA_ref_norm.mean(axis=0)
training_std = RNA_ref_norm.std(axis=0, ddof=0)

# normalize test set using training mean and std
# NOTE(review): test_gene was sampled from the raw count matrix and has not
# been passed through Log_Norm_cpm, while the statistics come from the
# log-normalized training data -- confirm whether it should be normalized first.
RNA_test = (test_gene - training_mean) / training_std

# import spatial data

In [None]:
# Read everything needed from the loom file inside a context manager so the
# underlying file handle is closed as soon as the data is in memory.
with loompy.connect(path + 'Spatial/osmFISH/osmFISH_SScortex_mouse_all_cells.loom') as ds:
    FISH_Genes = ds.ra['Gene']
    # One DataFrame column per loom column attribute (per-cell metadata).
    df = pd.DataFrame({attr: ds.ca[attr] for attr in ds.ca.keys()})
    # Full expression matrix, rows follow FISH_Genes, columns follow df rows.
    osmFISH_data = ds[:, :]

# Keep only the cells flagged Valid == 1, in both the metadata and the matrix.
valid_idx = np.where(df.Valid == 1)[0]
osmFISH_meta = df.iloc[valid_idx, :]
osmFISH_data = pd.DataFrame(data=osmFISH_data[:, valid_idx], index=FISH_Genes)

del ds, df, FISH_Genes, valid_idx

# Select cortical regions only to match the AllenSSp dataset
Cortex_Regions = ['Layer 2-3 lateral', 'Layer 2-3 medial', 'Layer 3-4',
                  'Layer 4', 'Layer 5', 'Layer 6', 'Pia Layer 1']
# Boolean mask, one entry per remaining cell. (np.stack over a generator is
# rejected by current NumPy, so the mask is built with Series.isin instead.)
Cortical = osmFISH_meta.Region.isin(Cortex_Regions).to_numpy()

osmFISH_meta = osmFISH_meta.iloc[Cortical, :]
osmFISH_data = osmFISH_data.iloc[:, Cortical]
del Cortex_Regions, Cortical

In [None]:
# Display the per-cell metadata table after the Valid and cortical-region filters.
osmFISH_meta

In [None]:
# Normalization
# Total counts per column (cell) of the spatial matrix.
cell_count = np.sum(osmFISH_data, axis=0)
# The scaling factor is the same for every cell, so compute it once instead of
# once per column inside the function.
median_cell_count = np.median(cell_count)


def Log_Norm_spatial(x):
    """Return log(scaled + 1) for one vector of spatial counts.

    ``x`` is divided by its own total, multiplied by the median of
    ``cell_count`` (precomputed as ``median_cell_count``) and transformed
    with log(value + 1).
    """
    return np.log(((x / np.sum(x)) * median_cell_count) + 1)


# axis=0 passes each column (one cell) to the function.
osmFISH_data_norm = osmFISH_data.apply(Log_Norm_spatial, axis=0)

In [None]:
# scale: z-score the normalized spatial matrix along axis 0, which is SciPy's
# default axis, written out here so the direction is visible.
# NOTE(review): with genes as rows this standardizes within each cell across
# genes -- confirm this is the intended direction.
ST = st.zscore(osmFISH_data_norm, axis=0)
ST

Import saved Adjacency matrices:

In [40]:
import os

# NOTE: pickle.load can execute arbitrary code; only read pickle files that
# were produced by a trusted source.
A = []
# read in all pickle files
# os.listdir returns names in arbitrary order; sort them so that the list
# positions used by later cells (e.g. adj_mat[4]) are reproducible.
pickle_files = sorted(name for name in os.listdir(path) if name.endswith('pickle'))
for i, file in enumerate(pickle_files):
    print(i, file)
    with open(path + file, 'rb') as handle:
        # Keep loading objects from the same file until it is exhausted.
        while True:
            try:
                A.append(pickle.load(handle))
            except EOFError:
                break

# uncompress Compressed Sparse Row matrix
adj_mat = [sparse_mat.todense() for sparse_mat in A]

# get number of counts for all connected cells of cell i:
# one list per matrix holding the number of non-zero entries in each row.
sum_neighb_cell = [
    np.count_nonzero(np.asarray(dense_mat), axis=1).tolist()
    for dense_mat in adj_mat
]

# get average cell neighbourhood by threshold
# Divide by the row count of the matrix being summarized (the previous code
# always divided by the row count of the first matrix).
for i, (dense_mat, row_counts) in enumerate(zip(adj_mat, sum_neighb_cell)):
    avg_cell_neib = sum(row_counts) / len(dense_mat)
    print('average cell neighborhood for file', i,':', avg_cell_neib)

0 adj_mat_crs_100.pickle
1 adj_mat_crs_140.pickle
2 adj_mat_crs_180.pickle
3 adj_mat_crs_210.pickle
4 adj_mat_crs_220.pickle
5 adj_mat_crs_260.pickle
6 adj_mat_crs_300.pickle
7 adj_mat_crs_340.pickle
8 adj_mat_crs_380.pickle
9 adj_mat_crs_400.pickle
average cell neighborhood for file 0 : 0.15036710719530103
average cell neighborhood for file 1 : 0.34772393538913365
average cell neighborhood for file 2 : 0.6807635829662262
average cell neighborhood for file 3 : 0.9985315712187959
average cell neighborhood for file 4 : 1.1236417033773862
average cell neighborhood for file 5 : 1.7527165932452275
average cell neighborhood for file 6 : 2.4176211453744494
average cell neighborhood for file 7 : 3.2052863436123347
average cell neighborhood for file 8 : 4.108076358296622
average cell neighborhood for file 9 : 4.59265785609398


GCNG [supp](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-020-02214-w#Sec16)

seqFISH+ avg cell neighbourhood for different distance thresholds:
- 100: 1.2
- 140: 2.5 <-- threshold with the best validation performance
- 180: 4.0
- 220: 5.8

MERFISH:
- 130: 1.03 <-- threshold with best validation performance
- 150: 1.52
- 170: 2.05
- 200: 2.95
- 240: 4.25

Try thresholds:
- 220: 1.12
- 300: 2.42

In [41]:
# Pick the adjacency matrices for the two thresholds listed under "Try thresholds".
# NOTE(review): in the file listing printed by the loading cell, index 4 is
# adj_mat_crs_220.pickle, so A200 holds the 220 matrix despite its name --
# confirm the name is intentional.
A200 = adj_mat[4]
# Index 6 is adj_mat_crs_300.pickle in that same listing.
A300 = adj_mat[6]