# Following [SpaGE Tutorial](https://github.com/tabdelaal/SpaGE/blob/master/SpaGE_Tutorial.ipynb)

Integration of *osmFISH* spatial data with the *AllenSSp* scRNA-seq data

In [2]:
import numpy as np
import pandas as pd
import loompy
import matplotlib.pyplot as plt

import scipy
import scipy.stats as st
from scipy import linalg
from scipy import sparse as sp
import pickle
from sklearn.metrics.pairwise import euclidean_distances

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Root directory of the SpaGE datasets. File names are appended to it below by
# plain string concatenation, so the trailing slash is required.
# NOTE: machine-specific absolute path -- adjust before running elsewhere.
path = '/Volumes/LaCie/school/combine_lab/SpaGE/SpaGE_Datasets/'

Load and preprocess the AllenSSp dataset

In [4]:
# Read the AllenSSp exon count matrix: the first row is the header and the
# first column becomes the row index.
RNA_data = pd.read_csv(
    path + 'scRNAseq/Allen_SSp/SSp_exons_matrix.csv',
    sep=',',
    header=0,
    index_col=0,
)

# Filter lowly expressed genes: for every row (gene) count the columns (cells)
# with a non-zero value and keep only the rows seen in at least 10 of them.
Genes_count = (RNA_data > 0).sum(axis=1)
RNA_data = RNA_data.loc[Genes_count.ge(10), :]
del Genes_count

Randomly select one gene to hold out for testing at the very end (cross-validation will be performed on the training set)

In [5]:
# Hold out a single randomly drawn row (gene); the fixed seed keeps the draw
# reproducible between runs.
test_gene = RNA_data.sample(n=1, axis=0, random_state=42)

# Everything except the held-out row forms the training data.
RNA_data_train = RNA_data.drop(index=test_gene.index)

# TODO: k-fold cross-validation (CV) and leave-one-out cross-validation (LOOCV)
See: https://www.thepythoncode.com/article/kfold-cross-validation-using-sklearn-in-python

In [6]:
# Normalization
def Log_Norm_cpm(x):
    """Return the log-transformed counts-per-million of ``x``.

    Every value is divided by the sum of ``x``, scaled by 10^6 and then
    transformed with ``log(value + 1)``.  Applied column by column to a count
    table, this divides the counts within each cell by the total number of
    transcripts within that cell.
    """
    total_counts = np.sum(x)
    counts_per_million = (x / total_counts) * 1000000
    return np.log(counts_per_million + 1)

# Log-CPM normalize the training matrix; axis=0 hands each column to
# Log_Norm_cpm in turn.
RNA_ref_norm = RNA_data_train.apply(func=Log_Norm_cpm, axis=0)

In [7]:
# Standardize gene expression.
# st.zscore works along axis 0 by default (down each column of RNA_ref_norm)
# and uses the population standard deviation (ddof=0).
# NOTE(review): if rows are genes and columns are cells, this standardizes each
# cell across genes -- confirm that per-gene standardization was not intended.
RNA_ref = st.zscore(RNA_ref_norm)

# Keep the training statistics so the held-out gene can be scaled the same way
# as the training data.
# - np.mean replaces scipy.mean, a deprecated re-export of numpy.mean that
#   newer SciPy releases may no longer provide.
# - ddof=0 matches st.zscore above; st.tstd defaults to ddof=1 and would scale
#   the test gene with a slightly different standard deviation.
training_mean = np.mean(RNA_ref_norm, axis=0)
training_std = np.std(RNA_ref_norm, axis=0, ddof=0)

# Normalize the test set using the training mean and std.
# NOTE(review): test_gene was sampled before Log_Norm_cpm was applied, so it
# appears to hold raw counts while the statistics come from log-CPM values --
# confirm whether it should be normalized first.
RNA_test = (test_gene - training_mean) / training_std

# Import the adjacency matrix

In [11]:
# Load the adjacency matrix from the pickle file.
# The file may contain several objects pickled one after another; each one is
# read in turn and only the LAST object is kept.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
A = None  # reset so a stale value from an earlier notebook run is never reused
with open(path+'symmnorm_weighted_adj_mat_crs.pickle', 'rb') as filepath:
    while True:
        try:
            A = pickle.load(filepath)
        except EOFError:
            # End of file reached: A now holds the last object that was read.
            break

# Fail with a clear message instead of a NameError/AttributeError further down
# when the file contained no pickled object at all.
if A is None:
    raise ValueError('no pickled object found in symmnorm_weighted_adj_mat_crs.pickle')

# Expand the loaded sparse matrix (presumably Compressed Sparse Row) into a
# dense matrix and display it as the cell output.
adj = A.todense()
adj

matrix([[0.0000000e+00, 2.5932720e-09, 1.7135731e-07, ..., 2.1749336e-09,
         3.4555194e-09, 1.3278232e-09],
        [2.5932720e-09, 0.0000000e+00, 3.1704059e-09, ..., 6.5428489e-09,
         8.7725800e-09, 4.2454338e-09],
        [1.7135731e-07, 3.1704059e-09, 0.0000000e+00, ..., 2.3116113e-09,
         4.6899338e-09, 1.4021613e-09],
        ...,
        [2.1749336e-09, 6.5428489e-09, 2.3116113e-09, ..., 0.0000000e+00,
         2.1241020e-09, 2.7423431e-08],
        [3.4555194e-09, 8.7725800e-09, 4.6899338e-09, ..., 2.1241020e-09,
         0.0000000e+00, 1.5101378e-09],
        [1.3278232e-09, 4.2454338e-09, 1.4021613e-09, ..., 2.7423431e-08,
         1.5101378e-09, 0.0000000e+00]], dtype=float32)

# VGAE

Import the ST (spatial transcriptomics) gene expression data

In [12]:
# Load the ST gene expression array stored in ST_gene_express.npy under the
# dataset root, then display it as the cell output.
gene_exp = np.load(file=path + 'ST_gene_express.npy')
gene_exp

array([[ 1.62137127,  1.77740124, -1.28549759, ..., -1.28549759,
        -0.23703214, -0.41726383],
       [ 2.72355252,  2.08806965,  0.16208098, ..., -0.95458504,
         0.01794291, -0.60954469],
       [ 2.45937972,  2.28947692,  1.11608695, ..., -0.95161135,
        -0.46612482, -0.29391208],
       ...,
       [ 0.23206104, -0.42366561, -1.34240285, ..., -1.34240285,
         0.23206104,  1.8484334 ],
       [ 0.63252484, -1.04108455, -1.04108455, ..., -1.04108455,
         0.63252484,  1.38918995],
       [ 0.64703313, -1.14930012, -1.14930012, ..., -1.14930012,
        -1.14930012,  1.58474506]])