In [1]:
import anndata
from collections import defaultdict
import copy
import csv
from joblib import Parallel, delayed
from matplotlib import colors
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from matplotlib import style
from mpl_toolkits.mplot3d import Axes3D
import multiprocessing
import networkx as nx
import numba
import numpy as np
import numpy.random as rnd
import os
import pandas as pd
import pickle
import random
from random import choices
import re
import scipy as scp
import scipy.integrate as integrate
from scipy.special import hyp2f1 as hyper
import scipy.stats as stats
from scipy.stats import norm as normal
import scvelo as scv
from scvelo.tools.velocity_embedding import quiver_autoscale,velocity_embedding
import seaborn as sns
from sklearn import preprocessing
from sklearn.cluster import Birch
import sklearn.decomposition as skd
from sklearn.neighbors import NearestNeighbors
import string
import umap
import skbio as sk
scv.settings.verbosity = 0

## Functions

### Utilities

In [2]:
def load_adata(file):
    
    # INPUT
    # file = path to file containing a pickle object
    # OUTPUT
    # AnnData object
    
    with open(file, 'rb') as inF:
        obj = pickle.load(inF)
        
        return(obj)
    
def save_adata(obj, filename):
    
    # IPUT
    # obj = python object
    # filename = path to save object

    with open(filename, 'wb') as output: 
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    
def unique(list1): 
    
    # INPUT
    # list1 = python list
    # OUTPUT:
    # numpy array with unique elements in the list
    
    x = np.array(list1) 
    return(np.unique(x))

### Distances

In [3]:
def rescale(df):
    
    # INPUT
    # df =  distance matrix data frame
    # OUTPUT
    # scaled_df = scaled distance matrix
    
    scaled_df = df
    values = []
    for i in range(0,len(df.index)):
        for j in range(i,len(df.columns)):
            values.append(df.iloc[i,j])

    min_max_scaler = preprocessing.MinMaxScaler()
    x_scaled = min_max_scaler.fit_transform(np.array(values).reshape(-1, 1))

    k=0
    for i in range(0,len(df.index)):
        for j in range(i,len(df.columns)):
            scaled_df.iloc[i,j]=x_scaled[k]
            scaled_df.iloc[j,i]=x_scaled[k]
            k=k+1
    return(scaled_df)

def expression_distance(adata,resc=True,copy=False):
    
    # INPUT
    # adata - AnnData object
    # clustcol - name of the column used to cluster cells 
    # resc - whether to normalize and rescale distances (recommended)
    # copy - whether a copy of the distance matrix should be returned. By default copy=True and adata.uns['expression_distances'] is updated

    clusters = [c for c in adata.obs.dropna()[clustcol].unique() if c!='nan']
    centroids = [np.array(np.mean(adata.layers['spliced'][(adata.obs[clustcol] == c).values,:],axis=0)[:,].tolist()[0]) for c in clusters]


    nc = len(centroids)
    dist = pd.DataFrame(0,index=range(0,nc),columns=range(0,nc),dtype=np.float64)
    for i in range(0,nc):
        for j in range(i,nc):
            dist.at[i,j] = np.linalg.norm(centroids[i] - centroids[j])
            dist.at[j,i] = dist.at[i,j]
            
    if(resc):
        dist = rescale(dist)
        
    dist = sk.DistanceMatrix(dist.values)
    
    if copy==False:
        adata.uns['expression_distances'] = dist
    else:
        return(dist)

def preprocess(adata):
    
    # Compute cluster distances 
    
    expression_distance(adata)
    
    # Remove genes with NaN's in velocity
        
    V = adata.layers["velocity"]
    genes_valid = adata.var[['velocity_genes']].iloc[np.where(np.logical_not(np.isnan(V.sum(axis=0))))[0].tolist(),:].index
    adata_valid = adata[:,genes_valid]
    adata_valid.var['gtype'] = 'Velocity_not_nan'
    
    return(adata)

### Network inference

In [63]:
def rank_genes(V,X,g,n):
    
    # INPUT
    # V = velocity matric for the cells in a specific cluster
    # X = expression matrix for the cells in a specific cluster
    # n = number of genes to choose
    # m = str ranking method. Options:
    #   * absvel = gene ranking based on the mean (across cells) absolute value of velocity 
    #   * topvel = gene ranking based on the mean (across cells) value of velocity (including sign)
    #   * stdvel = gene ranking based on decreasing standard deviation of velocity across cells
    #   * random = gene set selected at random
    #   * stdexp = gene ranking basedon decreasing standard deviation of expression across cells
    #   * highexp = gene ranking basedon decreasing expression level across cells
    # OUTPUT
    # genes = list with genes selected (based on the cluster matrix dimensions) 
    
    if(g == 'absvel'):
        gset = V.dropna().abs().mean(0).sort_values(ascending=False)[0:n].index.tolist()
    elif(g == 'topvel'):
        gset = V.dropna().mean(0).sort_values(ascending=False)[0:n].index.tolist()
    elif(g == 'stdvel'):
        gset = V.dropna().std(0).sort_values(ascending=False).index.tolist()[0:n]  
    elif(g == 'stdexp'):        
        gset = X.dropna().std(0).sort_values(ascending=False).index.tolist()[0:n]
    elif(g == 'highexp'):
        gset = X.dropna().mean(0).sort_values(ascending=False).index.tolist()[0:n]

    return(gset)


def predict_network(adata,cluster,genes,network_size,copy=False):

    ## INPUT 
    
    # clustcol - cluster column used
    # cluster - individual cluster label
    # genes - either a str (vrank,maxstd,topvar,abstop,random) or a list of genes (same as in adata.var.index) 
    # network_size - number of genes used to infer the network
    # copy - whether a copy of the network should be returned. By default copy=True 
    
    # Cluster cells 
    
    ind = adata.obs[clustcol] == cluster
    V = pd.DataFrame(adata.layers["velocity"][ind.values,:],columns=adata.var.index)    
    X = pd.DataFrame(adata.layers['spliced'][ind.values,:].todense(),columns=adata.var.index)

    # Get  genes
    
    if isinstance(genes,str):
        geneset = rank_genes(V,X,genes,network_size)
    else:
        geneset = [g for g in genes if g in X.columns]
        genes = 'manual'
    tag = 'W-' + genes + '-' + str(network_size)
    
    # Infer networks
    
    Xc = X.loc[:,geneset] 
    Xpinv = np.linalg.pinv(Xc) 
    Vc = V.loc[:,geneset] 
    Gf = np.diag(adata.var.fit_gamma.loc[geneset,]) 
    W = np.nan_to_num(np.dot(Xpinv,(Vc + np.dot(Xc,Gf))),nan=0) 
    W = pd.DataFrame(np.array(W,dtype=np.float64),index=geneset,columns=geneset)
    
    if copy==False:
        if tag in adata.uns.keys():
            adata.uns[tag][cluster] = W
        else:
            adata.uns[tag] = defaultdict(pd.DataFrame)
            adata.uns[tag][cluster] = W
    else:
        return(W)

### Gene selection

Functions to run testing of user-specified combinations of gene sets and network sizes. Networks are not stored at this step, only the list of the top `n` combinations (according to the mantel correlation will be stored in adata.uns as `top_gene_sets`.

In [5]:
def select_network_mode(adata,clustcol,genes,network_size):
    
    
    clusts = [c for c in adata.obs.dropna()[clustcol].unique() if c!='nan']
    W_list = [predict_network(adata,clustcol,c,genes,network_size,copy=True) for c in clusts]
    
    # Calculate mantel correlation between distance matrices
    net_dist = cdm_computeDistance(W_list,resc=True,dis_type='euclidean')
    coeff,p_value,_ = sk.stats.distance.mantel(net_dist,adata.uns['cluster_centroid_distances'])
    
    results = [dset_name,method,ngenes,exp_tf,coeff,p_value]
    
    return(results)

### Network perturbations

## Pipeline

#### Set input parameters

In [71]:
global dset,clustcol
dset = 'hFB18'
clustcol = 'labels'

#### Load data

In [72]:
adata_dir = "/Users/larisamorales/Documents/KAUST/scgrn-project/objects/"
adata_path = adata_dir + "scvelo/" + dset + "-adata.pkl"
adata = load_adata(adata_path)

#### Preprocessing

- Compute cluster expression distances
- Remove genes with NaN's in velocity

In [69]:
adata = preprocess(adata)
clusters = set(adata.obs[clustcol])

Trying to set attribute `.var` of view, copying.


#### Infer networks

In [70]:
for c in clusters:
    predict_network(adata,cluster=c,genes='highexp',network_size=100,copy=False)