# standard NMF with consensus clustering

**NMF_indNeuro environment**

# <span style="color:red">standard NMF</span> - kernel: cnmf_env

# <span style="color:red">vRG_to_IPC - 4k Polioudakis 2019</span>

### Loading pcw 16 data

In [None]:
import scanpy as sc

import pandas as pd

import seaborn as sns

import numpy as np

import matplotlib.pyplot as plt

# visualization settings
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 600

## cNMF

In [None]:
# (inline plotting is also enabled in the visualization-settings cell above)
%matplotlib inline

import os
from scipy.io import mmread

import scipy.sparse as sp
from IPython.display import Image
from cnmf import cNMF

# NOTE(review): mmread, sp and Image are not referenced in the visible part of
# this notebook — confirm they are still needed.

# Seed NumPy's global RNG. The cNMF run is given its own `seed` argument,
# which is set to the same value (14) in the configuration cell below.
np.random.seed(14)

In [None]:
!mkdir /home/jovyan/jm_jlab/data_indNeuro/3.NMF_data/standardNMF_polioudakis19_vRGtoIPC_4k

In [None]:
numiter = 750  # Number of NMF replicates. Recommended ~200 for real data
numhvgenes = 4000  # Number of high-variance genes passed to cNMF

# Input counts (AnnData .h5ad) used for the factorization
countfn = '/home/jovyan/jm_jlab/data_indNeuro/2.NMF/polioudakis19_vRGtoIPC_4k.h5ad'

## Results will be saved to [output_directory]/[run_name]
output_directory = '/home/jovyan/jm_jlab/data_indNeuro/3.NMF_data/standardNMF_polioudakis19_vRGtoIPC_4k'
# makedirs(exist_ok=True) creates missing parent directories and does not fail
# when the directory already exists (no check-then-create race).
os.makedirs(output_directory, exist_ok=True)
run_name = 'standardNMF_polioudakis19_vRGtoIPC_4k'

seed = 14  ## Seed for pseudorandom number generation, for reproducibility

In [None]:
## Initialize the cNMF object with the output directory and run name
## configured above.
cnmf_obj = cNMF(output_dir=output_directory, name=run_name)

In [None]:
# Prepare the run on the counts file for every K in np.arange(3,7), i.e.
# K = 3, 4, 5 and 6, with `numiter` replicates, `numhvgenes` high-variance
# genes and the fixed seed.
cnmf_obj.prepare(counts_fn=countfn, components=np.arange(3,7), n_iter=numiter, seed=seed, num_highvar_genes=numhvgenes)

In [None]:
# Specify that the jobs are being distributed over a single worker (total_workers=1) and then launch that worker
cnmf_obj.factorize(worker_i=0, total_workers=1)

In [None]:
# Combine the factorization outputs.
# NOTE(review): skip_missing_files=True presumably tolerates missing replicate
# files instead of raising — confirm that no replicates were silently dropped.
cnmf_obj.combine(skip_missing_files=True)

In [None]:
# Draw the K-selection plot. close_fig=False is passed, presumably so the
# figure stays open for inline display — TODO confirm.
cnmf_obj.k_selection_plot(close_fig=False)

In [None]:
# Print the path registered on the cNMF object for the K-selection figure.
print('This saves the corresponding figure to the following file: %s' % cnmf_obj.paths['k_selection_plot'])

In [None]:
# Number of components used for the consensus step; it lies within the K
# range prepared above (3-6).
selected_K = 4
# NOTE(review): the effect of a threshold of 2.00 is not visible from this
# notebook; check the cNMF documentation for whether any replicate spectra are
# filtered at this value — TODO confirm it is the intended final setting.
density_threshold = 2.00

In [None]:
# Compute the consensus solution for the chosen K and density threshold.
# The same (K, density_threshold) pair is passed to load_results() further down.
cnmf_obj.consensus(k=selected_K, 
                   density_threshold=density_threshold, 
                   show_clustering=True, 
                   close_clustergram_fig=False, refit_usage=False)

In [None]:
# Read the same counts file that was given to cNMF.
adata = sc.read(countfn)

# Expose the PC_1 / PC_2 columns stored in .obs as a two-column float array
# under obsm['X_pca'] (one row per observation).
adata.obsm['X_pca'] = np.column_stack(
    (adata.obs['PC_1'], adata.obs['PC_2'])
).astype(np.float64)

In [None]:
# Load high variance genes that used for cNMF
hvgs = open(output_directory+"/"+run_name+'/standardNMF_polioudakis19_vRGtoIPC_4k.overdispersed_genes.txt').read().split('\n')

In [None]:
len(hvgs)

In [None]:
usage_norm, gep_scores, gep_tpm, topgenes = cnmf_obj.load_results(K=selected_K, density_threshold=density_threshold)
usage_norm.columns = ['Usage_%d' % i for i in usage_norm.columns]

In [None]:
print("Number of HVGs used: %d" %len(hvgs))
print("Number of genes per module: %d " %topgenes.shape[0])

In [None]:
sc.set_figure_params(figsize=(12,6))

# Heatmap with one row per usage column; x tick labels are suppressed.
sns.heatmap(usage_norm.T, cmap='mako', xticklabels=False, yticklabels=True)

In [None]:
#For pseudotemporal ordering in heatmap
usage_norm.rename(columns={'Usage_1':"Module_4", 'Usage_2':"Module_2", 'Usage_3':"Module_1", 'Usage_4':"Module_3"}, inplace=True)
topgenes.rename(columns={1:"Module_4", 2:"Module_2", 3:"Module_1", 4:"Module_3"}, inplace=True)

In [None]:
usage_norm = usage_norm[['Module_1','Module_2','Module_3','Module_4']]
topgenes = topgenes[['Module_1','Module_2','Module_3','Module_4']]

In [None]:
sc.set_figure_params(figsize=(12,6))

# Heatmap of the renamed/reordered usage table (one row per module).
pl1 = sns.heatmap(usage_norm.T, cmap='mako', xticklabels=False, yticklabels=True)
pl1.set_xlabel("pseudotime")

# Save the heatmap into the run's output folder.
figure = pl1.get_figure()
figure.savefig(f"{output_directory}/{run_name}/modules_to_IPC_4K_standardNMF.png", dpi=400)

**PCA plots**

In [None]:
usage_norm.index = np.arange(0, usage_norm.shape[0])

In [None]:
adata.obs.index = np.arange(0, adata.obs.shape[0])

In [None]:
adata.obs = pd.merge(left=adata.obs, right=usage_norm, how='left', left_index=True, right_index=True)

In [None]:
sc.set_figure_params(figsize=(4,4))

# PCA scatter coloured by each module-usage column (the columns merged into
# adata.obs above), laid out with ncols=1.
sc.pl.pca(adata, color=usage_norm.columns, ncols=1)

In [None]:
# Point scanpy's figure directory at the run's output folder.
sc.settings.figdir = output_directory+"/"+run_name

In [None]:
sc.set_figure_params(figsize=(4,4))

# Same plot as above, this time with `save` set so it is also written to disk.
# NOTE(review): scanpy appears to prepend the plot type to the `save` string,
# so the file may end up named "pcamodules_to_IPC_4K_standardNMF.png" —
# confirm the resulting file name in figdir.
sc.pl.pca(adata, color=usage_norm.columns, ncols=1, save='modules_to_IPC_4K_standardNMF.png')

In [None]:
from gprofiler import GProfiler

In [None]:
gp = GProfiler(return_dataframe=True)

In [None]:
topgenes.shape

In [None]:
for j in topgenes.columns:
    
    print("")

    print("GO analysis for %s" %j)
    tmp = gp.profile(organism='hsapiens', 
               query=topgenes[j].tolist(), no_evidences=False)
    for i in tmp['source'].value_counts().index:
        if ("GO" not in i) and ("TF" not in i):
            print(i+"____")
            print(tmp[tmp['source'] == i]['name'])

In [None]:
# Write the g:Profiler results to an Excel workbook, one sheet per module.
# The writer is used as a context manager so the workbook is closed even if
# one of the queries raises. Each module is queried again here, independently
# of the printing cell above.
with pd.ExcelWriter(output_directory+"/"+run_name+"/modules_to_IPC_Polioudakis2019_4K_GOenrich_standardNMF.xlsx", engine='xlsxwriter') as writer:
    for j in topgenes.columns:

        print("%s" %j)

        tmp = gp.profile(organism='hsapiens',
                   query=topgenes[j].tolist(), no_evidences=False)
        # Sheet is named after the module column label.
        tmp.to_excel(writer, sheet_name=str(j))

In [None]:
topgenes.to_csv(output_directory+"/"+run_name+"/modules_to_IPC_topgenes_Polioudakis2019_4K_standardNMF.tsv", sep='\t', index=False, header=True)

In [None]:
usage_norm.to_csv(output_directory+"/"+run_name+'/modules_to_IPC_activation_Polioudakis2019_4K_standardNMF.tsv', sep='\t', index=False, header=True)