# Use MPI and Scalapy to distribute the entire MICA workflow across multiple nodes

prep.py component \
Read input file and slice into manageable sizes

In [1]:
from MICA.lib import utils

In [2]:
#from IPython import parallel
import ipyparallel as ipp

# Launch an ipython parallel cluster
Run this on an HPC node to launch a cluster with MPI engines.

In [3]:
#this currently needs to be launched from terminal
#import subprocess 
#We need to launch an ipython parallel cluster
#!ipcluster start --n=4
#subprocess.Popen(['ipcluster','start','--n=4'])
#subprocess.Popen(['ipcluster', 'start', '--engines=MPIEngineSetLauncher', '--log-level', 'DEBUG', '--n=4'])
#!ipcluster start --engines=MPIEngineSetLauncher --log-level DEBUG --n=4 &

In [4]:
# Create an ipyparallel client so that the %%px cell magic can be used.
# rc is the client connected to the running cluster; dview is a view over all
# of its engines. Together they let this notebook interact with the MPI ranks.
rc = ipp.Client()
dview = rc[:]
# Display the engine ids so we can confirm how many engines are connected.
rc.ids

[0, 1, 2, 3]

In [5]:
%%px
#load all necessary libraries onto each rank

from scipy.sparse import csr_matrix
from mpi4py import MPI
import sys
import numba
import pandas as pd
import scanpy as sc
import scipy as sci
import numpy as np
import anndata
import time
from sklearn.decomposition import PCA
import fast_histogram
import logging
logging.basicConfig(level=logging.INFO)
from MICA.lib import utils
from scalapy import *

In [6]:
%%px
import os
import socket
#from mpi4py import MPI
# Engine-global MPI handles; later cells read comm, size and rank directly.
comm = MPI.COMM_WORLD
size = comm.Get_size()
rank = comm.Get_rank()
name = MPI.Get_processor_name()
#sys.stdout.write(
#    "Hello, World! I am process %d of %d on %s.\n" 
#    % (rank, size, name))

# Each engine reports its host name, process id and rank/size so that the
# placement of the MPI ranks can be checked from the notebook output.
print("{host}[{pid}]: {rank}/{size}".format(
    host=socket.gethostname(),
    pid=os.getpid(),
    rank=comm.rank,
    size=comm.size,
))

[stdout:0] noderome167[41184]: 0/4
[stdout:1] noderome167[41185]: 1/4
[stdout:2] noderome167[41186]: 2/4
[stdout:3] noderome167[41187]: 3/4


In [7]:
%%px
#this is defined elsewhere as a standalone python executable
#we are duplicating it here to experiment with the format for distributed computing
def prep_dist(input_file, out_name, slice_unit):
    """ Split an input file into row-block slice files.

    Reads the whole input with ``utils.read_anndata_file`` and writes consecutive
    blocks of at most ``slice_unit`` rows to ``<out_name>.slice_<k>.h5ad``, where
    ``<k>`` is the block index zero-padded to the number of digits of the slice count.

    Args:
        input_file (str): path to the input file
        out_name   (str): common root name (including directory) of the generated slice files
        slice_unit (int): maximum number of rows (cells) per slice

    Returns:
        tuple: (number of rows, number of columns, number of slices written)

    Raises:
        ValueError: if slice_unit is not a positive integer
    """
    logging.basicConfig(level=logging.INFO)

    slice_size = int(slice_unit)
    if slice_size <= 0:
        # a non-positive slice size would otherwise fail later with an obscure
        # division / log10 error
        raise ValueError("slice_unit must be a positive integer, got %r" % (slice_unit,))

    #Read in whole file (anndata object)
    adf = utils.read_anndata_file(input_file)

    print("initial data size=",adf.shape)

    nrows = adf.shape[0]
    ncols = adf.shape[1]

    #number of slices needed to break the dataset into slice_size row blocks
    #(integer ceiling division, no float round trip)
    b = (nrows + slice_size - 1) // slice_size
    #number of digits in b, used to zero-pad slice names; unlike log10 this is
    #also well defined when the input has no rows (b == 0)
    digit = len(str(b))

    for i in range(b):
        #slice name is equal to the zero-padded batch index
        slice_name = str(i).zfill(digit)
        #row range of this batch; the last batch may be shorter
        start = i * slice_size
        end = min(start + slice_size, nrows)
        adf_sub = adf[start:end, :]
        #write to file so we don't have to keep each slice in memory
        output_file_name = out_name + ".slice_" + slice_name + ".h5ad"
        print("output_file_name: ",output_file_name)
        adf_sub.write(output_file_name)

    return nrows, ncols, b
    


In [8]:
%%px
#ceb create csr version of numba_histogram2d, also compute_bin with knowledge that minx will always be zero

@numba.jit(nopython=True)
def numba_nan_fill(x):
    """Set every NaN entry of x to 0.0 and return an array with x's shape.

    NOTE(review): callers in this file ignore the return value, so they rely on
    ravel() handing back a view of x (in-place update) -- confirm for
    non-contiguous inputs.
    """
    original_shape = x.shape
    flat = x.ravel()
    for k in range(flat.size):
        if np.isnan(flat[k]):
            flat[k] = 0.0
    return flat.reshape(original_shape)

@numba.jit(nopython=True)
def numba_inf_fill(x):
    """Set every +/-inf entry of x to 0.0 and return an array with x's shape.

    NOTE(review): callers in this file ignore the return value, so they rely on
    ravel() handing back a view of x (in-place update) -- confirm for
    non-contiguous inputs.
    """
    original_shape = x.shape
    flat = x.ravel()
    for k in range(flat.size):
        if np.isinf(flat[k]):
            flat[k] = 0.0
    return flat.reshape(original_shape)

@numba.jit(nopython=True, fastmath=True)
def compute_bin_upperbound(x, max, num_bins):
    """ Return the bin index of x for num_bins equal-width bins over [0, max].

        The lower bound is taken to be zero. A value equal to max is placed in
        the last bin; None is returned when the computed index falls outside
        the bins.
    """
    if x != max:
        idx = int(num_bins * x / max)
        if idx < num_bins:
            return idx
        return None
    # x == max: the upper bound always belongs to the last bin (as in NumPy)
    return num_bins - 1

@numba.jit(nopython=True, fastmath=True)
def numba_histogram2d_csr(arr1, cols1, arr2, cols2, ncols, num_bins):
    """ Compute the bi-dimensional histogram of two rows stored in CSR form.

    Each row is given by its stored values and their column indices. Columns
    without a stored value keep bin index 0 (the zero bin).

    Args:
        arr1 (1D array): stored values of the first row
        cols1 (1D int array): column index of each entry of arr1
        arr2 (1D array): stored values of the second row
        cols2 (1D int array): column index of each entry of arr2
        ncols (int): number of columns of the full (dense) rows
        num_bins (int): number of bins per dimension
    Return:
        hist (2D ndarray of int64, shape (num_bins, num_bins)): hist[a, b] is the
        number of columns whose first-row value falls in bin a and whose
        second-row value falls in bin b
    """
    #for csr arrays the zero bins are implied: every column starts in bin 0
    #note that bin_indices has the same size/indices as the full rows
    bin_indices1 = np.zeros((ncols,), dtype=np.int16)
    #a row with no stored values has no maximum (max() raises on an empty
    #array); all of its columns simply stay in bin 0
    if arr1.size > 0:
        max1 = arr1.max()
        for i, x in enumerate(arr1.flat):
            #assume zero min
            bin_indices1[cols1[i]] = compute_bin_upperbound(x, max1, num_bins)

    bin_indices2 = np.zeros((ncols,), dtype=np.int16)
    if arr2.size > 0:
        max2 = arr2.max()
        for i, y in enumerate(arr2.flat):
            #assume zero min
            bin_indices2[cols2[i]] = compute_bin_upperbound(y, max2, num_bins)

    #counts can reach ncols (e.g. the zero/zero bin of two sparse rows), which
    #overflows int16 as soon as there are more than 32767 columns
    hist = np.zeros((num_bins, num_bins), dtype=np.int64)
    for i, b in enumerate(bin_indices1):
        hist[b, bin_indices2[i]] += 1

    return hist

In [9]:
%%px
#import numba
@numba.jit(nopython=True, fastmath=True)
def numba_calc_mi_dis_csr(arr1, cols1, arr2, cols2, bins, m):
    """ Calculates a bin-based mutual information value between two CSR rows.

    Builds a 2d histogram of the two rows with numba_histogram2d_csr, turns its
    row and column sums into marginals (sm, tm), and returns
    sum(fq * log(fq / (sm * tm))) with fq = hist / m.

    NOTE(review): earlier documentation described a distance
    D(X, Y) = H(X, Y) - I(X, Y); the joint-entropy term is commented out below,
    so only the mutual-information sum is returned.

    Args:
        arr1 (float) nparray of csr: gene expression data for cell 1
        cols1 (int): column indices for csr arr1
        arr2 (float) nparray of csr: gene expression data for cell 2
        cols2 (int): column indices for csr arr2
        bins           (int): number of bins
        m              (int): number of genes
    Returns:
        float: the summed value described above
    """
    hist = numba_histogram2d_csr(arr1, cols1, arr2, cols2, m, bins)
    
    # marginal counts of the first (rows) and second (columns) variable,
    # normalized to sum to one
    sm = np.sum(hist, axis=1)
    tm = np.sum(hist, axis=0)
    sm = sm / float(sm.sum())
    tm = tm / float(tm.sum())

    # outer product of the marginals
    sm_tm = np.zeros((bins, bins), dtype=np.float32)
    for i, s in enumerate(sm):
        for j, t in enumerate(tm):
            sm_tm[i, j] = s * t

    # joint frequencies
    fq = hist / float(m)
    div = np.true_divide(fq, sm_tm)
    # return value is discarded: this relies on the helper zeroing NaNs
    # (e.g. from 0/0 in empty bins) in place
    # NOTE(review): fastmath=True may let the compiler assume no NaN/Inf --
    # confirm the isnan/isinf based fills are still honoured
    numba_nan_fill(div)
    ent = np.log(div)
    # log(0) gives -inf for empty bins; zeroed in place, see note above
    numba_inf_fill(ent)
    agg = np.multiply(fq, ent)
    #joint_ent = -np.multiply(fq, numba_inf_fill(np.log(fq))).sum()
    #return joint_ent - agg.sum()
    return agg.sum()

In [10]:
%%px
#numba compilation cannot interpret the creation of a 2d array inside of this function so we pass in and out SM_block instead of returning it
#import numba
@numba.jit(nopython=True, fastmath=True)
def process_matrices(Arows,Amat_data,Amat_indptr,Amat_indices,
                     Brows,Bmat_data,Bmat_indptr,Bmat_indices,
                     num_bins,num_genes,
                     SM_block, symmetric=False):
    """Fill SM_block[i, j] with numba_calc_mi_dis_csr(row i of A, row j of B).

    A and B are passed as their CSR components (data, indptr, indices).
    SM_block is written in place and nothing is returned. When symmetric is
    true only the lower triangle (j <= i) is computed.
    """
    for a_row in range(Arows):
        a_lo = Amat_indptr[a_row]
        a_hi = Amat_indptr[a_row + 1]
        a_cols = Amat_indices[a_lo:a_hi]
        a_vals = Amat_data[a_lo:a_hi]

        # lower triangular part only when the block is compared with itself
        b_stop = a_row + 1 if symmetric else Brows
        for b_row in range(b_stop):
            b_lo = Bmat_indptr[b_row]
            b_hi = Bmat_indptr[b_row + 1]
            b_cols = Bmat_indices[b_lo:b_hi]
            b_vals = Bmat_data[b_lo:b_hi]
            SM_block[a_row, b_row] = numba_calc_mi_dis_csr(a_vals, a_cols, b_vals, b_cols, num_bins, num_genes)

    return


In [11]:
%%px
#from mpi4py import MPI
from scipy.sparse import csr_matrix
#import sys
import pandas as pd

#create a 2d list to hold blocks of similarity matrix
#this should be a global var
#SM = [[None for j in range(b)] for i in range(b)] 

def calc_distance_metric_distributed(in_file_path, project_name, nrows, ncols, nslices, SM):
    """ Compute this rank's share of the block-wise similarity matrix.

    The row-block pairs (i, j) with i <= j are numbered row by row and dealt
    out to the MPI ranks in contiguous runs. For each pair owned by the calling
    rank, the two slice files ``<in_file_path><project_name>.slice_<k>.h5ad``
    are read, SM[i][j] is allocated as a dense zero array and filled in place
    by process_matrices. For diagonal pairs (i == j) only the lower triangle
    of the block is computed. Entries of SM owned by other ranks are left
    untouched.

    Args:
        in_file_path (str): directory prefix (with trailing separator) of the slice files
        project_name (str): root name of the slice files
        nrows (int): number of rows in the global matrix; sets the number of bins
        ncols (int): number of columns in the global matrix (currently unused,
            the column count is taken from the slice that is read)
        nslices (int): number of row-block slice files
        SM (list of lists): nslices x nslices container receiving the computed blocks
    """
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    myrank = comm.Get_rank()

    b = nslices #number of row blocks (cells)
    #number of digits used to zero-pad slice names; len(str()) is also defined
    #for nslices == 0, where log10 is not
    digit = len(str(nslices))

    num_bins = int(np.floor(nrows ** (1 / 3.0)))  # number of bins

    # total number of row block comparisons needed for the global similarity matrix
    n_block_comparisons = (b * (b + 1)) // 2
    # may be fractional; kept as a float so the block -> rank mapping matches
    # the cells that later recompute the owning rank with the same formula
    n_jobs_per_rank = n_block_comparisons/size
    if (myrank == 0): print("block comparsons = %d. jobs per rank = %d\n" % (n_block_comparisons, n_jobs_per_rank))

    #build list of row block comparisons that the current mpi rank will process
    myslices = [
        (i, j)
        for i in range(b)
        for j in range(i, b)
        if int((i * b + j - (i * (i + 1)) // 2) // n_jobs_per_rank) == myrank
    ]

    #pairs are ordered by i, so the first slice is cached and only re-read
    #when i changes
    loaded_i = None
    mat1 = None
    for i, j in myslices:
        if i != loaded_i:
            input_file = in_file_path + project_name + ".slice_" + str(i).zfill(digit) + ".h5ad"
            mat1 = utils.read_anndata_file(input_file)
            loaded_i = i

        #a diagonal pair compares the slice with itself: reuse it instead of
        #reading the same file a second time
        symmetric = (i == j)
        if symmetric:
            mat2 = mat1
        else:
            input_file = in_file_path + project_name + ".slice_" + str(j).zfill(digit) + ".h5ad"
            mat2 = utils.read_anndata_file(input_file)

        print("rank: ",myrank," comparison between segs:",i," x ",j," symmetric=",symmetric)

        Arows = mat1.n_obs
        Brows = mat2.n_obs
        num_genes = mat1.n_vars #we will assume Acols==Bcols==num_genes

        #dense, row-major block of zeros stored in the caller's 2d list
        SM[i][j] = np.zeros(shape=(Arows,Brows), dtype = float, order = 'C')

        #the numba function cannot create the array internally, so it fills
        #SM[i][j] in place
        process_matrices(mat1.n_obs,mat1.X.data,mat1.X.indptr,mat1.X.indices,
                         mat2.n_obs,mat2.X.data,mat2.X.indptr,mat2.X.indices,
                         num_bins, num_genes, SM[i][j],
                         symmetric #if i==j we can eliminate half of the computations
                        )

    return

In [12]:
%%px
import os
# All input/output paths below are built relative to the engine's current
# working directory.
cwd=os.getcwd()
if rank==0:
    print(cwd)
    
# Directory holding the pre-processed input; the slice files are written next to it.
data_file_path = cwd+'/test_data/inputs/10x/PBMC/3k/pre-processed/'
input_file_name = data_file_path + 'pbmc3k_preprocessed.h5ad'
project_name = 'pbmc3k'
# Root name (directory + project name) passed to prep_dist for the slice files.
output_file_name = data_file_path+project_name



[stdout:0] /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA


In [13]:
%%px
#set slice size (max size of row blocks)
slice_size = 500

In [14]:
%%px
if rank==0:
    print (input_file_name)

[stdout:0] /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k_preprocessed.h5ad


## Run Prep_dist() to split file into slices

In [15]:
%%px
# Only rank 0 slices the input file; every other rank starts from placeholder
# zeros and receives the real values through the broadcasts below.
if rank == 0:
    # note: running the prep_dist defined in MICA.utils stalls here for an
    # unknown reason, although the two definitions are identical
    g_nrows, ncols, nslices = prep_dist(input_file_name, output_file_name, slice_size)
else:
    g_nrows, ncols, nslices = 0, 0, 0

# (the ipyparallel dview object could push these from the notebook to the
#  engines, but the ranks cannot communicate back to the notebook thread)
#dview.push(dict(nrows=nrows, ncols=ncols, nslices=nslices))

# share the global number of rows (cells), columns and slices with all ranks
g_nrows = comm.bcast(g_nrows, root=0)
ncols = comm.bcast(ncols, root=0)
nslices = comm.bcast(nslices, root=0)

[stdout:0] 
initial data size= (2496, 10499)
output_file_name:  /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k.slice_0.h5ad
output_file_name:  /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k.slice_1.h5ad
output_file_name:  /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k.slice_2.h5ad
output_file_name:  /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k.slice_3.h5ad
output_file_name:  /research/rgs01/home/clusterHome/cburdysh/MICA_Project/MICA_distributed/MICA/test_data/inputs/10x/PBMC/3k/pre-processed/pbmc3k.slice_4.h5ad


[stderr:0] 
  if not is_categorical(df_full[k]):
  if is_string_dtype(df[key]) and not is_categorical(df[key])


In [16]:
%%px
if rank==0:
    print("global nrows, ncols, slices: ",g_nrows, ncols, nslices)

[stdout:0] global nrows, ncols, slices:  2496 10499 5


## Read in anndata preprocessed files (in distributed mode, by node number) and calculate distance metrics between all row pairs


In [17]:
%%px
# b x b grid of placeholders for the blocks of the similarity matrix; each rank
# only fills the blocks it computes
# (this should eventually live in a distributed scalapack matrix)
b = nslices  # number of row blocks
SM = [[None] * b for _ in range(b)]

# time the distributed block computation on each rank
start = time.time()
calc_distance_metric_distributed(data_file_path, project_name, g_nrows, ncols, nslices, SM)
end = time.time()
print("Elapsed = %s" % (end - start))


[stdout:0] 
block comparsons = 15. jobs per rank = 3

rank:  0  comparison between segs: 0  x  0  symmetric= True
rank:  0  comparison between segs: 0  x  1  symmetric= False
rank:  0  comparison between segs: 0  x  2  symmetric= False
rank:  0  comparison between segs: 0  x  3  symmetric= False
Elapsed = 30.804722785949707
[stdout:1] 
rank:  1  comparison between segs: 0  x  4  symmetric= False
rank:  1  comparison between segs: 1  x  1  symmetric= True
rank:  1  comparison between segs: 1  x  2  symmetric= False
rank:  1  comparison between segs: 1  x  3  symmetric= False
Elapsed = 30.672731399536133
[stdout:2] 
rank:  2  comparison between segs: 1  x  4  symmetric= False
rank:  2  comparison between segs: 2  x  2  symmetric= True
rank:  2  comparison between segs: 2  x  3  symmetric= False
rank:  2  comparison between segs: 2  x  4  symmetric= False
Elapsed = 30.68228793144226
[stdout:3] 
rank:  3  comparison between segs: 3  x  3  symmetric= True
rank:  3  comparison between segs: 

In [18]:
#%%px 
###from scipy.sparse import csr_matrix #may not use csr as it complicates copy to distributed scalapack and is not used in scalapack apparently
#import collections
#for i in range(b):
#    for j in range(i,b):
#        if isinstance(SM[i][j], collections.Iterable):
#            #print("Rank:",rank, " SM[",i,"][",j,"]=",SM[i][j])
#            print("SM[",i,"][",j,"]=",SM[i][j],"\n")

## Create distributed matrix for scalapack and copy distributed blocks into object
### This matrix needs to be dense for use in scalapack functions, so we will copy the symmetric data into both upper and lower triangular sections of the MI matrix

## copy lower triangular transpose to upper triangular for diagonal blocks

In [19]:
%%px 
# Diagonal blocks only have their lower triangle computed. Mirror it into the
# upper triangle so each diagonal block is a full symmetric dense matrix.
# Blocks not owned by this rank are still None and are skipped (the previous
# collections.Iterable test no longer exists on Python >= 3.10).
for i in range(b):
    diag_block = SM[i][i]
    if diag_block is None:
        continue
    # indices of the strict upper triangle; the right-hand side is materialized
    # from the transposed (lower-triangle) values before the assignment
    upper = np.triu_indices(diag_block.shape[0], k=1, m=diag_block.shape[1])
    diag_block[upper] = diag_block.T[upper]

## Populate a global array with all of the MI data from each rank

Preferably, each rank would contribute its own block MI matrices directly to the global matrix,
but currently the distributed global matrix has to be constructed from a global (not distributed) array.

In [20]:
#copy SM data into global distributed matrix and then write to file?

#then we can read that file into the Scalapack block cyclic matrix form

In [23]:
%%px
#test to distribute matrix from local blocks rather than global array
from scalapy import blacs
import os
import numpy as np
import scipy.linalg as la
from mpi4py import MPI
from scalapy import core
import scalapy.routines as rt

#distribute MI components to ranks as scalapack distributed matrix
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size #total number of ranks

global_num_rows =g_nrows
global_num_cols =g_nrows
local_num_rows =g_nrows/b

block_size=64 #default is 32

#Define a PR x PC process grid that uses every rank.
#Ideally PR == PC == sqrt(size). Otherwise take the largest PR <= sqrt(size)
#that divides size, so that PR*PC == size always holds (the previous
#PC = size//PR left ranks out of the grid for sizes such as 10 or 18, and the
#even-size assertion rejected valid sizes such as 1 or 9).
PR=int(np.sqrt(size))
while size % PR != 0:
    PR -= 1
PC=size//PR
if rank==0:
    print("PR=",PR, "PC=",PC)

#sets default context and block_shape
core.initmpi([PR, PC],block_shape=[block_size,block_size])
#empty distributed matrix for the global (cells x cells) MI matrix; it is
#filled from the per-rank blocks later on
dMI=core.DistributedMatrix(global_shape=[g_nrows,g_nrows],dtype=np.float64)


[stdout:0] PR= 2 PC= 2


In [24]:
%%px
#get global indices for diagonal
gi, lri, lci = dMI.local_diagonal_indices()

In [45]:
%%px
if rank==0: 
    print ('rank %d has global_shape of dMI = %s' % (rank, dMI.global_shape))
print ('rank %d has local_shape of dMI = %s' % (rank, dMI.local_shape))
print ('rank %d has block_shape of dMI = %s' % (rank, dMI.block_shape))
#print(dMI.local_array[lri,lci])
print(dMI.local_array)

[stdout:0] 
rank 0 has global_shape of dMI = (2496, 2496)
rank 0 has local_shape of dMI = (1280, 1280)
rank 0 has block_shape of dMI = (64, 64)
[[0.33805914 0.066332   0.05676848 ... 0.05752099 0.04267657 0.0668972 ]
 [0.066332   0.45212405 0.06047588 ... 0.06336014 0.04781611 0.06866279]
 [0.05676848 0.06047588 0.39173321 ... 0.05280795 0.04746441 0.05657429]
 ...
 [0.05752099 0.06336014 0.05280795 ... 0.28514319 0.05104461 0.05835302]
 [0.04267657 0.04781611 0.04746441 ... 0.05104461 0.21451758 0.04515555]
 [0.0668972  0.06866279 0.05657429 ... 0.05835302 0.04515555 0.31753523]]
[stdout:1] 
rank 1 has local_shape of dMI = (1280, 1216)
rank 1 has block_shape of dMI = (64, 64)
[[0.04636978 0.05072926 0.05500222 ... 0.05035315 0.06835701 0.06148071]
 [0.05683409 0.05903542 0.05992358 ... 0.06264001 0.07618326 0.06856185]
 [0.04752714 0.04735661 0.05504157 ... 0.07159437 0.06133829 0.05748736]
 ...
 [0.04205094 0.0458999  0.04639795 ... 0.0493155  0.05951592 0.05640946]
 [0.0368094  0.03

In [90]:
#%%px
#blocksize=slice_size
#testrank=3
#testmat=np.zeros(shape=(4,4))
#if comm.rank==testrank:
#    testmat=SM[3][3]
#    #testmat=np.zeros(shape=(500,500))
#    s_block_shape=np.shape(testmat)
#else:
#    testmat=np.zeros(shape=(4,4))
#    s_block_shape=np.shape(testmat)

#s_block_shape = comm.bcast(s_block_shape, root=testrank)   
#copy_from_np(dMI2, testmat, asrow=0, anrow=None, ascol=0, ancol=None, srow=0, scol=0, block_shape=s_block_shape, rank=testrank) #all ranks works


## Copy each SM block submatrix to distributed block cyclic matrix

In [26]:
%%px
# Copy every computed block SM[i][j] (i <= j) into the distributed matrix dMI.
# All ranks walk the same (i, j) sequence because the bcast and np2self calls
# are made by every rank.
blocksize=slice_size
# same block -> rank mapping as used when the blocks were computed
n_jobs_per_rank= (int((b * (b + 1)) / 2))/comm.Get_size()
for i in range(b):
    for j in range(i,b): # j in range [i,b]
        idx = i * b + j - (i * (i + 1)) // 2
        # owning rank as a real int: it is used as an MPI root / rank argument
        srank = int(idx // n_jobs_per_rank)
        # ranks that do not hold this block pass a small dummy array
        # (SM entries are None unless this rank computed them; the previous
        # collections.Iterable test no longer exists on Python >= 3.10)
        if SM[i][j] is not None:
            lA=SM[i][j]
        else:
            lA=np.zeros(shape=(2,2))
        s_block_shape=np.shape(lA)
        #broadcast sending rank's block shape to all
        s_block_shape = comm.bcast(s_block_shape, root=srank)
        dMI.np2self(lA, srow=i*blocksize, scol=j*blocksize, block_shape=s_block_shape, rank=srank )

In [27]:
%%px
if rank==0: 
    print ('rank %d has global_shape of dMI = %s' % (rank, dMI.global_shape))
print ('rank %d has local_shape of dMI = %s' % (rank, dMI.local_shape))
print ('rank %d has block_shape of dMI = %s' % (rank, dMI.block_shape))
#print(dMI.local_array)
print(dMI.local_array[lri,lci])

[stdout:0] 
rank 0 has global_shape of dMI = (2496, 2496)
rank 0 has local_shape of dMI = (1280, 1280)
rank 0 has block_shape of dMI = (64, 64)
[0.33805914 0.45212405 0.39173321 ... 0.28514319 0.21451758 0.31753523]
[stdout:1] 
rank 1 has local_shape of dMI = (1280, 1216)
rank 1 has block_shape of dMI = (64, 64)
[]
[stdout:2] 
rank 2 has local_shape of dMI = (1216, 1280)
rank 2 has block_shape of dMI = (64, 64)
[]
[stdout:3] 
rank 3 has local_shape of dMI = (1216, 1216)
rank 3 has block_shape of dMI = (64, 64)
[0.35717489 0.2980473  0.3907872  ... 0.3529249  0.38132678 0.32226236]


## copy transpose of blocks to fill upper triangular distributed matrix

In [28]:
%%px
# Copy the transpose of every off-diagonal block SM[i][j] (i < j) into position
# (j, i) of dMI so that both halves of the symmetric matrix are filled.
# All ranks walk the same (i, j) sequence because the bcast and np2self calls
# are made by every rank.
blocksize=slice_size
# same block -> rank mapping as used when the blocks were computed
n_jobs_per_rank= (int((b * (b + 1)) / 2))/comm.Get_size()

for i in range(b):
    for j in range(i+1,b): # j in range [i+1,b)
        idx = i * b + j - (i * (i + 1)) // 2
        # owning rank as a real int: it is used as an MPI root / rank argument
        srank = int(idx // n_jobs_per_rank)
        # ranks that do not hold this block pass a small dummy array
        # (SM entries are None unless this rank computed them; the previous
        # collections.Iterable test no longer exists on Python >= 3.10)
        if SM[i][j] is not None:
            lA=np.transpose(SM[i][j])
        else:
            lA=np.zeros(shape=(2,2))
        s_block_shape=np.shape(lA)
        #broadcast sending rank's block shape to all
        s_block_shape = comm.bcast(s_block_shape, root=srank)
        dMI.np2self(lA, srow=j*blocksize, scol=i*blocksize, block_shape=s_block_shape, rank=srank )

In [95]:
## need to also fill in empty symmetric upper triangular portion

In [96]:
# Even though this is a symmetric matrix, for further processing, we need to copy block data to rest of matrix

In [29]:
%%px
if rank==0: 
    print ('rank %d has global_shape of dMI = %s' % (rank, dMI.global_shape))
print ('rank %d has local_shape of dMI = %s' % (rank, dMI.local_shape))
print ('rank %d has block_shape of dMI = %s' % (rank, dMI.block_shape))
#print(dMI.local_array[0:20,0:20])
print(dMI.local_array)

[stdout:0] 
rank 0 has global_shape of dMI = (2496, 2496)
rank 0 has local_shape of dMI = (1280, 1280)
rank 0 has block_shape of dMI = (64, 64)
[[0.33805914 0.066332   0.05676848 ... 0.05752099 0.04267657 0.0668972 ]
 [0.066332   0.45212405 0.06047588 ... 0.06336014 0.04781611 0.06866279]
 [0.05676848 0.06047588 0.39173321 ... 0.05280795 0.04746441 0.05657429]
 ...
 [0.05752099 0.06336014 0.05280795 ... 0.28514319 0.05104461 0.05835302]
 [0.04267657 0.04781611 0.04746441 ... 0.05104461 0.21451758 0.04515555]
 [0.0668972  0.06866279 0.05657429 ... 0.05835302 0.04515555 0.31753523]]
[stdout:1] 
rank 1 has local_shape of dMI = (1280, 1216)
rank 1 has block_shape of dMI = (64, 64)
[[0.04636978 0.05072926 0.05500222 ... 0.05035315 0.06835701 0.06148071]
 [0.05683409 0.05903542 0.05992358 ... 0.06264001 0.07618326 0.06856185]
 [0.04752714 0.04735661 0.05504157 ... 0.07159437 0.06133829 0.05748736]
 ...
 [0.04205094 0.0458999  0.04639795 ... 0.0493155  0.05951592 0.05640946]
 [0.0368094  0.03

In [30]:
#redistribute for 

In [31]:
#%%px
###lr=[range(4)]
###lc=[range(4)]
####Check to see of data was transferred to rank 0
#if rank==0:
#    for i in range(b):
#            for j in range(i,b): # j in range [i,b]
#                #print("SM[",i,j,"] ",np.shape(SM[i][i]))
#                print("SM[",i,j,"] ",SM[i][j])
##                print( (SM[i][j])[lr,lc] )   

In [32]:
#%%px
#from inspect import getmembers, isfunction, ismodule
#if rank == 0:
#    print([o[0] for o in getmembers(scalapy) if ismodule(o[1])])

In [33]:
#%%px
#from inspect import getmembers, isfunction, ismodule
#import scalapy
#if rank==0:
#    print(getmembers(scalapy.blacs, isfunction))
#    #print(getmembers(scalapy.routines, isfunction))

In [34]:
#%%px
#total number of global blocks = total number of block comparisons
#Should be greater than number of ranks to improve load balancing
# b is number of slices (blocks of rows) original data has been discretized into.
#The size of these blocks can be variable
#global_number_of_matrix_blocks= int((b * (b + 1)) / 2) 

#We'll use a 2d process grid to distribute blocks so we want to have num_ranks divisible by 2

In [35]:
#bcast SM[i][j] from each rank to root rank 0 so that we can load global matrix array


## Copy distributed MI matrix to file
### So we can read this in to Scalapack later on

In [36]:
%%px
#Write MI matrix to file
mi_filename = data_file_path+project_name+'_mi_distributed.scalapack'
dMI.to_file(mi_filename)

## The following code snippet reads MI matrix from a file and loads it into a distributed Scalapack matrix

In [37]:
#%%px
#Read MI matrix from file
#mi_filename = data_file_path+project_name+'_mi_distributed.scalapack'
#dMI.from_file(mi_filename, global_shape=[g_nrows,g_nrows], dtype=np.float64, block_shape=[block_size,block_size])

In [38]:
#%%px
#if rank==0: 
#    print ('rank %d has global_shape of dMI = %s' % (rank, dMI2.global_shape))
#print ('rank %d has local_shape of dMI = %s' % (rank, dMI2.local_shape))
#print ('rank %d has block_shape of dMI = %s' % (rank, dMI2.block_shape))
#print(dMI2.local_array)

## Now we need to create a matrix with the diagonal as the first column

In [109]:
%%px
#get global indices for diagonal
gi, lri, lci = dMI.local_diagonal_indices()

In [117]:
%%px
#create matrix to store diagonal row
dMI_diag=core.DistributedMatrix.empty_like(dMI)
dMI_row1=core.DistributedMatrix.empty_like(dMI)

In [118]:
%%px
dgi, dlri, dlci = dMI_diag.local_diagonal_indices()

In [119]:
%%px
dMI_diag.local_array[dlri,dlci]=dMI.local_array[lri,lci]
#dMI_diag.local_array[0,dlci]=dMI.local_array[lri,lci]
#my_diag[comm.rank]=dMI.local_array[lri,lci]

In [136]:
#%%px
#print(dMI_diag.local_array)

In [121]:
%%px
# Fill dMI_row1 with a selector matrix: 1.0 wherever the row index is 0 and
# 0.0 everywhere else.
# presumably indices() returns the global row/column index of every local
# entry -- TODO confirm against the scalapy documentation
ri, ci = dMI_row1.indices()
dMI_row1.local_array[:]= ((ri==0).astype(int)).astype(float)
print(dMI_row1.local_array)

[stdout:0] 
[[1. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[stdout:1] 
[[1. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[stdout:2] 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[stdout:3] 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [127]:
%%px
#dMI_norm = rt.dot(dMI_diag,dMI_row1,transA='T')
# Distributed product of the first-row selector with the diagonal matrix.
# The printed result holds the diagonal values of dMI in its first row and
# zeros in the other rows.
dMI_norm = rt.dot(dMI_row1,dMI_diag)

#dMI_norm = dMI_diag.dot(dMI_row1)

In [128]:
%%px
print(dMI_norm.local_array)

[stdout:0] 
[[0.33805914 0.45212405 0.39173321 ... 0.28514319 0.21451758 0.31753523]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[stdout:1] 
[[0.35717489 0.2980473  0.3907872  ... 0.3529249  0.38132678 0.32226236]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]
[stdout:2] 
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 

In [131]:
%%px
import scalapy.routines as rt
#dMI_norm=dMI_diag.T*dMI_diag
# presumably transA='T' multiplies the transpose of the first operand, i.e.
# dMI_norm^T . dMI_norm -- TODO confirm against scalapy.routines.dot.
# With the diagonal d of dMI in the first row of dMI_norm, that would give the
# matrix of products d[i] * d[j].
dMI_norm2 = rt.dot(dMI_norm,dMI_norm,transA='T')


In [132]:
%%px
print(dMI_norm2.local_array)

[stdout:0] 
[[0.11428398 0.15284467 0.13242899 ... 0.09639526 0.07251963 0.10734569]
 [0.15284467 0.20441616 0.17711201 ... 0.12892009 0.09698856 0.14356532]
 [0.13242899 0.17711201 0.15345491 ... 0.11170006 0.08403366 0.1243891 ]
 ...
 [0.09639526 0.12892009 0.11170006 ... 0.08130664 0.06116823 0.09054301]
 [0.07251963 0.09698856 0.08403366 ... 0.06116823 0.04601779 0.06811689]
 [0.10734569 0.14356532 0.1243891  ... 0.09054301 0.06811689 0.10082862]]
[stdout:1] 
[[0.12074624 0.10075761 0.13210919 ... 0.11930949 0.128911   0.10894374]
 [0.16148736 0.13475435 0.17668429 ... 0.15956583 0.17240701 0.14570256]
 [0.13991727 0.11675502 0.15308433 ... 0.1382524  0.14937836 0.12624087]
 ...
 [0.10184599 0.08498616 0.11143031 ... 0.10063413 0.10873274 0.09189092]
 [0.07662029 0.06393638 0.08383072 ... 0.07570859 0.0818013  0.06913094]
 [0.11341561 0.09464052 0.1240887  ... 0.11206609 0.12108469 0.10232965]]
[stdout:2] 
[[0.12074624 0.16148736 0.13991727 ... 0.10184599 0.07662029 0.11341561]
 [0

In [64]:
#%%px
#if rank==0: 
#    print ('rank %d has global_shape of dMI_diag = %s' % (rank, dMI_diag.global_shape))
#print ('rank %d has local_shape of dMI_diag = %s' % (rank, dMI_diag.local_shape))
#print ('rank %d has block_shape of dMI_diag = %s' % (rank, dMI_diag.block_shape))
#print(dMI_diag.local_array)

In [None]:
#%%px
#blocksize=slice_size
#n_jobs_per_rank= (int((b * (b + 1)) / 2))/comm.Get_size()
#import collections
#for i in range(b):
#    #for j in range(i+1,b): # j in range [i,b]
#    #idx = int(i * b + j - (i * (i + 1)) / 2)
#    #srank = idx//n_jobs_per_rank
#    lA=
#    s_block_shape=np.shape(lA)
#    if isinstance(SM[i][j], collections.Iterable):
#        lA=local_diag
#        s_block_shape=np.shape(lA)
#    #broadcast sending ranks block shape to all
#    s_block_shape = comm.bcast(s_block_shape, root=srank)   
#    dMI_diag.np2self(lA, srow=i*blocksize, scol=0, block_shape=s_block_shape, rank=srank )  

In [65]:
#%%px
##Array must be 2D for this to work
##convert to fortran array indexing to match scalapack functions
#global_diag=np.asfortranarray(global_diag)
##create global matrix from array on rank0
#dMI_diag=core.DistributedMatrix.from_global_array(global_diag,rank=0)

In [None]:
#%%px
#if rank==0: 
#    print ('rank %d has global_shape of dMI_diag = %s' % (rank, dMI_diag.global_shape))
#print ('rank %d has local_shape of dMI_diag = %s' % (rank, dMI_diag.local_shape))
#print ('rank %d has block_shape of dMI_diag = %s' % (rank, dMI_diag.block_shape))
##print(dMI_diag.local_array)

In [None]:
#%%px
#dMI_diag_T=dMI_diag.T
#if rank==0: 
#    print ('rank %d has global_shape of dMI_diag = %s' % (rank, dMI_diag_T.global_shape))
#print ('rank %d has local_shape of dMI_diag = %s' % (rank, dMI_diag_T.local_shape))
#print ('rank %d has block_shape of dMI_diag = %s' % (rank, dMI_diag_T.block_shape))
#
#print(dMI_diag_T.local_array)

## Use scalapack to compute distributed GEMM

In [80]:
#%%px
#import scalapy.routines as rt
##dMI_norm=dMI_diag.T*dMI_diag
#dMI_norm = rt.dot(dMI_diag,dMI_diag,transA='T')


In [133]:
%%px
# Report how dMI_norm2 is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of dMI_norm2 = %s' % (rank, dMI_norm2.global_shape))
print('rank %d has local_shape of dMI_norm2 = %s' % (rank, dMI_norm2.local_shape))
print('rank %d has block_shape of dMI_norm2 = %s' % (rank, dMI_norm2.block_shape))
print(dMI_norm2.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[0.11428398 0.15284467 0.13242899 ... 0.09639526 0.07251963 0.10734569]
 [0.15284467 0.20441616 0.17711201 ... 0.12892009 0.09698856 0.14356532]
 [0.13242899 0.17711201 0.15345491 ... 0.11170006 0.08403366 0.1243891 ]
 ...
 [0.09639526 0.12892009 0.11170006 ... 0.08130664 0.06116823 0.09054301]
 [0.07251963 0.09698856 0.08403366 ... 0.06116823 0.04601779 0.06811689]
 [0.10734569 0.14356532 0.1243891  ... 0.09054301 0.06811689 0.10082862]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[0.12074624 0.10075761 0.13210919 ... 0.11930949 0.128911   0.10894374]
 [0.16148736 0.13475435 0.17668429 ... 0.15956583 0.17240701 0.14570256]
 [0.13991727 0.11675502 0.15308433 ... 0.1382524  0.14937836 0.12624087]
 ...
 [0.10184599 0.08498616 0.11143031 ... 0.10063413 0.10873274 0.09

In [134]:
%%px
# Element-wise square root of dMI_norm2, written into a new distributed
# matrix allocated with the same layout as dMI. Only the local arrays are
# touched, so no communication is involved.
# NOTE: despite the "_square" suffix this matrix holds square roots.
dMI_norm_square = core.DistributedMatrix.empty_like(dMI)
dMI_norm_square.local_array[...] = np.sqrt(dMI_norm2.local_array)

In [137]:
%%px
# Report how dMI_norm_square is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of dMI_norm_square = %s' % (rank, dMI_norm_square.global_shape))
print('rank %d has local_shape of dMI_norm_square = %s' % (rank, dMI_norm_square.local_shape))
print('rank %d has block_shape of dMI_norm_square = %s' % (rank, dMI_norm_square.block_shape))
print(dMI_norm_square.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[0.33805914 0.39095354 0.36390795 ... 0.31047587 0.26929469 0.32763652]
 [0.39095354 0.45212405 0.42084677 ... 0.35905445 0.31142986 0.37890014]
 [0.36390795 0.42084677 0.39173321 ... 0.33421559 0.2898856  0.35268839]
 ...
 [0.31047587 0.35905445 0.33421559 ... 0.28514319 0.24732211 0.30090366]
 [0.26929469 0.31142986 0.2898856  ... 0.24732211 0.21451758 0.26099213]
 [0.32763652 0.37890014 0.35268839 ... 0.30090366 0.26099213 0.31753523]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[0.34748559 0.3174234  0.36346827 ... 0.34541206 0.35904179 0.33006626]
 [0.4018549  0.36708902 0.42033831 ... 0.39945692 0.41521923 0.38171005]
 [0.37405517 0.34169434 0.39125992 ... 0.37182308 0.38649497 0.35530391]
 ...
 [0.31913318 0.29152385 0.33381179 ... 0.31722883 0.32974647 0.30

In [138]:
%%px
# Element-wise quotient dMI / dMI_norm_square, stored in a new distributed
# matrix with the same layout as dMI.
# NOTE(review): a zero entry in dMI_norm_square yields inf/nan here -- confirm
# that cannot happen for the input data.
dMI_normed = core.DistributedMatrix.empty_like(dMI)
dMI_normed.local_array[...] = np.divide(dMI.local_array, dMI_norm_square.local_array)

In [139]:
%%px
# Report how dMI_normed is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of dMI_normed = %s' % (rank, dMI_normed.global_shape))
print('rank %d has local_shape of dMI_normed = %s' % (rank, dMI_normed.local_shape))
print('rank %d has block_shape of dMI_normed = %s' % (rank, dMI_normed.block_shape))
print(dMI_normed.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[1.         0.16966721 0.15599682 ... 0.18526718 0.15847534 0.20418114]
 [0.16966721 1.         0.14370048 ... 0.17646388 0.15353732 0.18121605]
 [0.15599682 0.14370048 1.         ... 0.15800564 0.16373497 0.16040872]
 ...
 [0.18526718 0.17646388 0.15800564 ... 1.         0.2063892  0.19392593]
 [0.15847534 0.15353732 0.16373497 ... 0.2063892  1.         0.17301498]
 [0.20418114 0.18121605 0.16040872 ... 0.19392593 0.17301498 1.        ]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[0.13344376 0.15981576 0.15132607 ... 0.14577705 0.19038733 0.18626778]
 [0.14142939 0.16082045 0.14256036 ... 0.15681292 0.1834772  0.17961762]
 [0.12705919 0.13859349 0.14067777 ... 0.19254956 0.15870398 0.16179771]
 ...
 [0.13176611 0.15744818 0.13899435 ... 0.15545719 0.18048995 0.18

In [140]:
%%px
# Persist the normalised matrix. The target path is plain string
# concatenation, so data_file_path must already end with a path separator.
mi_normed_filename = "".join((data_file_path, project_name, '_mi_normed_distributed.scalapack'))
dMI_normed.to_file(mi_normed_filename)

## Now compute eigenvalues and eigenvectors of dissimilarity matrix

In [None]:
#def mds(in_mat_file, max_dim, out_file_name, perplexity=30, print_plot="True", dist_method="mi",):
#    hdf = pd.HDFStore(in_mat_file)
#    if dist_method == "mi":
#        df = 1 - hdf["norm_mi"]
#    elif dist_method == "euclidean":
#        df = hdf[dist_method]
#    else:
#        df = 1 - hdf[dist_method]
#
#    hdf.close()
#    n = df.shape[0]
#    H = np.eye(n) - np.ones((n, n)) / n
#    B = -H.dot(df ** 2).dot(H) / 2
#    evals, evecs = eigh(B, eigvals=(n - np.min([n, 200]), n - 1))
#    
#    idx = np.argsort(evals)[::-1]
#    evals = evals[idx]
#    evecs = evecs[:, idx]
#    evals_pos = evals > 0
#    L = np.diag(np.sqrt(evals[evals_pos]))
#    V = evecs[:, evals_pos]
#    Y = pd.DataFrame(
#        data=V.dot(L),
#        index=df.index,
#        columns=["mds_" + str(x) for x in np.arange(1, L.shape[0] + 1)],
#    )
#
#    Y.to_hdf(out_file_name + "_reduced.h5", "mds")  # save reduced mi in mds
#
#    if print_plot == "True":
#        vis = tsne(
#            Y,
#            max_dim,
#            out_file_name,
#            "mds",
#            perplexity,
#            print_plot,
#        )
#        vis.to_hdf(out_file_name + "_reduced", "mds_tsne")  # save preview in key "mds_tsne"

In [141]:
%%px
from scalapy import routines as rt

# Distributed version of the serial reference
#     H = np.eye(n) - np.ones((n, n)) / n
#     B = -H.dot(df ** 2).dot(H) / 2
# Element-wise steps act on each rank's local array only; the two matrix
# products go through rt.dot.
n = g_nrows

# Similarity -> dissimilarity: MDS = 1 - dMI_normed
MDS = core.DistributedMatrix.empty_like(dMI)
MDS.local_array[:] = 1.0 - dMI_normed.local_array

# Element-wise square of the dissimilarities
MDS2 = core.DistributedMatrix.empty_like(dMI)
MDS2.local_array[:] = MDS.local_array ** 2

# H = I - Ones, where every entry of Ones is 1/n
I = core.DistributedMatrix.identity(n=n)
Ones = core.DistributedMatrix.empty_like(dMI)
Ones.local_array[:] = 1.0 / n
H = core.DistributedMatrix.empty_like(dMI)
H.local_array[:] = I.local_array - Ones.local_array

# The sign is folded into the left factor: B = ((-H) . MDS2 . H) / 2
negH = core.DistributedMatrix.empty_like(dMI)
negH.local_array[:] = -H.local_array
C = rt.dot(negH, MDS2)
B = rt.dot(C, H)
B.local_array[:] = B.local_array / 2.0
#dMI_norm=dMI_diag.T*dMI_diag
#dMI_norm = rt.dot(dMI_diag,dMI_diag,transA='T')


In [142]:
%%px
# Report how MDS is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of MDS = %s' % (rank, MDS.global_shape))
print('rank %d has local_shape of MDS = %s' % (rank, MDS.local_shape))
print('rank %d has block_shape of MDS = %s' % (rank, MDS.block_shape))
print(MDS.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[0.         0.83033279 0.84400318 ... 0.81473282 0.84152466 0.79581886]
 [0.83033279 0.         0.85629952 ... 0.82353612 0.84646268 0.81878395]
 [0.84400318 0.85629952 0.         ... 0.84199436 0.83626503 0.83959128]
 ...
 [0.81473282 0.82353612 0.84199436 ... 0.         0.7936108  0.80607407]
 [0.84152466 0.84646268 0.83626503 ... 0.7936108  0.         0.82698502]
 [0.79581886 0.81878395 0.83959128 ... 0.80607407 0.82698502 0.        ]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[0.86655624 0.84018424 0.84867393 ... 0.85422295 0.80961267 0.81373222]
 [0.85857061 0.83917955 0.85743964 ... 0.84318708 0.8165228  0.82038238]
 [0.87294081 0.86140651 0.85932223 ... 0.80745044 0.84129602 0.83820229]
 ...
 [0.86823389 0.84255182 0.86100565 ... 0.84454281 0.81951005 0.81

In [143]:
%%px
# Report how MDS2 is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of MDS2 = %s' % (rank, MDS2.global_shape))
print('rank %d has local_shape of MDS2 = %s' % (rank, MDS2.local_shape))
print('rank %d has block_shape of MDS2 = %s' % (rank, MDS2.block_shape))
print(MDS2.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[0.         0.68945254 0.71234137 ... 0.66378957 0.70816375 0.63332766]
 [0.68945254 0.         0.73324887 ... 0.67821173 0.71649907 0.67040716]
 [0.71234137 0.73324887 0.         ... 0.7089545  0.6993392  0.70491351]
 ...
 [0.66378957 0.67821173 0.7089545  ... 0.         0.6298181  0.6497554 ]
 [0.70816375 0.71649907 0.6993392  ... 0.6298181  0.         0.68390422]
 [0.63332766 0.67040716 0.70491351 ... 0.6497554  0.68390422 0.        ]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[0.75091973 0.70590956 0.72024745 ... 0.72969685 0.65547268 0.66216012]
 [0.73714349 0.70422232 0.73520274 ... 0.71096445 0.66670948 0.67302724]
 [0.76202567 0.74202118 0.73843469 ... 0.65197621 0.707779   0.70258308]
 ...
 [0.75383009 0.70989358 0.74133073 ... 0.71325257 0.67159672 0.66

In [144]:
%%px
# Report how B is laid out: the global shape once (from rank 0),
# then each rank's local shape, block shape and locally stored values.
if rank == 0:
    print('rank %d has global_shape of B = %s' % (rank, B.global_shape))
print('rank %d has local_shape of B = %s' % (rank, B.local_shape))
print('rank %d has block_shape of B = %s' % (rank, B.block_shape))
print(B.local_array)

[stdout:0] 
rank 0 has global_shape of dMI_diag = (2496, 2496)
rank 0 has local_shape of dMI_diag = (1280, 1280)
rank 0 has block_shape of dMI_diag = (64, 64)
[[ 3.32606572e-01 -4.48879650e-03 -1.22124044e-02 ... -4.99538265e-04
  -9.33603291e-03  1.19542557e-02]
 [-4.48879650e-03  3.47868371e-01 -1.50352510e-02 ... -7.97225127e-05
  -5.87279447e-03  1.04540298e-03]
 [-1.22124044e-02 -1.50352510e-02  3.55309993e-01 ... -1.17302924e-02
   6.42794877e-03 -1.24869591e-02]
 ...
 [-4.99538265e-04 -7.97225127e-05 -1.17302924e-02 ...  3.30183917e-01
   2.86254621e-02  2.52905849e-03]
 [-9.33603291e-03 -5.87279447e-03  6.42794877e-03 ...  2.86254621e-02
   3.56885108e-01 -1.19475721e-03]
 [ 1.19542557e-02  1.04540298e-03 -1.24869591e-02 ...  2.52905849e-03
  -1.19475721e-03  3.24629599e-01]]
[stdout:1] 
rank 1 has local_shape of dMI_diag = (1280, 1216)
rank 1 has block_shape of dMI_diag = (64, 64)
[[-0.01057509 -0.00223418 -0.00174018 ... -0.01561031  0.00304875
   0.0050489 ]
 [ 0.00394393  0

In [147]:
%%px
from scalapy import routines as rt
# Partial eigendecomposition of B, mirroring the serial reference
#     evals, evecs = eigh(B, eigvals=(n - np.min([n, 200]), n - 1))
# which asks only for the last min(n, 200) eigenpairs by index (the "top 200"
# the original note refers to).
n_top = np.min([n, 200])
top_index_range = (n - n_top, n - 1)

# B is left intact (overwrite_a=False). Per the original note, every rank
# receives the same evals array and dZd is the distributed eigenvector matrix.
evals, dZd = rt.eigh(B, overwrite_a=False, eigvals=top_index_range)

# Collect the eigenvectors into one global array on rank 0.
evecs = dZd.to_global_array(rank=0)
#gZd = dZd.to_global_array(rank=0)

In [150]:
%%px
# Print the eigenvalues once, from rank 0 only, instead of once per rank.
if rank==0:
    print(evals)
    #print(evecs)

[stdout:0] 
[ 0.63653653  0.63718665  0.63856906  0.63913447  0.63951631  0.64141837
  0.6416083   0.64199858  0.64309684  0.64515302  0.64600987  0.6465693
  0.64770702  0.64962061  0.65130261  0.65158366  0.65353592  0.65458884
  0.65490567  0.65566407  0.65607526  0.65771841  0.65880259  0.65932652
  0.66112078  0.66144263  0.66239989  0.66347846  0.66480679  0.66515495
  0.6666297   0.66794033  0.66959897  0.67151719  0.67174352  0.67227765
  0.67398626  0.67444886  0.67525269  0.67737147  0.67810749  0.67928807
  0.68064238  0.68127193  0.68181287  0.68417543  0.68510449  0.68661375
  0.68782991  0.68945798  0.68971988  0.6917769   0.69289193  0.69435186
  0.69544161  0.69610655  0.69730158  0.69856377  0.70052196  0.70165805
  0.70250321  0.70330761  0.70428105  0.7061929   0.70717293  0.70730084
  0.70940083  0.71110775  0.71227849  0.71410865  0.71577565  0.7172199
  0.71812038  0.71923984  0.72120975  0.72229011  0.72406995  0.72500372
  0.72573429  0.72668299  0.72920854  0.7

In [None]:
#gather the top 200 eigenvalues on a single rank

In [152]:
%%px
if rank == 0:
    # Reorder the eigenpairs so that eigenvalues run from largest to smallest.
    idx = np.flip(np.argsort(evals))
    evals, evecs = evals[idx], evecs[:, idx]
    # Keep only the strictly positive eigenvalues and their eigenvectors.
    evals_pos = evals > 0
    V = evecs[:, evals_pos]
    # L: diagonal matrix of the square roots of the retained eigenvalues.
    L = np.diag(np.sqrt(evals[evals_pos]))
    print(V)
    #Y = pd.DataFrame(
    #    data=V.dot(L),
    #    index=df.index, #need to reattach index names to eigenvectors
    #    columns=["mds_" + str(x) for x in np.arange(1, L.shape[0] + 1)],
    #)
    #print(Y) 
#Y.to_hdf(out_file_name + "_reduced.h5", "mds")  # save reduced mi in mds

[stdout:0] 
[[ 0.01468645 -0.00223302  0.01106707 ...  0.01339326 -0.00432337
  -0.00438385]
 [ 0.00943586 -0.01715544  0.01569153 ...  0.00192537 -0.01729476
  -0.03001981]
 [-0.03448515  0.00729943  0.00454717 ... -0.00297281  0.03522983
  -0.00213711]
 ...
 [ 0.00795561  0.04249614 -0.03216508 ... -0.02224412 -0.02813351
   0.02922512]
 [ 0.00059434  0.02982748 -0.04455422 ...  0.01702971 -0.02594927
   0.01674737]
 [ 0.0166286   0.00551013  0.0196269  ...  0.02222502  0.00051805
   0.02786173]]


In [None]:
#%%px
#import scalapy.routines as rt
##obtain dissimilarity matrix
#n= g_nrows
##MDS=1-dMI
#MDS= core.DistributedMatrix.empty_like(dMI)
##MDS.local_array[:]=1.0-dMI.local_Array[:]
#I= core.DistributedMatrix.identity(n=g_nrows)
#Ones= core.DistributedMatrix.empty_like(dMI)
#Ones.local_array[:]=1.0/n
##create arrays
##create identity matrix and then subtract 1/n from all entries
#H= np.eye(n) - np.ones((n, n)) / n
#H= I - Ones
#B= -H.dot(df ** 2).dot(H) / 2
#
##compute eigh(B,)
##we want to pick out the top 200 eigenvalues/vectors from the matrix
##evals, evecs = eigh(B, eigvals=(n - np.min([n, 200]), n - 1))
#evals, dZd = rt.eigh(B,overwrite_a=False)
#evecs = dZd.to_global_array(rank=0)

In [None]:
#%%px
#dgi, dlri, dlci = diagMI.local_diagonal_indices()

In [None]:
#%%px
#gi, lri, lci = dMI.local_diagonal_indices()

In [None]:
#%%px
#diagMI.local_array[lri,0] = dMI.local_array[lri,lci]

In [None]:
#%%px
#print ('rank %d has local_shape of dMI = %s' % (rank, diagMI.local_shape))
#print ('rank %d has block_shape of dMI = %s' % (rank, diagMI.block_shape))
#print(diagMI.local_array)

In [None]:
#%%px
#diagMI_dot = dot(dMI_diag,dMI_diag,transB='T')
#diagMI_dot = dMI_diag.T * dMI_diag

#diagMI_dot.local_array[lri,lci] = 1.0/np.sqrt(diagMI_dot.local_array[lri,lci])

In [None]:
#%%px
#print(diagMI_dot.local_array)

In [None]:
#%%px
#print(diagMI_dot.local_array)

In [None]:
#%%px
#diagMI_normed = dMI*diagMI_dot

In [None]:
#%%px
#print(dMI_normed.local_array)

In [None]:
#%%px
##lr=[range(2400,2495)]
##lc=[range(2400,2495)]
#print(len(dMI.local_array[lri,lci]))
#diag=dMI.local_array[lri,lci]
#print(diag)

In [None]:
#%%px
#dMI_dot=dot(dMI,dMI,transB='T')

In [None]:
#%%px
#dMI_dot.desc

In [None]:
#%%px
#dgi, dlri, dlci = dMI_dot.local_diagonal_indices()
#print(dMI_dot.local_array[dlri,dlci])

In [None]:
#%%px
#Note these values are in mixed order due to block cyclic partition. last rank last block does not correspond to last values in global diagonal
#print(diag)


In [None]:
%%px
# Print the eigenvalues once, from rank 0 only, instead of once per rank.
if rank==0:
    print(evals)

In [None]:
%%px
if rank==0:
    # NOTE(review): this cell used gZd, but the only visible assignment
    # (gZd = dZd.to_global_array(rank=0)) is commented out; evecs is assigned
    # that same expression, so it is used here instead.
    # The original line was a dangling "nnzgZd=" (a syntax error). Going by
    # the name, it is presumably the count of non-zero entries -- confirm.
    nnzgZd = np.count_nonzero(evecs)
    print(len(evecs))
    print(evecs)

In [None]:
#compute eigenvalues,vectors
#evals1, evecs1 = scalapy.eigh(dMI, overwrite_a=False)

## Write distributed matrix to files

In [None]:
%%px
## test function to sort out copy_from_np
from mpi4py import MPI
from scalapy import *

def _chk_2d_size(shape, positive=True):
    # Check that the shape describes a valid 2D grid. Zero shape not allowed when positive = True.
    if len(shape) != 2:
        return False
    if positive:
        if shape[0] <= 0 or shape[1] <= 0:
            return False
    else:
        if shape[0] < 0 or shape[1] < 0:
            return False
    return True


def _clamp_start(index, length):
    # Resolve a possibly negative start index against `length`, then clamp it to [0, length].
    if index < 0:
        index += length
    return min(max(0, index), length)


def copy_from_np(dmat, a, asrow=0, anrow=None, ascol=0, ancol=None, srow=0, scol=0, block_shape=None, rank=0):
    """Copy a window of a NumPy array held on one rank into a distributed matrix.

    Copies ``a[asrow:asrow+anrow, ascol:ascol+ancol]`` into
    ``dmat[srow:srow+anrow, scol:scol+ancol]``, one ``block_shape`` tile at a
    time, using the ``p?gemr2d`` routine selected by ``dmat.sc_dtype``.

    Every rank of ``dmat.context.mpi_comm`` must call this function: it
    broadcasts from ``rank`` and every rank issues the same sequence of
    ``p?gemr2d`` calls.

    Parameters
    ----------
    dmat : distributed matrix that receives the data (modified in place).
    a : 1-D or 2-D array; only read on ``rank``, ignored on all other ranks.
        A 1-D array is treated as a single row.
    asrow, ascol : first row / column of the window in ``a``. Negative values
        count from the end; values are clamped to the array bounds.
    anrow, ancol : number of rows / columns to copy. ``None`` means
        "everything from the start index on". The window is truncated to what
        fits in both ``a`` and ``dmat``.
    srow, scol : first row / column of the destination in ``dmat`` (negative
        values count from the end; clamped to the global shape).
    block_shape : tile size used for the copy; defaults to ``dmat.block_shape``.
    rank : rank that owns ``a``.

    Returns
    -------
    ``dmat`` itself.

    Raises
    ------
    ScalapyException
        If ``a`` has more than two dimensions (checked on ``rank`` only) or
        ``block_shape`` is not a valid positive 2-D shape.
    """
    comm = dmat.context.mpi_comm
    is_source = comm.rank == rank

    Nrow, Ncol = dmat.global_shape
    srow = _clamp_start(srow, Nrow)
    scol = _clamp_start(scol, Ncol)

    if is_source:
        if not (a.ndim == 1 or a.ndim == 2):
            raise ScalapyException('Unsupported high dimensional array.')

        a = np.asfortranarray(a.astype(dmat.dtype)) # type conversion
        a = a.reshape(-1, a.shape[-1]) # reshape to two dimensional
        am, an = a.shape
        asrow = _clamp_start(asrow, am)
        ascol = _clamp_start(ascol, an)
        # Window size: requested (or remaining) extent, never negative and
        # never more than fits in the source array or the destination matrix.
        m = am - asrow if anrow is None else anrow
        m = min(max(0, m), am - asrow, Nrow - srow)
        n = an - ascol if ancol is None else ancol
        n = min(max(0, n), an - ascol, Ncol - scol)
    else:
        # Placeholders only; replaced by the values broadcast from `rank`.
        m, n = 1, 1

    # One broadcast carries the source offsets and the window size (rows, columns).
    asrow, ascol, m, n = comm.bcast((asrow, ascol, m, n), root=rank)
    if m == 0 or n == 0:
        return dmat

    block_shape = dmat.block_shape if block_shape is None else block_shape
    if not _chk_2d_size(block_shape):
        raise ScalapyException("Invalid block_shape")
    bm, bn = block_shape
    br = blockcyclic.num_blocks(m, bm) # number of blocks for row
    bc = blockcyclic.num_blocks(n, bn) # number of blocks for column
    rm = m - (br - 1) * bm # remaining rows in the last block
    rn = n - (bc - 1) * bn # remaining columns in the last block

    # due to bugs in scalapy, it is needed to first init a process context here
    ProcessContext([1, comm.size], comm=comm) # process context

    # The routine depends only on the matrix dtype code, so resolve it once
    # instead of re-importing and rebuilding the table for every tile.
    from scalapy import lowlevel as ll
    gemr2d = {'S': ll.psgemr2d,
              'D': ll.pdgemr2d,
              'C': ll.pcgemr2d,
              'Z': ll.pzgemr2d}[dmat.sc_dtype]

    for bri in range(br):
        M = bm if bri != br - 1 else rm
        for bci in range(bc):
            N = bn if bci != bc - 1 else rn
            if is_source:
                # Describe `a` as a matrix living entirely on this rank.
                pc = ProcessContext([1, 1], comm=MPI.COMM_SELF) # process context
                # NOTE(review): relies on dmat.desc returning a fresh array on
                # each access, since it is modified below -- confirm.
                desc = dmat.desc
                desc[1] = pc.blacs_context # new context containing only this rank
                desc[2], desc[3] = a.shape
                desc[4], desc[5] = a.shape
                desc[8] = a.shape[0]
                source = a
            else:
                # These ranks do not own the source array but must still take
                # part in the call; they pass a dummy buffer and a descriptor
                # whose context entry is -1.
                desc = np.zeros(9, dtype=np.int32)
                desc[1] = -1
                source = np.zeros(1, dtype=dmat.dtype)

            # The +1 converts the 0-based offsets to the 1-based indices the
            # routine presumably expects (Fortran convention).
            gemr2d(M, N, source, asrow+1+bm*bri, ascol+1+bn*bci, desc,
                   dmat._local_array, srow+1+bm*bri, scol+1+bn*bci, dmat.desc, dmat.context.blacs_context)
    return dmat

In [None]:
%%px
##formerly used to copy to a global array
#copy SM[i][j] from each rank to root rank 0
# collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
import collections.abc

# The b*(b+1)/2 upper-triangular blocks (i <= j) are numbered row by row
# (idx) and split evenly over the ranks, so block idx is expected on rank
# idx // n_jobs_per_rank. n_jobs_per_rank may be fractional.
n_jobs_per_rank = (b * (b + 1) // 2) / comm.Get_size()
for i in range(b):
    for j in range(i, b): # j in range [i,b)
        idx = i * b + j - (i * (i + 1)) // 2  # i*(i+1) is always even
        # Floor-dividing by a float yields a float; the MPI source rank must be an int.
        srank = int(idx // n_jobs_per_rank)
        # presumably SM[i][j] is an array on the rank that computed it and a
        # non-iterable placeholder elsewhere -- verify against the cell that fills SM
        have_block = isinstance(SM[i][j], collections.abc.Iterable)
        if rank != 0 and have_block:
            # idx doubles as the message tag so rank 0 can match each block
            comm.send(SM[i][j], dest=0, tag=idx)
        elif rank == 0 and not have_block:
            SM[i][j] = comm.recv(source=srank, tag=idx)
            

In [None]:
%%px
#create and fill distributed matrix from global array
from scalapy import blacs
import os
import numpy as np
import scipy.linalg as la
from mpi4py import MPI
from scalapy import core
import scalapy.routines as rt

#distribute MI components to ranks as scalapack distributed matrix
comm = MPI.COMM_WORLD
rank = comm.rank
size = comm.size #total number of ranks

global_num_rows =g_nrows
global_num_cols =g_nrows
local_num_rows =g_nrows/b

block_size=64 #default is 32

#Define process grid with process rows (PR) and process cols (PC)
#We'll use a 2d process grid to distribute blocks so we want to have num_ranks divisible by 2
assert((size % 2)==0)
#ideally PR and PC both equal the square root of num_ranks, giving a square process grid.
#Otherwise take the largest divisor of size that does not exceed sqrt(size), so that
#PR*PC == size exactly. Using int(sqrt(size)) and size//PR alone leaves ranks out of
#the grid for some sizes (e.g. 10 ranks -> 3x3 = 9).
PR=int(np.sqrt(size))
while size % PR != 0:
    #terminates at the latest with PR == 1, which divides every size
    PR -= 1
PC=size//PR
if rank==0:
    print("PR=",PR, "PC=",PC)

#create mpi comm context to send to blacs communicator    
#process context
#do I need this?
#cntxt = blacs.sys2blacs_handle(comm)
#initialize a process grid to distribute matrix blocks to.
#blacs.gridinit(ctxt,order='R',BR,BC)
#blacs.gridinit(pc,order='R',BR,BC)
#core.initmpi([PR, PC], block_shape=[local_num_rows,local_num_rows])
#instead of creating a global array on every processor, we want to load a 
#global array with local data
#MI=core.DistributedMatrix(global_shape=[global_num_rows,global_num_cols])


core.initmpi([PR, PC],block_shape=[block_size,block_size])

#convert to fortran array indexing to match scalapack functions
global_MI=np.asfortranarray(global_MI)
#create global matrix from array on rank0
dMI=core.DistributedMatrix.from_global_array(global_MI,rank=0)

#Array must be 2D, so this fails
#convert to fortran array indexing to match scalapack functions
#global_diag=np.asfortranarray(global_diag)
#create global matrix from array on rank0
#dMI_diag=core.DistributedMatrix.from_global_array(global_diag,rank=0)




#blacs.gridinit(ctxt, b, b)
#ranklist = [(0, 0), 
#            (0, 1), (1, 1),
#            (0, 2), (1, 2), (2, 2),
#            (0, 3), (1, 3), (2, 3), (3, 3),
#            (0, 4), (1, 4), (2, 4), (3, 4), (4, 4)
#           ]