In [1]:
import numpy as np

import scipy.sparse
import pickle
import igraph

In [2]:
# Step up one directory; presumably this makes the relative "data/..." paths used below resolve — confirm.
%cd ..

/home/kuba/Projects/github_search


## NOTE

Pairwise matrix should contain distances between nodes

In [3]:
# Load the pickled call graph. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code on load — only unpickle files you produced yourself.
with open("data/call_igraph.pkl", 'rb') as graph_file:
    call_graph_i = pickle.load(graph_file)

In [4]:
# Number of vertices in the loaded call graph (displayed as the cell output).
call_graph_i.vcount()

198517

## Graph distances

Because the graph has almost 200k nodes, an all-pairs shortest-path computation yields a 200k x 200k matrix.
Stored as float64 — the dtype such algorithms usually return — this matrix would need roughly 315 GB, which is infeasible.

Because of this we run the shortest-paths algorithm in batches and store the result in a *uint8* matrix. Even that takes over 30 GB (about 39 GB for 198,517 nodes), so it only fits on machines with that much RAM available.

In [5]:
import tqdm


def get_distance_matrix(igraph, batch_size, dtype='uint8'):
    """Compute the all-pairs shortest-path distance matrix in row batches.

    Distances are requested from the graph ``batch_size`` source vertices at a
    time, so only one ``(batch_size, n)`` block of raw distances is held in
    memory next to the compact ``(n, n)`` result.

    Parameters
    ----------
    igraph : graph object
        Must provide ``vcount()`` and either ``distances(source)``
        (python-igraph >= 0.10) or ``shortest_paths(source)``.
    batch_size : int
        Number of source vertices processed per call; must be >= 1.
    dtype : numpy dtype or str, default 'uint8'
        dtype of the returned matrix.

    Returns
    -------
    numpy.ndarray
        ``(n, n)`` array of ``dtype``. For integer dtypes, unreachable pairs
        (infinite distance) and distances that do not fit are stored as the
        largest value of the dtype (255 for uint8) instead of being cast with
        undefined/wrapping results.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    n = igraph.vcount()
    out_dtype = np.dtype(dtype)
    dists = np.zeros((n, n), dtype=out_dtype)

    # python-igraph 0.10 renamed shortest_paths() to distances(); support both.
    compute_distances = getattr(igraph, "distances", None) or igraph.shortest_paths

    is_integer = np.issubdtype(out_dtype, np.integer)
    sentinel = np.iinfo(out_dtype).max if is_integer else None

    # Stepping over the full range (and clamping the last stop) handles
    # n <= batch_size and a ragged final batch without a separate tail call.
    for start in tqdm.tqdm(range(0, n, batch_size)):
        stop = min(start + batch_size, n)
        batch = np.asarray(compute_distances(list(range(start, stop))), dtype=np.float64)
        if is_integer:
            # Saturate: inf and anything >= the dtype maximum become the sentinel.
            rows = np.full(batch.shape, sentinel, dtype=out_dtype)
            representable = batch < sentinel
            rows[representable] = batch[representable]
        else:
            rows = batch.astype(out_dtype)
        dists[start:stop] = rows
    return dists

In [6]:
# Toy undirected path graph 0-1-...-9 (10 vertices, 9 edges).
ig = igraph.Graph(n=10, edges=list(zip(range(9), range(1, 10))))

In [None]:
# Build the pairwise distance matrix for the call graph, 1000 source vertices per batch.
dists = get_distance_matrix(call_graph_i, 1000)

  0%|          | 0/198 [00:00<?, ?it/s]

In [None]:
# Row count of the distance matrix; used as the side length of the HDF5 dataset.
n = len(dists)

In [None]:
import numpy as np
import h5py
import dask.dataframe as dd

In [None]:
# Mode "a" opens the file read/write and creates it if missing; the previous
# read-only "r" mode makes create_dataset() fail.
f = h5py.File("data/call_graph_dists.hdf5", 'a')
if "dists" in f:
    # Reuse the dataset written by an earlier run (create_dataset would raise
    # on an existing name). Delete the file to force a rewrite.
    dset = f["dists"]
else:
    dset = f.create_dataset("dists", (n, n), dtype='uint8', data=dists)

In [None]:
# Look up the "dists" dataset in the open HDF5 file `f`.
dset = f['dists']

In [None]:
%%time
# Wrap the HDF5 dataset as a dask dataframe, chunked with chunksize=5000.
dists_dd = dd.from_array(dset, chunksize=5000)

In [None]:
%%time
hcos_dists_dd = dists_dd.map_partitions(lambda x: - np.cosh(x))

In [None]:
from dask_ml.decomposition import PCA as DaskPCA
from dask_ml.decomposition import IncrementalPCA as DaskIncrementalPCA
from dask_ml.decomposition import TruncatedSVD as DaskTruncatedSVD

# 10-component incremental PCA. The randomized SVD solver is stochastic, so pin
# random_state to make the fitted components reproducible across runs.
ipca = DaskIncrementalPCA(n_components=10, svd_solver='randomized', random_state=0)

In [None]:
# Show the dask-array form of the transformed dataframe; the result is only displayed, not stored.
hcos_dists_dd.to_dask_array()

In [None]:
import tqdm
import pickle

In [None]:
# Feed the incremental PCA one dataframe partition at a time, as float32.
for i in tqdm.tqdm(range(hcos_dists_dd.npartitions)):
    partition = hcos_dists_dd.get_partition(i)
    partition_f32 = partition.to_dask_array(lengths=True).astype('float32')
    ipca.partial_fit(partition_f32)

In [None]:
# Persist the fitted model. The context manager guarantees the file is flushed
# and closed once the dump finishes.
with open("data/ipca.pkl", "wb") as ipca_file:
    pickle.dump(ipca, ipca_file)

In [None]:
%%time
# Project the transformed distances with the fitted incremental PCA.
# NOTE(review): the result is not assigned to a name, and unlike the partial_fit
# input it is not cast to float32 — confirm both are intended.
ipca.transform(hcos_dists_dd.to_dask_array(lengths=True))

In [None]:
# Close the HDF5 file handle `f` opened earlier in the notebook.
f.close()