In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

## Methods
### Compressed sparse column (CSC) matrices

In [None]:
data = np.array([1, 2, 3, 4, 5, 6])
row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])
sp.sparse.csc_matrix((data, (row, col)), shape=(3, 3)).toarray()

In [None]:
# topics = ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
#           'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
#           'energy', 'optics', 'earth science', 'geology', 'meteorology']
topics = ['earth science']

In [None]:
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    networks[topic].load_graph(path_saved + topic + '.pickle')

In [None]:
len(networks[topic].graph.nodes)

In [None]:
v = networks[topic].graph.graph['tfidf']
v

In [None]:
v.sum()

In [None]:
v[:,0].indices[:5]

In [None]:
v[4,0]

In [None]:
networks[topic].graph.name

In [None]:
networks[topic].graph.nodes['Biology']

In [None]:
core = [n for n in networks[topic].graph.nodes if networks[topic].graph.nodes[n]['core_rb']>.9]
core

In [None]:
[(i,n) for i,n in enumerate(networks[topic].graph.nodes) if networks[topic].graph.nodes[n]['year']<-1800]

In [None]:
vi = v[:,9]
vi

### CSC & networkx operations

In [None]:
graph = networks[topic].graph

In [None]:
core = [n for n in networks[topic].graph.nodes if networks[topic].graph.nodes[n]['year']<-2000]
subgraph = graph.subgraph(core).copy()

In [None]:
import scipy.sparse as ss

In [None]:
tfidf = ss.hstack([v[:,list(graph.nodes).index(n)] for n in subgraph.nodes])
tfidf

In [None]:
subgraph.nodes

In [None]:
subgraph.add_node('Hello')

In [None]:
subgraph.nodes

## Algorithm

Initialize with core set of nodes.\
For each year,\
initialize a "baby" node for each existing node that doesn't already have a baby node,\
mutate tf-idf for each "baby" node (including the name),\
and if the "baby" node gets a probability drawn from the distribution of similarities (to what?),
the "baby" node is born.

In [None]:
import sklearn.metrics.pairwise as smp
import scipy.sparse as ss
from scipy.stats import norm

### Mutation

#### Prior: power law distributions of weights

In [None]:
import seaborn as sn
graph = networks[topic].graph
tfidf = graph.graph['tfidf']
x = tfidf[:,1].copy()

In [None]:
import powerlaw
fit = powerlaw.Fit(x.data)
fit.xmin, fit.alpha

In [None]:
fit.plot_pdf()
fit.power_law.plot_pdf()

#### Prior: new words / year between neighbors
[gist](https://gist.github.com/ptocca/e18a9e4e35930c0958fdaa62958bdf82)

In [None]:
graph = networks[topic].graph
year_diffs = [[graph.nodes[node]['year'] - graph.nodes[neighbor]['year']
               for neighbor in list(graph.successors(node))]
               # + list(graph.predecessors(node))]
              for node in graph.nodes]
year_diffs = [y for ys in year_diffs for y in ys]

In [None]:
sns.distplot(year_diffs)
plt.title(topic)
plt.xlabel('year difference');

In [None]:
import sklearn.preprocessing as skp
import sklearn.metrics.pairwise as smp

In [None]:
skp.binarize(tfidf[:,0:2]).transpose()

In [None]:
%load_ext cython

In [None]:
%%cython -f

import numpy as np
cimport numpy as np
from cython cimport floating,boundscheck,wraparound
from cython.parallel import prange

from libc.math cimport fabs

np.import_array()

#@boundscheck(False)  # Deactivate bounds checking
@wraparound(False)
def cython_manhattan(floating[::1] X_data, int[:] X_indices, int[:] X_indptr,
                     floating[::1] Y_data, int[:] Y_indices, int[:] Y_indptr,
                     double[:, ::1] D):
    """Pairwise L1 distances for CSR matrices.

    The three arrays per operand are the ``data``, ``indices`` and ``indptr``
    members of a CSR matrix. ``D`` is a preallocated 2-D output buffer;
    entry ``D[px, py]`` is overwritten with the L1 distance between row
    ``px`` of X and row ``py`` of Y. Nothing is returned.

    Each pair of rows is combined with a two-pointer merge over their column
    indices, so the result is presumably only correct when the indices within
    every row are sorted and free of duplicates -- TODO confirm callers
    guarantee this.

    Usage:
    >>> D = np.zeros((X.shape[0], Y.shape[0]))
    >>> cython_manhattan(X.data, X.indices, X.indptr,
    ...                  Y.data, Y.indices, Y.indptr,
    ...                  D)
    """
    cdef np.npy_intp px, py, i, j, ix, iy
    cdef double d = 0.0
    
    # The loop bounds come from the output buffer, not from the indptr arrays.
    cdef int m = D.shape[0]
    cdef int n = D.shape[1]
    
    with nogil:                          
        # Rows of X are distributed with prange; rows of Y are scanned serially.
        for px in prange(m):
            for py in range(n):
                # i / j walk the stored entries of row px of X / row py of Y.
                i = X_indptr[px]
                j = Y_indptr[py]
                # NOTE(review): the accumulator is written as ``d = d + ...``
                # throughout, presumably so Cython does not treat ``d`` as a
                # prange reduction variable -- confirm before changing to ``+=``.
                d = 0.0
                while i < X_indptr[px+1] and j < Y_indptr[py+1]:
                    # Both guards are already implied by the while condition.
                    if i < X_indptr[px+1]: ix = X_indices[i]
                    if j < Y_indptr[py+1]: iy = Y_indices[j]
                    
                    if ix==iy:
                        # Column stored in both rows: add |x - y|.
                        d = d+fabs(X_data[i]-Y_data[j])
                        i = i+1
                        j = j+1
                    
                    elif ix<iy:
                        # Column stored only in the X row: add |x|.
                        d = d+fabs(X_data[i])
                        i = i+1
                    else:
                        # Column stored only in the Y row: add |y|.
                        d = d+fabs(Y_data[j])
                        j = j+1
                
                # One row is exhausted; add the absolute values left in the other.
                if i== X_indptr[px+1]:
                    while j < Y_indptr[py+1]:
                        iy = Y_indices[j]  # read but not used afterwards
                        d = d+fabs(Y_data[j])
                        j = j+1                                            
                else:
                    while i < X_indptr[px+1]:
                        ix = X_indices[i]  # read but not used afterwards
                        d = d+fabs(X_data[i])
                        i = i+1
                        
                D[px,py] = d

In [None]:
from scipy.sparse import csr_matrix,random
from sklearn.metrics.pairwise import check_pairwise_arrays
def sparse_manhattan(X,Y=None):
    """Pairwise Manhattan (L1) distances between the rows of two sparse matrices.

    Parameters
    ----------
    X : array-like or sparse matrix, one sample per row
    Y : array-like or sparse matrix, optional
        Passed through ``check_pairwise_arrays`` together with ``X``, which
        supplies the value used when ``Y`` is None.

    Returns
    -------
    numpy.ndarray of shape (X.shape[0], Y.shape[0])
        Dense distance matrix filled in by ``cython_manhattan``.

    Notes
    -----
    The inputs are converted with ``copy=False``, so an input that is already
    CSR may have its indices sorted / duplicates merged in place. This does
    not change the values the matrix represents.
    """
    X, Y = check_pairwise_arrays(X, Y)
    X = csr_matrix(X, copy=False)
    Y = csr_matrix(Y, copy=False)
    # cython_manhattan merges two rows by walking their column indices in
    # step, which is only correct for sorted, duplicate-free indices.
    # sum_duplicates() puts both matrices into that canonical form.
    X.sum_duplicates()
    Y.sum_duplicates()
    # Every entry is overwritten by the kernel, so no zero-fill is needed.
    res = np.empty(shape=(X.shape[0],Y.shape[0]))
    cython_manhattan(X.data,X.indices,X.indptr,
                     Y.data,Y.indices,Y.indptr,
                     res)
    return res

In [None]:
# Binarise the tf-idf weights (word present / absent) and compare documents
# row-wise: the L1 distance between two binary rows is the number of words
# that appear in exactly one of the two documents.
# `binarize` lives in sklearn.preprocessing (imported as `skp`), not in scipy.
dists = sparse_manhattan(X=skp.binarize(tfidf).transpose())
dists

In [None]:
# Word distance along every edge (node -> successor), in the same order as
# `year_diffs` so the two lists can be plotted against each other.
nodes = list(graph.nodes)
# Look node positions up in a dict instead of calling list.index per edge.
node_pos = {node: i for i, node in enumerate(nodes)}
word_diffs = [dists[node_pos[node], node_pos[neighbor]]
              for node in nodes
              for neighbor in graph.successors(node)]

In [None]:
# Regress the word distance on the absolute year gap between linked nodes.
abs_year_diffs = np.abs(year_diffs)
sns.scatterplot(x=abs_year_diffs, y=word_diffs)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_year_diffs, word_diffs)
# Span the fitted line over the plotted (absolute) range; max(year_diffs)
# would cut the line short whenever the largest gap is negative.
x = np.linspace(0, abs_year_diffs.max(), 100)
# x / y are passed by keyword: recent seaborn releases reject positional data.
sns.lineplot(x=x, y=slope * x + intercept)
plt.title(f"slope={slope:.4f}; r={r:.4f}; p={p:.4f}")
plt.xlabel('year')
plt.ylabel('manhattan distance');

In [None]:
sns.distplot(word_diffs)
mu, std = sp.stats.norm.fit(word_diffs)
x = np.linspace(min(word_diffs), max(word_diffs), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.xlabel('manhattan distance')
plt.ylabel('probability distribution');

#### Prior: similarity / year between neighbors

In [None]:
# Cosine similarity along every edge (node -> successor), in the same order
# as `year_diffs`. Predecessors are intentionally not included.
nodes = list(graph.nodes)
# Look node positions up in a dict instead of calling list.index per edge.
node_pos = {node: i for i, node in enumerate(nodes)}
neighbors = [smp.cosine_similarity(v[:, node_pos[node]].transpose(),
                                   v[:, node_pos[neighbor]].transpose())[0,0]
             for node in nodes
             for neighbor in graph.successors(node)]

In [None]:
# Regress neighbour similarity on the absolute year gap between linked nodes.
abs_year_diffs = np.abs(year_diffs)
sns.scatterplot(x=abs_year_diffs, y=neighbors)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_year_diffs, neighbors)
# Span the fitted line over the plotted (absolute) range; max(year_diffs)
# would cut the line short whenever the largest gap is negative.
x = np.linspace(0, abs_year_diffs.max(), 100)
# x / y are passed by keyword: recent seaborn releases reject positional data.
sns.lineplot(x=x, y=slope * x + intercept)
plt.title(f"slope={slope:.4f}; r={r:.4f}; p={p:.4f}")
plt.xlabel('year')
plt.ylabel('cosine similarity');

#### Method

In [None]:
import numpy.random as npr

def mutate(x, rvs=lambda: npr.rand(), p_insert=.3, p_delete=.3, p_point=1):
    """Return a randomly mutated copy of a sparse column vector.

    Up to three mutations are applied, in this order, each with its own
    probability: a point mutation (one stored weight is redrawn), a deletion
    (one stored entry is removed) and an insertion (a weight is added at a
    row that currently has no stored entry).

    Parameters
    ----------
    x: scipy.sparse.csc_matrix
        Column vector of shape (n, 1). It is not modified.
    rvs: lambda ()-> float
        returns a random weight for point mutations and insertions
    p_insert: float
        probability of inserting a new entry
    p_delete: float
        probability of deleting a stored entry
    p_point: float
        probability of redrawing the weight of a stored entry

    Returns
    -------
    scipy.sparse.csc_matrix
        New vector with the same shape as ``x``.
    """
    # Work on copies so the caller's vector is never changed in place.
    data = x.data.copy()
    idx = x.indices.copy()
    # The coin is always flipped first so the random stream does not depend
    # on whether the vector happens to be empty.
    if npr.rand() < p_point and data.size > 0:
        data[npr.choice(data.size)] = rvs()
    if npr.rand() < p_delete and idx.size > 0:
        delete_idx = npr.choice(idx.size)
        idx = np.delete(idx, delete_idx)
        data = np.delete(data, delete_idx)
    # Skip insertion when every row is already occupied; the rejection loop
    # below could otherwise never terminate.
    if npr.rand() < p_insert and idx.size < x.shape[0]:
        while True:
            insert_idx = npr.choice(x.shape[0])
            if insert_idx not in idx: break
        idx = np.append(idx, insert_idx)
        data = np.append(data, rvs())
    y = ss.csc_matrix((data, (idx, np.zeros(idx.shape, dtype=int))),
                      shape=x.shape)
    return y

#### Test

In [None]:
x = tfidf[:,1].copy()
y = tfidf[:,1].copy()

In [None]:
T = 100
sim = np.zeros(T)
size = np.zeros(T)
for i in range(sim.size):
    sim[i] = smp.cosine_similarity(x.transpose(),y.transpose())[0,0]
    size[i] = y.size
    y = mutate(y, lambda: fit.power_law.generate_random()[0])
plt.figure()
sn.lineplot(x=range(sim.size), y=sim)
plt.title(graph.name)
plt.ylabel('similarity')
plt.xlabel('years');
plt.figure()
sn.lineplot(x=range(sim.size), y=size)
plt.title(graph.name)
plt.ylabel('size')
plt.xlabel('years');

In [None]:
import numpy as np
import matplotlib.pyplot as plt
def plot_distribution(data):
    """Plot the empirical distribution of ``data`` on log-log axes.

    The values are counted in 20 logarithmically spaced bin edges running
    from ``min(data)`` to ``max(data)``. Each count is divided by
    ``len(data)`` and drawn as a dot at the left edge of its bin, in a new
    figure.
    """
    plt.figure()
    n_obs = len(data)
    lo_exp = np.log10(min(data))
    hi_exp = np.log10(max(data))
    bins = np.logspace(lo_exp, hi_exp, 20)
    hist, _ = np.histogram(data, bins=bins)
    fractions = hist / n_obs
    plt.plot(bins[:-1], fractions, '.')
    plt.yscale('log')
    plt.xscale('log')
    # Lower y limit: half of the smallest non-zero fraction.
    smallest_count = min(hist[hist > 0])
    plt.ylim(smallest_count / n_obs / 2, 1)

In [None]:
plot_distribution(graph.graph['tfidf'][:,1].data)
plt.title(graph.name + ' before mutation')
plt.xlabel('tf-idf values');
plot_distribution(y.data)
plt.title(graph.name + ' after mutation')
plt.xlabel('tf-idf values');

### Create new nodes

#### Prior: distribution of similarities

In [None]:
# Cosine similarity between every node and each of its successors,
# skipping self-loops. Predecessors are intentionally not included.
# Node positions are looked up in a dict instead of rebuilding
# list(graph.nodes) and scanning it for every lookup.
node_pos = {node: i for i, node in enumerate(graph.nodes)}
neighbors = [smp.cosine_similarity(v[:, node_pos[node]].transpose(),
                                   v[:, node_pos[neighbor]].transpose())[0,0]
             for node in graph.nodes
             for neighbor in graph.successors(node)
             if neighbor != node]  # compare by value, not object identity

In [None]:
# Cosine similarity between every node and each node it is NOT linked to
# (excluding itself), in (n1, n2) order.
doc_rows = v.transpose().tocsr()  # one tf-idf row per node
non_neighbors = []
for i, n1 in enumerate(graph.nodes):
    # One similarity call per node (against all documents) instead of one
    # call per pair of nodes.
    sims = smp.cosine_similarity(doc_rows[i], doc_rows)[0]
    # Build the adjacency set once per n1 rather than once per (n1, n2).
    linked = set(graph.neighbors(n1))
    for j, n2 in enumerate(graph.nodes):
        # compare by value, not object identity
        if n2 != n1 and n2 not in linked:
            non_neighbors.append(sims[j])

In [None]:
mu, std = sp.stats.norm.fit(neighbors)

In [None]:
plt.figure()
sns.distplot(neighbors)
x = np.linspace(min(neighbors), max(neighbors), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
sns.distplot(non_neighbors)
plt.title(topic)
plt.legend([f"fit-neighbors (m={mu:.2f}; s={std:.2f})", 'neighbors', 'non-neighbors'])
plt.xlabel('cos similarity');

#### Method

Just draw from normal pdf

#### Test

In [None]:
npr.normal(loc=mu, scale=std, size=4)

### Crossover

What prior should I use? It needs to be more similar than neighbors. Some kind of a t-test?

#### Prior: maybe just 3 std above mean?

In [None]:
mu + 3*std

#### Method

average? or combine elements?

In [None]:
type(tfidf[:,0].data)

In [None]:
np.ndarray(0)

In [None]:
tfidf.shape, (tfidf.shape[0],)

In [None]:
def crossover(scientists):
    """Combine several tf-idf vectors into one offspring vector.

    Work in progress: the rule for choosing which entries the offspring
    inherits is not implemented yet, so the returned vector is currently
    empty (all zeros).

    Parameters
    ----------
    scientists:
        Iterable collection of parent vectors exposing ``shape``.
        NOTE(review): ``shape[0]`` is taken as the length of the offspring
        vector, i.e. presumably the vocabulary size -- confirm.

    Returns
    -------
    scipy.sparse.csc_matrix
        Column vector of shape (scientists.shape[0], 1).
    """
    # Row indices must be integers; weights are floats.
    idx = np.empty(0, dtype=int)
    data = np.empty(0, dtype=float)
    for scientist in scientists:
        # TODO: append the rows / weights inherited from this parent to
        # `idx` / `data` (e.g. a random choice among its stored entries).
        pass
    # csc_matrix needs a 2-D shape; use a single column like `mutate` does.
    y = ss.csc_matrix((data, (idx, np.zeros(idx.shape, dtype=int))),
                      shape=(scientists.shape[0], 1))
    return y

### Connect nodes

#### Get words from tf-idf vector

In [None]:
import pickle
import gensim.utils as gu

path_models = '/Users/harangju/Developer/data/wiki/models/'
model = gu.SaveLoad.load(path_models + 'tfidf.model')
dct = pickle.load(open(path_models + 'dict.model','rb'))

In [None]:
tfidf[:,0]

In [None]:
words = [dct[i] for i in tfidf[:,0].indices]
words[:5]

#### Prior: word weight vs title

In [None]:
for i in range(tfidf.shape[1]):
    idx_max = np.argmax(tfidf[:,i].data)
    idx = tfidf[:,i].indices[idx_max]
    word = dct[idx]
    node = list(graph.nodes)[i]
    print(i, idx_max, idx, word, node)

### Evolve
1. Initialize a bag of scientists from a set of nodes.
2. Mutate nodes. For each node,
    1. Change a word with `p_point`. Draw weight from power law prior.
    2. Delete a word with `p_delete`.
    3. Insert new word with `p_insert`. Draw weight from power law prior.
3. Crossover new nodes if `μ+3σ < similarity`.
    1. Crossover nodes into one.
4. Create new node if `x < similarity` where `x~Norm(θ)`.
    1. Connect new node.
    2. Initialize new scientist.

In [None]:
def evolve(graph, tfidf, year_end=2020):
    """Evolves a graph based on tf-idf representations.

    Parameters
    ----------
    graph:
        Graph whose nodes carry a ``'year'`` attribute. Column ``i`` of
        ``tfidf`` must belong to the ``i``-th node of ``graph.nodes``.
    tfidf:
        Sparse matrix supporting column slicing, one column per node.
    year_end: int
        Intended last simulated year. NOTE: not used yet -- only the first
        year after the newest node is simulated.

    Returns
    -------
    (graph, tfidf)
        Both are currently returned unchanged, because the step that creates
        new nodes is still a disabled placeholder.
    """
    year_start = max(graph.nodes[n]['year'] for n in graph.nodes)+1
    seed_nodes = {}
    for year in range(year_start, year_start+1):#year_end+1):
        # Give every node without a seed a copy of its own tf-idf column.
        # enumerate() yields the column index directly, avoiding a
        # list(graph.nodes).index(node) scan per node.
        for column, node in enumerate(graph.nodes):
            if node not in seed_nodes:
                seed_nodes[node] = tfidf[:,column].copy()
        # Replacing values (not keys) while iterating a dict is safe.
        for node, vec in seed_nodes.items():
            seed_nodes[node] = mutate(vec)
        # TODO: join seeds / crossover
        for node, vec in seed_nodes.items():
            # Placeholder: the birth condition is not implemented yet, so
            # this branch never runs.
            if False:
                graph.add_node(node)
                # connect node
                tfidf = ss.hstack([tfidf,vec])
                seed_nodes[node] = None
        print(year, '\t', seed_nodes, '\n')
    return graph, tfidf

In [None]:
def algorithm(graph, core_year=-2000):
    """Grows a new graph with a core set of nodes selected from ``graph``.
    Assumes ``graph.graph['tfidf']`` holds a ``scipy.sparse.csc_matrix`` of 
    tf-idf vectors, with column ``i`` belonging to the ``i``-th node of
    ``graph.nodes``.

    Parameters
    ----------
    graph:
        Source graph whose nodes carry a ``'year'`` attribute.
    core_year: int
        Nodes with ``year < core_year`` form the core.

    Returns
    -------
    The evolved copy of the core subgraph, named ``graph.name + '-graft'``,
    with its tf-idf matrix stored under ``graph['tfidf']``.

    Raises
    ------
    ValueError
        If no node is older than ``core_year``.
    """
    source_tfidf = graph.graph['tfidf']
    # One pass to map nodes to columns, instead of list.index per core node.
    column_of = {n: i for i, n in enumerate(graph.nodes)}
    core = [n for n in graph.nodes if graph.nodes[n]['year'] < core_year]
    if not core:
        raise ValueError(f"no node with year < {core_year} to seed the graph")
    subgraph = graph.subgraph(core).copy()
    # Build the columns in the subgraph's own node order: `evolve` pairs
    # column i with the i-th node of the graph it receives, and the copied
    # subgraph is not guaranteed to list its nodes in the order of `core`.
    tfidf = ss.hstack([source_tfidf[:,column_of[n]] for n in subgraph.nodes],
                      format='csc')
    subgraph.graph.clear()
    subgraph.name = graph.name + '-graft'
    new_graph, new_tfidf = evolve(subgraph, tfidf)
    new_graph.graph['tfidf'] = new_tfidf
    return new_graph

#### Distributions

In [None]:
sns.distplot(graph.graph['tfidf'].sum(0))
plt.title(graph.name)
plt.xlabel('sum of tf-idf weights');

In [None]:
# https://stackoverflow.com/questions/37170511/scaled-logarithmic-binning-in-python
plot_distribution(graph.graph['tfidf'].data)
plt.title(graph.name)
plt.xlabel('tf-idf weights');

In [None]:
for i in range(20):
    plot_distribution(graph.graph['tfidf'][:,i].data)
    plt.title(list(graph.nodes)[i])
    plt.xlabel('tf-idf weights');

In [None]:
# check after mutation