In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp

### Compressed sparse column (CSC) matrices

In [None]:
# Toy example: build a 3x3 sparse matrix in CSC format from
# (data, (row, col)) triplets and display it as a dense array.
data = np.array([1, 2, 3, 4, 5, 6])
row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])
sp.sparse.csc_matrix((data, (row, col)), shape=(3, 3)).toarray()

In [None]:
# A longer topic list is kept below (disabled); only 'earth science' is
# currently analysed.
# topics = ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
#           'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
#           'energy', 'optics', 'earth science', 'geology', 'meteorology']
topics = ['earth science']

In [None]:
# Load the saved graph of every topic into ``networks`` (topic -> wiki.Net).
# NOTE(review): ``path_saved`` is an absolute, machine-specific path; the files
# are presumably pickles (judging by the extension) - only load trusted files.
# NOTE: ``topic`` stays bound to the last loaded topic and is used by later cells.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'
networks = {}
for topic in topics:
    print(topic, end=' ')
    net = networks[topic] = wiki.Net()
    net.load_graph(path_saved + topic + '.pickle')

In [None]:
# Number of nodes in the graph of the last loaded topic.
len(networks[topic].graph.nodes)

In [None]:
# tf-idf matrix stored as a graph-level attribute
# (presumably a scipy sparse matrix with one column per node - TODO confirm).
v = networks[topic].graph.graph['tfidf']
v

In [None]:
# Sum over all entries of ``v``.
v.sum()

In [None]:
# First five values of ``.indices`` for column 0
# (row indices of stored entries, assuming ``v`` is in CSC format).
v[:,0].indices[:5]

In [None]:
# Single entry of ``v``: row 4, column 0.
v[4,0]

In [None]:
# Name attribute of the loaded graph.
networks[topic].graph.name

In [None]:
# Attributes of the node 'Biology'.
# NOTE(review): presumably raises KeyError if the loaded topic's graph has no
# such node - confirm it exists for every topic in ``topics``.
networks[topic].graph.nodes['Biology']

In [None]:
# Nodes whose 'core_rb' attribute exceeds 0.9.
core = [n for n in networks[topic].graph.nodes if networks[topic].graph.nodes[n]['core_rb']>.9]
core

In [None]:
# (position, node) pairs for nodes whose 'year' attribute is below -1800.
[(i,n) for i,n in enumerate(networks[topic].graph.nodes) if networks[topic].graph.nodes[n]['year']<-1800]

In [None]:
# Column 9 of ``v`` (presumably the tf-idf vector of the node at position 9 -
# confirm that column order matches node order).
vi = v[:,9]
vi

### Distributions

In [None]:
# Successors of the node 'Cryosphere'.
list(networks[topic].graph.successors('Cryosphere'))

In [None]:
# Predecessors of the node 'Cryosphere'.
list(networks[topic].graph.predecessors('Cryosphere'))

In [None]:
# For every node, the difference between its own 'year' and the 'year' of each
# of its successors (predecessors are currently left out).
graph = networks[topic].graph
year_diffs = []
for node in graph.nodes:
    node_year = graph.nodes[node]['year']
    year_diffs.append([node_year - graph.nodes[neighbor]['year']
                       for neighbor in graph.successors(node)])

In [None]:
# Year differences for the first node in ``graph.nodes``.
year_diffs[0]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Distribution of year differences pooled over all (node, successor) pairs.
# NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases -
# check the installed version before relying on it.
sns.distplot([y for ys in year_diffs for y in ys])
plt.title(topic)
plt.xlabel('year difference');

In [None]:
import sklearn.metrics.pairwise as smp

In [None]:
# Cosine similarity between each node's tf-idf column and the column of each
# of its successors (self-loops excluded; predecessors currently left out).
# Assumes column i of ``v`` belongs to the i-th node of ``graph.nodes``.
# The node -> column map is built once instead of calling
# ``list(graph.nodes).index(...)`` twice per edge.
node_col = {n: i for i, n in enumerate(graph.nodes)}
similarities = [[smp.cosine_similarity(v[:, node_col[node]].transpose(),
                                       v[:, node_col[neighbor]].transpose())[0, 0]
                 for neighbor in graph.successors(node)
                 # compare by value: ``is not`` tests object identity, which is
                 # not reliable for equal node labels
                 if neighbor != node]  # + list(graph.predecessors(node))]
                for node in graph.nodes]

In [None]:
# Similarities between the first node and its successors.
similarities[0]

In [None]:
from scipy.stats import norm

In [None]:
# Fit a normal distribution to the pooled similarities.
mu, std = norm.fit([s for ss in similarities for s in ss])

In [None]:
# Distribution of pooled similarities with the fitted normal pdf drawn on top.
sns.distplot([s for ss in similarities for s in ss])
# Evaluate the fitted pdf over the x-range of the current axes.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
print("Fit results: mu = %.2f,  std = %.2f" % (mu, std))
plt.title(topic)
plt.xlabel('cos similarity');

### CSC & networkx operations

In [None]:
# NOTE: rebinds ``core`` (earlier: nodes with 'core_rb' > 0.9) to the nodes
# whose 'year' is below -2000, then takes an independent copy of the subgraph
# induced by them.
core = [n for n in networks[topic].graph.nodes if networks[topic].graph.nodes[n]['year']<-2000]
subgraph = graph.subgraph(core).copy()

In [None]:
import scipy.sparse as ss

In [None]:
# Stack the tf-idf columns of the subgraph's nodes side by side, in
# ``subgraph.nodes`` order; each column is looked up by the node's position
# in ``graph.nodes``.
tfidf = ss.hstack([v[:,list(graph.nodes).index(n)] for n in subgraph.nodes])
tfidf

In [None]:
# Nodes of the subgraph copy.
subgraph.nodes

In [None]:
# Add a dummy node to ``subgraph`` (which was created with ``.copy()``).
subgraph.add_node('Hello')

In [None]:
# Nodes of the subgraph copy again, for comparison.
subgraph.nodes

### Algorithm

Initialize with a core set of nodes.\
For each year,\
initialize a "baby" node for each existing node that doesn't already have one,\
mutate the tf-idf vector of each "baby" node (including the name),\
and if the "baby" node gets a probability drawn from the distribution of similarities (to what?),
the "baby" node is born.

In [None]:
def mutate(x, N_sub=2, N_add=2, idx_pool=None):
    """Mutates ``tf-idf`` representations & returns a new one.

    Removes ``N_sub`` randomly chosen words (stored entries) from ``x`` and
    adds ``N_add`` words that ``x`` does not contain yet. Every added word
    takes the weight of a randomly chosen removed word.

    NOTE: ``mutate`` is defined more than once in this notebook; the
    definition that was executed last is the one in effect.

    Parameters
    ----------
    x : scipy.sparse.csc_matrix, shape (n_words, 1)
        Sparse column vector to mutate. It is not modified.
    N_sub : int
        Number of stored words to remove.
    N_add : int
        Number of new words to add.
    idx_pool : sequence of int, optional
        Row indices that new words may be drawn from. Defaults to all rows.

    Returns
    -------
    scipy.sparse.csc_matrix
        New column vector with the same shape as ``x``.

    Raises
    ------
    ValueError
        If ``x`` stores fewer than ``N_sub`` words, if fewer than ``N_add``
        unused candidate words are available, or if words are to be added
        while none is removed (there is no weight to inherit).
    """
    idx_remove = np.random.choice(x.indices, size=N_sub, replace=False)
    idx_keep = x.indices[~np.isin(x.indices, idx_remove)]
    # Draw new words directly from the unused candidates instead of
    # re-sampling until no collision happens (which may never terminate).
    pool = np.arange(x.shape[0]) if idx_pool is None else np.asarray(idx_pool)
    candidates = np.setdiff1d(pool, x.indices)
    idx_add = np.random.choice(candidates, size=N_add, replace=False)
    # Removed words donate their weights to the new words.
    idx_donor = np.random.choice(idx_remove, size=N_add)
    # Explicit index -> weight lookup so the result does not depend on the
    # storage order of a sparse fancy-indexing result.
    weight = dict(zip(x.indices.tolist(), x.data.tolist()))
    idx_from = np.concatenate((idx_keep, idx_donor))
    idx_to = np.concatenate((idx_keep, idx_add)).astype(int)
    values = np.array([weight[i] for i in idx_from.tolist()], dtype=x.dtype)
    y = ss.csc_matrix((values, (idx_to, np.zeros(len(idx_to), dtype=int))),
                      shape=x.shape)
    return y

In [None]:
def mutate(x, n_new_words=25, p_new_word=1, a=0.08, b=0.001, idx_pool=None):
    """Mutates ``tf-idf`` representations & returns a new one.

    NOTE: ``mutate`` is defined more than once in this notebook; the
    definition that was executed last is the one in effect.

    Algorithm
    ---------
    - Append up to ``n_new_words`` new words (not yet in ``x``) with initial
        value 0, each kept with probability ``p_new_word``.
    - If given ``idx_pool``, only add words from the pool.
    - Mutate all elements ``x_i`` by
        norm(1,``a``) * (``x_i`` + norm(0,``b``))
    - Zero out negatives (non-positive values are dropped).

    Parameters
    ----------
    x : scipy.sparse.csc_matrix, shape (n_words, 1)
        Sparse column vector to mutate. It is not modified.
    n_new_words : int
        Maximum number of candidate new words drawn per call.
    p_new_word : float
        Probability of keeping each drawn candidate.
    a, b : float
        Standard deviations of the multiplicative and additive noise.
    idx_pool : sequence of int, optional
        Row indices that new words may be drawn from. ``None`` or an empty
        pool means all rows.

    Returns
    -------
    scipy.sparse.csc_matrix
        New column vector with the same shape as ``x``.
    """
    # ``is None`` / size checks instead of truthiness, which fails on arrays.
    if idx_pool is None or np.size(idx_pool) == 0:
        pool = np.arange(x.shape[0])
    elif np.ndim(idx_pool) == 0:
        pool = np.arange(idx_pool)  # an int pool means range(idx_pool)
    else:
        pool = np.asarray(idx_pool)
    # Draw directly from the words ``x`` does not have yet, rather than
    # re-sampling until there is no collision (which may never terminate).
    candidates = np.setdiff1d(pool, x.indices)
    n_draw = min(n_new_words, candidates.size)
    if n_draw > 0:
        idx_add = np.random.choice(candidates, size=n_draw, replace=False)
    else:
        idx_add = np.empty(0, dtype=int)
    idx_add = idx_add[np.random.rand(n_draw) < p_new_word]
    idx = np.concatenate((x.indices, idx_add)).astype(int)
    data = np.concatenate((x.data, np.zeros(idx_add.size)))
    data = np.multiply(data + norm.rvs(scale=b, size=data.size),
                       norm.rvs(loc=1, scale=a, size=data.size))
    keep = data > 0
    idx = idx[keep]
    data = data[keep]
    y = ss.csc_matrix((data, (idx, np.zeros(idx.shape, dtype=int))),
                      shape=x.shape)
    return y

In [None]:
import scipy.sparse as ss
from scipy.stats import norm

def algorithm(graph, year_core=-2000):
    """Grows a new graph with a core set of nodes selected from ``graph``.

    The core consists of all nodes whose ``'year'`` attribute is below
    ``year_core``. A copy of the subgraph induced by the core, together with
    the matching tf-idf columns, is passed to ``evolve``.

    Assumes ``graph.graph['tfidf']`` holds a ``scipy.sparse.csc_matrix`` of
    tf-idf vectors whose i-th column belongs to the i-th node of
    ``graph.nodes``.

    Parameters
    ----------
    graph : networkx graph
        Source graph with a ``'year'`` attribute on every node. Not modified.
    year_core : number
        Nodes dated strictly before this year form the core.

    Returns
    -------
    networkx graph
        The evolved graph named ``graph.name + '-graft'``, with its tf-idf
        matrix stored under ``graph['tfidf']``.
    """
    core = [n for n in graph.nodes if graph.nodes[n]['year'] < year_core]
    subgraph = graph.subgraph(core).copy()
    # Drop copied graph-level attributes (including the full tf-idf matrix).
    subgraph.graph.clear()
    subgraph.name = graph.name + '-graft'
    # Build the column lookup once instead of ``list(...).index`` per node.
    column_of = {n: i for i, n in enumerate(graph.nodes)}
    source = graph.graph['tfidf']
    # Stack columns in ``subgraph.nodes`` order: ``evolve`` looks columns up
    # by position in the subgraph, and that order is not guaranteed to be the
    # order of ``core``. CSC output keeps column slicing available.
    tfidf = ss.hstack([source[:, column_of[n]] for n in subgraph.nodes],
                      format='csc')
    new_graph, new_tfidf = evolve(subgraph, tfidf)
    new_graph.graph['tfidf'] = new_tfidf
    return new_graph

def evolve(graph, tfidf, year_end=2020):
    """Evolves a graph based on tf-idf representations.

    Work in progress: only the first year after the latest node year is
    simulated (``year_end`` is not used yet) and the step that would add new
    nodes is disabled, so ``graph`` and ``tfidf`` are returned unchanged.

    Column ``i`` of ``tfidf`` is taken as the vector of the ``i``-th node of
    ``graph.nodes``. Returns the tuple ``(graph, tfidf)``.
    """
    first_year = max(graph.nodes[n]['year'] for n in graph.nodes) + 1
    seed_nodes = {}
    for year in range(first_year, first_year + 1):  # year_end+1):
        # Every node without a seed starts one from a copy of its own column.
        for col, node in enumerate(graph.nodes):
            if node not in seed_nodes:
                seed_nodes[node] = tfidf[:, col].copy()
        # Mutate each seed once per simulated year.
        for node in seed_nodes:
            seed_nodes[node] = mutate(seed_nodes[node])
        # TODO: join seeds/crossover
        birth_enabled = False  # placeholder until a birth criterion exists
        for node, vec in seed_nodes.items():
            if birth_enabled:
                graph.add_node(node)
                # TODO: connect node
                tfidf = ss.hstack([tfidf, vec])
                seed_nodes[node] = None
        print(year, '\t', seed_nodes, '\n')
    return graph, tfidf

def crossover():
    """Placeholder (presumably for joining seeds); does nothing, returns ``None``."""
    return None

def mutate(x, a=0.001, b=0.001, idx_pool=None):
    """Mutates ``tf-idf`` representations & returns a new one.

    NOTE: ``mutate`` is defined more than once in this notebook; the
    definition that was executed last is the one in effect.

    Algorithm
    ---------
    - Mutate existing elements ``x_i`` by
        norm(1,``a``) * (``x_i`` + norm(0,``b``))
    - Zero out negatives (non-positive values are dropped).
    - Append as many new words as were deleted (fewer if not enough unused
        words are available), each with the median of the original values.
    - If given ``idx_pool``, only add words from the pool.

    Parameters
    ----------
    x : scipy.sparse.csc_matrix, shape (n_words, 1)
        Sparse column vector to mutate. It is not modified.
    a, b : float
        Standard deviations of the multiplicative and additive noise.
    idx_pool : sequence of int, optional
        Row indices that new words may be drawn from. ``None`` or an empty
        pool means all rows.

    Returns
    -------
    scipy.sparse.csc_matrix
        New column vector with the same shape as ``x``.
    """
    data = np.multiply(x.data + norm.rvs(scale=b, size=x.data.size),
                       norm.rvs(loc=1, scale=a, size=x.data.size))
    n_new_words = int((data <= 0).sum())
    print(n_new_words, end=' ')  # progress output: words deleted this step
    # ``is None`` / size checks instead of truthiness, which fails on arrays.
    if idx_pool is None or np.size(idx_pool) == 0:
        pool = np.arange(x.shape[0])
    elif np.ndim(idx_pool) == 0:
        pool = np.arange(idx_pool)  # an int pool means range(idx_pool)
    else:
        pool = np.asarray(idx_pool)
    # Draw directly from the words ``x`` does not have yet, rather than
    # re-sampling until there is no collision (which may never terminate).
    candidates = np.setdiff1d(pool, x.indices)
    n_add = min(n_new_words, candidates.size)
    if n_add > 0:
        idx_add = np.random.choice(candidates, size=n_add, replace=False)
        fill = np.median(x.data)
    else:
        idx_add = np.empty(0, dtype=int)
        fill = 0.0  # unused; avoids taking the median of empty data
    idx = np.concatenate((x.indices, idx_add)).astype(int)
    data = np.concatenate((data, fill * np.ones(idx_add.size)))
    keep = data > 0
    idx = idx[keep]
    data = data[keep]
    y = ss.csc_matrix((data, (idx, np.zeros(idx.shape, dtype=int))),
                      shape=x.shape)
    return y

# algorithm(networks[topic].graph).graph
# Experiment: mutate one tf-idf column ``T`` times and record, at every step,
# its cosine similarity to the starting vector and its ``size``.
x = tfidf[:,1].copy()  # reference vector, never mutated
y = tfidf[:,1].copy()  # vector that is mutated at every step
T = 500  # number of mutation steps
sim = np.zeros(T)
size = np.zeros(T)
for i in range(sim.size):
    sim[i] = smp.cosine_similarity(x.transpose(),y.transpose())[0,0]
    size[i] = y.size  # presumably the number of stored entries - TODO confirm
    y = mutate(y)
# Similarity to the starting vector over the mutation steps.
plt.figure()
sns.lineplot(x=range(sim.size), y=sim)
plt.title(graph.name)
plt.ylabel('similarity')
plt.xlabel('years');
# Size of the mutated vector over the mutation steps.
plt.figure()
sns.lineplot(x=range(sim.size), y=size)
plt.title(graph.name)
plt.ylabel('size')
plt.xlabel('years');

In [None]:
# Distribution of stored tf-idf values before and after the mutation run.
# NOTE(review): "before" uses column 1 of the full ``graph.graph['tfidf']``,
# while the mutated ``y`` started from column 1 of the stacked ``tfidf`` -
# confirm these refer to the same node.
plt.figure()
sns.distplot(graph.graph['tfidf'][:,1].data)
plt.title(graph.name + ' before mutation')
plt.yscale('log')
# plt.xscale('log')
plt.xlabel('tf-idf values');
plt.figure()
sns.distplot(y.data)
plt.title(graph.name + ' after mutation')
plt.yscale('log')
# plt.xscale('log')
plt.xlabel('tf-idf values');

In [None]:
# Distribution of the tf-idf sums taken along axis 0 (one sum per column).
sns.distplot(graph.graph['tfidf'].sum(0))
plt.title(graph.name)
plt.xlabel('sum of tf-idf weights');

In [None]:
# Distribution of all stored tf-idf values of the graph, log-scaled y-axis.
sns.distplot(graph.graph['tfidf'].data)
plt.title(graph.name)
plt.yscale('log')
# plt.xscale('log')
plt.xlabel('tf-idf values');

In [None]:
# One log-log histogram (with rug, without KDE) of the stored tf-idf values
# for each of the first 20 columns.
for i in range(20):
    plt.figure()
    sns.distplot(graph.graph['tfidf'][:,i].data,
                 hist=True, rug=True, kde=False)
    plt.title(graph.name)
    plt.yscale('log')
    plt.xscale('log')
    plt.xlabel('tf-idf values');

In [None]:
# check after mutation