In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp

### Compressed sparse column (CSC) matrices

In [None]:
# COO-style triplets: the k-th value in `data` lands at (row[k], col[k]).
data = np.arange(1, 7)
row = np.array([0, 2, 2, 0, 1, 2])
col = np.array([0, 0, 1, 2, 2, 2])
# Assemble the 3x3 CSC matrix and densify it so the layout is visible.
sp.sparse.csc_matrix((data, (row, col)), shape=(3, 3)).toarray()

In [None]:
# Topics whose graphs are loaded below. The full list is kept commented out;
# only 'earth science' is currently active.
# topics = ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
#           'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
#           'energy', 'optics', 'earth science', 'geology', 'meteorology']
topics = ['earth science']

In [None]:
# Directory holding the saved, dated topic graphs. Set the WIKI_GRAPH_DIR
# environment variable to point elsewhere; the default is the original
# machine-specific location.
path_saved = os.environ.get('WIKI_GRAPH_DIR',
                            '/Users/harangju/Developer/data/wiki/graphs/dated/')
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    # NOTE(review): the file is a .pickle, so load_graph presumably unpickles it;
    # only point this at trusted files.
    networks[topic].load_graph(os.path.join(path_saved, topic + '.pickle'))
# `topic` stays bound to the last entry of `topics`; later cells rely on it.

In [None]:
# Number of nodes in the graph for `topic` (still bound to the last topic loaded).
len(networks[topic].graph.nodes)

In [None]:
# tf-idf matrix stored in the graph-level attribute dict; later cells index its
# columns by node position in `graph.nodes`.
v = networks[topic].graph.graph['tfidf']
v

In [None]:
# Sum over every entry of the tf-idf matrix.
v.sum()

In [None]:
# First five stored indices of column 0 (row positions, assuming `v` is CSC — TODO confirm).
v[:,0].indices[:5]

In [None]:
# Single tf-idf entry at row 4, column 0.
v[4,0]

In [None]:
# Name attribute of the loaded graph.
networks[topic].graph.name

In [None]:
# Attribute dict stored on the 'Biology' node of this topic's graph.
networks[topic].graph.nodes['Biology']

In [None]:
# Nodes whose 'core_rb' attribute exceeds 0.9.
core = [name
        for name, attrs in networks[topic].graph.nodes(data=True)
        if attrs['core_rb'] > .9]
core

In [None]:
# (position, node) pairs for nodes whose 'year' attribute is below -1800.
[(pos, name)
 for pos, (name, attrs) in enumerate(networks[topic].graph.nodes(data=True))
 if attrs['year'] < -1800]

In [None]:
# Column 9 of the tf-idf matrix.
vi = v[:,9]
vi

### Distributions

In [None]:
# Successors of the 'Cryosphere' node.
list(networks[topic].graph.successors('Cryosphere'))

In [None]:
# Predecessors of the 'Cryosphere' node.
list(networks[topic].graph.predecessors('Cryosphere'))

In [None]:
graph = networks[topic].graph
# year_diffs[k] lists, for the k-th node, (node year - successor year) over its
# outgoing edges. Predecessors are not included.
year_diffs = []
for node in graph.nodes:
    year_diffs.append([graph.nodes[node]['year'] - graph.nodes[succ]['year']
                       for succ in graph.successors(node)])

In [None]:
# Year gaps for the first node in `graph.nodes`.
year_diffs[0]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Pooled distribution of year gaps over all edges.
ax = sns.distplot([gap for gaps in year_diffs for gap in gaps])
ax.set_title(topic)
ax.set_xlabel('year difference');

In [None]:
import sklearn.metrics.pairwise as smp

In [None]:
# Column position of every node in the tf-idf matrix, computed once instead of
# rebuilding list(graph.nodes) for each lookup.
node_col = {name: pos for pos, name in enumerate(graph.nodes)}
# similarities[k] holds the cosine similarity between the k-th node and each of
# its successors (predecessors are not included); self-loops are skipped.
similarities = [[smp.cosine_similarity(v[:,node_col[node]].transpose(),
                                       v[:,node_col[neighbor]].transpose())[0,0]
                 for neighbor in graph.successors(node)
                 # compare by equality: equal node names may be distinct objects,
                 # so an identity test can let self-loops through
                 if neighbor != node]
                for node in graph.nodes]

In [None]:
# Successor similarities for the first node in `graph.nodes`.
similarities[0]

In [None]:
from scipy.stats import norm

In [None]:
# Fit a normal distribution to the pooled neighbor similarities; keep its mean and std.
mu, std = norm.fit([s for ss in similarities for s in ss])

In [None]:
# Overlay the fitted normal PDF on the pooled neighbor-similarity distribution.
sns.distplot([s for ss in similarities for s in ss])
# Evaluate the fitted PDF across the x-range the plot currently spans.
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)
plt.plot(x, p, 'k', linewidth=2)
print("Fit results: mu = %.2f,  std = %.2f" % (mu, std))
plt.title(topic + '; neighbors')
plt.xlabel('cos similarity');

In [None]:
# Column position of every node in the tf-idf matrix, computed once.
node_col = {name: pos for pos, name in enumerate(graph.nodes)}
# non_neighbors[k] holds the cosine similarity between the k-th node and every
# other node that is not one of its neighbors.
# NOTE(review): for a directed graph, graph.neighbors() presumably yields
# successors only, so predecessors are counted as non-neighbors — confirm intent.
non_neighbors = []
for n1 in graph.nodes:
    # Build the neighbor set and the node's own vector once per node rather
    # than once per (n1, n2) pair.
    linked = set(graph.neighbors(n1))
    vec1 = v[:,node_col[n1]].transpose()
    non_neighbors.append([smp.cosine_similarity(vec1,
                                                v[:,node_col[n2]].transpose())[0,0]
                          for n2 in graph.nodes
                          if n2 != n1 and n2 not in linked])

In [None]:
# Compare similarity distributions of neighbor and non-neighbor pairs.
# The original bare `plt.figure` only referenced the function; call it so a
# fresh figure is actually created.
plt.figure()
sns.distplot([s for ss in similarities for s in ss])
sns.distplot([s for ss in non_neighbors for s in ss])
# mu/std come from the normal fit to the neighbor similarities.
print("Fit results: mu = %.2f,  std = %.2f" % (mu, std))
plt.title(topic)
plt.legend(['neighbors', 'non-neighbors'])
plt.xlabel('cos similarity');

### CSC & networkx operations

In [None]:
# Restrict the graph to nodes whose 'year' is below -2000 and take an independent copy.
core = [name
        for name, attrs in networks[topic].graph.nodes(data=True)
        if attrs['year'] < -2000]
subgraph = graph.subgraph(core).copy()

In [None]:
import scipy.sparse as ss

In [None]:
# Column position of every node in the full tf-idf matrix, computed once
# instead of scanning list(graph.nodes) for each subgraph node.
node_col = {name: pos for pos, name in enumerate(graph.nodes)}
# Stack the tf-idf columns of the subgraph's nodes, in subgraph node order.
tfidf = ss.hstack([v[:,node_col[n]] for n in subgraph.nodes])
tfidf

In [None]:
# Nodes currently in the subgraph copy.
subgraph.nodes

In [None]:
# Add a throwaway node to the subgraph (it is a copy, see `.copy()` where `subgraph` is built).
subgraph.add_node('Hello')

In [None]:
# Nodes in the subgraph after the addition above.
subgraph.nodes

### Random numbers

In [None]:
import powerlaw
import matplotlib.pyplot as plt
# Draw 3000 samples from a truncated power law (xmin=1e-2, parameters=[1, 1e-3])
# and plot their empirical PDF.
# NOTE(review): no random seed is set in this notebook, so the figure varies between runs.
tpl = powerlaw.Truncated_Power_Law(xmin=1e-2, parameters=[1, 1e-3])
x = tpl.generate_random(3000)
powerlaw.plot_pdf(x)
plt.show()

In [None]:
# Draw 1000 samples from a power law (xmin=1, parameters=[1.5]) and plot their empirical PDF.
pl = powerlaw.Power_Law(xmin=1., parameters=[1.5])
powerlaw.plot_pdf(pl.generate_random(1000))
plt.show()

### Algorithm

Initialize with core set of nodes.\
For each year,\
initialize a "baby" node for each existing node that doesn't already have a baby node,\
mutate tf-idf for each "baby" node (including the name),\
and if the "baby" node gets a probability drawn from the distribution of similarities (to what?),
the "baby" node is born.

In [None]:
import sklearn.metrics.pairwise as smp
import scipy.sparse as ss
from scipy.stats import norm
import powerlaw

def algorithm(graph):
    """Grow a new graph from the core nodes of ``graph``.

    Nodes whose ``'year'`` attribute is below -2000 form the core. Their
    tf-idf columns are stacked into a new matrix, the induced subgraph is
    copied, its graph-level attributes are cleared, it is renamed
    ``graph.name + '-graft'`` and it is handed to ``evolve``.

    Parameters
    ----------
    graph : networkx graph
        Every node must carry a ``'year'`` attribute, and
        ``graph.graph['tfidf']`` is assumed to hold a
        ``scipy.sparse.csc_matrix`` whose column ``i`` belongs to the
        ``i``-th node of ``graph.nodes``.

    Returns
    -------
    networkx graph
        The graph returned by ``evolve``, with the evolved tf-idf matrix
        stored under ``graph['tfidf']``.

    Raises
    ------
    ValueError
        If no node qualifies as a core node.
    """
    # Resolve node -> column once rather than scanning list(graph.nodes) per core node.
    column_of = {name: pos for pos, name in enumerate(graph.nodes)}
    core = [n for n in graph.nodes if graph.nodes[n]['year'] < -2000]
    if not core:
        # Stacking an empty list of columns fails with an opaque error; be explicit.
        raise ValueError("graph has no core nodes (year < -2000)")
    source_tfidf = graph.graph['tfidf']
    tfidf = ss.hstack([source_tfidf[:, column_of[n]] for n in core])
    subgraph = graph.subgraph(core).copy()
    # Drop inherited graph-level attributes (including the full tf-idf matrix).
    subgraph.graph.clear()
    subgraph.name = graph.name + '-graft'
    new_graph, new_tfidf = evolve(subgraph, tfidf)
    new_graph.graph['tfidf'] = new_tfidf
    return new_graph

def evolve(graph, tfidf, year_end=2020):
    """Evolves a graph based on tf-idf representations.

    Work in progress: only a single year (the year after the latest node year)
    is simulated, and the birth step is disabled, so ``graph`` and ``tfidf``
    are returned unchanged.

    Parameters
    ----------
    graph : networkx graph
        Every node must carry a ``'year'`` attribute.
    tfidf : sparse matrix
        Column ``i`` is taken as the vector of the ``i``-th node of
        ``graph.nodes``.
    year_end : int
        Intended last simulated year; currently unused.

    Returns
    -------
    tuple
        ``(graph, tfidf)``.
    """
    year_start = max(graph.nodes[n]['year'] for n in graph.nodes) + 1
    seed_nodes = {}
    # Only the first year is simulated for now; the full run would end at year_end + 1.
    for year in range(year_start, year_start + 1):
        # Give every node without a seed a copy of its own tf-idf column;
        # enumerate() supplies the column index without rescanning the node list.
        for column, node in enumerate(graph.nodes):
            if node not in seed_nodes:
                seed_nodes[node] = tfidf[:, column].copy()
        # The original called `mutate`, which is not defined in this notebook;
        # the notebook's notes replace mutation with `walk`, so use that.
        for node, vec in seed_nodes.items():
            seed_nodes[node] = walk(vec)
        # join seeds/crossover (not implemented yet)
        for node, vec in seed_nodes.items():
            if False:  # birth criterion not implemented yet
                graph.add_node(node)
                # connect node
                tfidf = ss.hstack([tfidf,vec])
                seed_nodes[node] = None
        print(year, '\t', seed_nodes, '\n')
    return graph, tfidf

def crossover():
    """Placeholder for the seed crossover step; currently does nothing."""
    return None

def walk(x, rvs=lambda: norm.rvs()):
    """Rebuild a sparse column vector from its stored entries (walk placeholder).

    The random-walk perturbation is not implemented yet: ``rvs`` is accepted
    but never called, and the result carries the same stored values as ``x``.

    Parameters
    ----------
    x: scipy.sparse.csc_matrix
        Single-column matrix; every stored entry is written to column 0 of
        the result.
    rvs: lambda ()-> float
        returns a random distance (the default draws from a standard normal)

    Returns
    -------
    scipy.sparse.csc_matrix
        New matrix with the same shape as ``x``.
    """
    data = x.data
    idx = x.indices
    # which vector elements should I modify, maintain sparseness
    # L1 norm?
    y = ss.csc_matrix((data, (idx, np.zeros(idx.shape, dtype=int))),
                      shape=x.shape)
    return y

In [None]:
import seaborn as sn

# Repeatedly apply walk() to a copy of tf-idf column 1 and record, at each step,
# its cosine similarity to the untouched copy and its `.size`.
graph = networks[topic].graph
tfidf = graph.graph['tfidf']
x = tfidf[:,1].copy()
y = tfidf[:,1].copy()
T = 10
sim = np.zeros(T)
size = np.zeros(T)
for i in range(T):
    sim[i] = smp.cosine_similarity(x.transpose(), y.transpose())[0,0]
    size[i] = y.size
    y = walk(y)

# One figure per recorded series, plotted against the step index.
for series, label in ((sim, 'similarity'), (size, 'size')):
    plt.figure()
    sn.lineplot(x=range(series.size), y=series)
    plt.title(graph.name)
    plt.ylabel(label)
    plt.xlabel('years')

In [None]:
# not mutate, but a walk
# magnitude: Levy vs random
# direction: random vs objective

#### Draw distances from distribution & calculate direction

In [None]:
# Column 1 of the tf-idf matrix, plus its stored indices and values.
x = tfidf[:,1]
idx = x.indices
data = x.data

In [None]:
# Number of stored entries in the column.
idx.size

In [None]:
def distr():
    """Draw one sample from the standard normal distribution."""
    return norm.rvs()

# Use the magnitude of a single draw as the step distance.
distance = distr()
if distance < 0:
    distance = distance*-1

In [None]:
# Build a random perturbation `vi` (one entry per stored index) while spending a
# squared-magnitude budget: each entry takes a random fraction of sqrt(balance)
# with a random sign, and its square is deducted from the budget.
# NOTE(review): the budget starts at `distance`, not `distance**2`, so the L2 norm
# printed below is bounded by sqrt(distance) rather than by distance — confirm intent.
vi = np.zeros(idx.shape)
balance = distance
for i in range(idx.size):#np.random.permutation(idx.size):
    # Random sign: randint(2) is 0 or 1, mapped to -1 or +1.
    sign = 2*np.random.randint(2)-1
    vi[i] = sign * np.sqrt(balance) * np.random.rand()
#     print(sign, vi[i], end=' ')
    balance -= vi[i]**2
    if balance < 0: break
# Requested distance next to the L2 norm actually produced.
print(distance, np.sqrt(np.sum(np.power(vi,2))))

#### Distributions

In [None]:
# Figure 1: log-binned density of the stored tf-idf values in column 1.
data = graph.graph['tfidf'][:,1].data
bins = np.logspace(np.log10(min(data)), np.log10(max(data)), num=30)
hist, edges = np.histogram(data, bins=bins)
plt.figure()
# Divide counts by bin width so unequal log-spaced bins are comparable.
plt.plot(edges[:-1], hist/np.diff(edges), '.')
plt.title(graph.name + ' before mutation')
plt.xscale('log')
plt.yscale('log')
plt.xlabel('tf-idf values');

# Figure 2: distribution of the stored values of `y` (log y-axis only).
plt.figure()
sns.distplot(y.data)
plt.title(graph.name + ' after mutation')
plt.yscale('log')
plt.xlabel('tf-idf values');

In [None]:
# Distribution of tf-idf totals; `.sum(0)` sums along axis 0, giving one total per column.
sns.distplot(graph.graph['tfidf'].sum(0))
plt.title(graph.name)
plt.xlabel('sum of tf-idf weights');

In [None]:
# https://stackoverflow.com/questions/37170511/scaled-logarithmic-binning-in-python
data = graph.graph['tfidf'].data
bins = np.logspace(np.log10(min(data)), np.log10(max(data)), 30)
hist, edges = np.histogram(data, bins=bins)
plt.plot(bins[:-1], hist/(bins[1:] - bins[:-1]), '.')
plt.title(graph.name)
plt.yscale('log')
plt.xscale('log')
plt.xlabel('tf-idf weights');

In [None]:
# One log-binned density plot per column for the first 20 tf-idf columns.
for i in range(20):
    data = graph.graph['tfidf'][:,i].data
    bins = np.logspace(np.log10(min(data)), np.log10(max(data)), num=30)
    hist, edges = np.histogram(data, bins=bins)
    plt.figure()
    # Divide counts by bin width so unequal log-spaced bins are comparable.
    plt.plot(edges[:-1], hist/np.diff(edges), '.')
    plt.title(graph.name)
    plt.xscale('log')
    plt.yscale('log')
    plt.xlabel('tf-idf weights')

In [None]:
# check after mutation