In [None]:
%reload_ext autoreload
%autoreload 2
%reload_ext cython
%reload_ext line_profiler
import os
import sys

# Make the project's local `module` directory importable before `wiki` is imported.
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import powerlaw
import scipy as sp
import seaborn as sns

import wiki

## Wiki module

### Initialization

In [None]:
# Alternative topic list kept for reference:
#   'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
#   'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
#   'energy', 'optics', 'earth science', 'geology', 'meteorology'
topics = ['earth science']
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# One wiki.Net per topic, each loaded from its pickled graph file.
networks = {}
for topic in topics:
    print(topic, end=' ')
    networks[topic] = net = wiki.Net()
    net.load_graph(path_saved + topic + '.pickle')

# `topic` still holds the last entry of `topics` after the loop.
graph = networks[topic].graph

In [None]:
# Select the first topic and expose its graph and the tf-idf matrix stored
# in the graph's attribute dictionary; later cells use `graph` and `tfidf`.
topic = topics[0]
graph = networks[topic].graph
tfidf = graph.graph['tfidf']

In [None]:
import pickle

# Load the pickled dictionary model used by later cells as `dct`.
# The file is opened in a `with` block so the handle is closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('/Users/harangju/Developer/data/wiki/models/' + 'dict.model','rb') as model_file:
    dct = pickle.load(model_file)

### Auxiliary methods

In [None]:
%%cython -f

import numpy as np
cimport numpy as np
from cython cimport floating,boundscheck,wraparound
from cython.parallel import prange

from libc.math cimport fabs

np.import_array()

@boundscheck(False)  # Deactivate bounds checking
@wraparound(False)  # Deactivate negative-index wraparound
def cython_manhattan(floating[::1] X_data, int[:] X_indices, int[:] X_indptr,
                     floating[::1] Y_data, int[:] Y_indices, int[:] Y_indptr,
                     double[:, ::1] D):
    """Pairwise L1 distances for CSR matrices.

    For every row ``px`` of X and every row ``py`` of Y, the two sparse rows
    are walked in parallel (a two-pointer merge on their column indices) and
    the sum of absolute differences is written to ``D[px, py]``.

    Parameters
    ----------
    X_data, X_indices, X_indptr : the ``data``, ``indices`` and ``indptr``
        arrays of the first CSR matrix.
    Y_data, Y_indices, Y_indptr : the same three arrays of the second CSR matrix.
    D : output buffer of shape (rows of X, rows of Y); every entry is
        overwritten in place. Nothing is returned.

    The merge compares column indices to decide which pointer to advance, so
    it is only correct when the indices within each row are sorted ascending.

    Usage:
    >>> D = np.zeros((X.shape[0], Y.shape[0]))
    >>> cython_manhattan(X.data, X.indices, X.indptr,
    ...                  Y.data, Y.indices, Y.indptr,
    ...                  D)
    """
    cdef np.npy_intp px, py, i, j, ix, iy
    cdef double d = 0.0

    # The loop bounds come from the output buffer, not from the indptr arrays.
    cdef int m = D.shape[0]
    cdef int n = D.shape[1]

    with nogil:
        # Outer loop over rows of X uses Cython's prange.
        for px in prange(m):
            for py in range(n):
                # i / j point at the current stored entry of row px / row py.
                i = X_indptr[px]
                j = Y_indptr[py]
                d = 0.0
                # Merge phase: runs while both rows still have stored entries.
                while i < X_indptr[px+1] and j < Y_indptr[py+1]:
                    # These guards are always true inside this loop
                    # (same conditions as the while test).
                    if i < X_indptr[px+1]: ix = X_indices[i]
                    if j < Y_indptr[py+1]: iy = Y_indices[j]

                    if ix==iy:
                        # Same column stored in both rows: add |x - y|.
                        d = d+fabs(X_data[i]-Y_data[j])
                        i = i+1
                        j = j+1

                    elif ix<iy:
                        # Column only stored in the X row: add |x|.
                        d = d+fabs(X_data[i])
                        i = i+1
                    else:
                        # Column only stored in the Y row: add |y|.
                        d = d+fabs(Y_data[j])
                        j = j+1

                # Tail phase: one row is exhausted; add the remaining
                # absolute values of the other row.
                if i== X_indptr[px+1]:
                    while j < Y_indptr[py+1]:
                        iy = Y_indices[j]  # read but not used afterwards
                        d = d+fabs(Y_data[j])
                        j = j+1
                else:
                    while i < X_indptr[px+1]:
                        ix = X_indices[i]  # read but not used afterwards
                        d = d+fabs(X_data[i])
                        i = i+1

                D[px,py] = d

In [None]:
import sklearn.preprocessing as skp
import sklearn.metrics.pairwise as smp

def year_diffs(graph):
    """Return, for every edge node -> successor, year(node) - year(successor).

    Values are listed in node iteration order, then successor order.
    """
    diffs = []
    for node in graph.nodes:
        for neighbor in list(graph.successors(node)):
            # The node's year is looked up per edge, so nodes without
            # successors are never required to carry a 'year' attribute.
            diffs.append(graph.nodes[node]['year'] - graph.nodes[neighbor]['year'])
    return diffs

def neighbor_similarity(graph, tfidf):
    """Cosine similarity between each node and each of its successors.

    Parameters
    ----------
    graph : graph object exposing ``nodes`` and ``successors(node)``.
    tfidf : matrix whose column ``k`` is the vector of the k-th node in
        ``list(graph.nodes)``.

    Returns
    -------
    list with one similarity per edge, in node order then successor order.
    """
    nodes = list(graph.nodes)
    # Map node -> column once instead of calling nodes.index() for every edge.
    column_of = {node: k for k, node in enumerate(nodes)}
    return [smp.cosine_similarity(tfidf[:,column_of[node]].transpose(),
                                  tfidf[:,column_of[neighbor]].transpose())[0,0]
            for node in nodes
            for neighbor in graph.successors(node)]

def non_neighbor_similarity(graph, tfidf):
    """Cosine similarity for every ordered pair of distinct, unlinked nodes.

    A pair (n1, n2) is included when n2 differs from n1 and n2 is not
    returned by ``graph.neighbors(n1)``.

    Parameters
    ----------
    graph : graph object exposing ``nodes`` and ``neighbors(node)``.
    tfidf : matrix whose column ``k`` is the vector of the k-th node in
        ``list(graph.nodes)``.

    Returns
    -------
    list of similarities, ordered by n1 then n2 in node iteration order.
    """
    nodes = list(graph.nodes)
    # Map node -> column once instead of calling nodes.index() for every pair.
    column_of = {node: k for k, node in enumerate(nodes)}
    sim = []
    for n1 in nodes:
        # Build the neighbor set once per n1 rather than once per (n1, n2).
        linked = set(graph.neighbors(n1))
        v1 = tfidf[:,column_of[n1]].transpose()
        for n2 in nodes:
            # Compare by equality, not identity.
            if n2 == n1 or n2 in linked:
                continue
            v2 = tfidf[:,column_of[n2]].transpose()
            sim.append(smp.cosine_similarity(v1, v2)[0,0])
    return sim

def sparse_manhattan(X,Y=None):
    """Pairwise L1 distances between the rows of X and the rows of Y.

    Inputs are passed through ``smp.check_pairwise_arrays`` and converted to
    CSR; the distances are computed by ``cython_manhattan`` into a freshly
    allocated array, which is returned.
    """
    X, Y = smp.check_pairwise_arrays(X, Y)
    left = sp.sparse.csr_matrix(X, copy=False)
    right = sp.sparse.csr_matrix(Y, copy=False)
    # Uninitialised buffer; cython_manhattan writes every entry.
    distances = np.empty((left.shape[0], right.shape[0]))
    cython_manhattan(
        left.data, left.indices, left.indptr,
        right.data, right.indices, right.indptr,
        distances,
    )
    return distances

def word_diffs(graph, tfidf):
    """Manhattan distance between binarized vectors of each node and successor.

    ``tfidf`` is binarized and transposed so that each row is one node, all
    pairwise distances are computed with ``sparse_manhattan``, and the value
    for every edge node -> successor is returned in node order then
    successor order.
    """
    dists = sparse_manhattan(X=skp.binarize(tfidf).transpose())
    nodes = list(graph.nodes)
    # Map node -> row once instead of calling nodes.index() for every edge.
    row_of = {node: k for k, node in enumerate(nodes)}
    return [dists[row_of[node], row_of[neighbor]]
            for node in nodes
            for neighbor in graph.successors(node)]

def weight_differences(graph, tfidf):
    """Sum of absolute weight differences for each edge node -> successor.

    Parameters
    ----------
    graph : graph object exposing ``nodes`` and ``successors(node)``.
    tfidf : sparse matrix whose column ``k`` is the vector of the k-th node
        in ``list(graph.nodes)``.

    Returns
    -------
    list with one value ``sum_i |v_node[i] - v_successor[i]|`` per edge, in
    node order then successor order.

    The difference is taken over the whole column, so a term stored in both
    vectors contributes exactly once. The previous implementation indexed
    with the concatenation of both index arrays, which counted such shared
    terms twice.
    """
    nodes = list(graph.nodes)
    # Map node -> column once instead of calling nodes.index() for every edge.
    column_of = {node: k for k, node in enumerate(nodes)}
    diff = []
    for node in nodes:
        v1 = tfidf[:,column_of[node]]
        for neighbor in graph.successors(node):
            v2 = tfidf[:,column_of[neighbor]]
            diff.append(abs(v1 - v2).sum())
    return diff

def plot_distribution(data):
    """Scatter-plot the empirical distribution of ``data`` on log-log axes.

    ``data`` is histogrammed into 29 logarithmically spaced bins (30 edges
    between its minimum and maximum). Each bin's share of the samples is
    drawn at the bin's left edge; shares are not divided by bin width.
    Because the edges are log-spaced, ``data`` must be strictly positive.
    Draws on the current matplotlib axes and returns nothing.
    """
    bins = np.logspace(np.log10(min(data)), np.log10(max(data)), 30)
    hist, _ = np.histogram(data, bins=bins)
    # Pass x/y by keyword: current seaborn rejects positional data vectors.
    sns.scatterplot(x=bins[:-1], y=hist/len(data))
    plt.yscale('log')
    plt.xscale('log')
    # Pad the x-range by a factor of two on each side of the outer edges.
    plt.xlim(bins[0]/2, bins[-1]*2)
    # Lower y-limit: half of the smallest non-empty bin's share.
    plt.ylim(min(hist[hist>0])/len(data)/2, 1)
    plt.xlabel('x')
    plt.ylabel('P(x)')

### Priors

Run the cells under **Auxiliary methods** (above) first.

#### Prior: power law distributions of weights

In [None]:
# Fit a power law to the stored tf-idf weights. `fit` must be defined here:
# the mutation, evolution and posterior cells all read it (the fit used to be
# commented out, so those cells failed on a fresh kernel).
fit = powerlaw.Fit(tfidf.data)
fit.plot_pdf()
fit.power_law.plot_pdf();
plt.title(f"xmin={fit.xmin:.1e}, α={fit.alpha:.1f}");

#### Prior: similarity / year between neighbors

In [None]:
# Per-edge quantities are all computed up front, so no panel depends on a
# variable left over from an earlier run of this cell.
yd = year_diffs(graph)
abs_yd = np.abs(yd)
wd = word_diffs(graph, tfidf)
neighbors = neighbor_similarity(graph, tfidf)
non_neighbors = non_neighbor_similarity(graph, tfidf)  # all unlinked pairs; slow on large graphs
weight_diffs = weight_differences(graph, tfidf)

plt.figure(figsize=(16,24))

# Panel 421: distribution of year differences along edges.
plt.subplot(421)
sns.distplot(yd)
plt.xlabel('Δyear')
plt.ylabel('distribution')

# Panel 423: distribution of word (manhattan) distances with a normal fit.
mu, std = sp.stats.norm.fit(wd)
plt.subplot(423)
sns.distplot(wd)
x = np.linspace(min(wd), max(wd), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.legend([f"m={mu:.2f}; s={std:.2f}"])
plt.xlabel('manhattan distance')
plt.ylabel('distribution');

# Panel 424: word distance against |Δyear|. `fit_r` is reused by the evolve cells.
slope, intercept, fit_r, p, stderr = sp.stats.linregress(abs_yd, wd)
plt.subplot(424)
sns.scatterplot(x=abs_yd, y=wd)
# The regression line spans the plotted |Δyear| range, not the signed one.
x = np.linspace(0, max(abs_yd), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"r={fit_r:.2f}; p={p:.1e}")
plt.legend([f"slope={slope:.2f}"])
plt.xlabel('Δyear')
plt.ylabel('manhattan distance');

# Panel 425: cosine similarity of linked vs. unlinked pairs.
# `fit_mu` / `fit_std` are reused by the evolve cells.
fit_mu, fit_std = sp.stats.norm.fit(neighbors)
plt.subplot(425)
sns.distplot(neighbors, hist=True)
x = np.linspace(min(neighbors), max(neighbors), 100)
plt.plot(x, sp.stats.norm.pdf(x, fit_mu, fit_std))
sns.distplot(non_neighbors)
plt.legend([f"fit-neighbors (m={fit_mu:.2f}; s={fit_std:.2f})", 'neighbors', 'non-neighbors'])
plt.xlabel('cos similarity');

# Panel 426: cosine similarity against |Δyear|.
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd, neighbors)
plt.subplot(426)
sns.scatterplot(x=abs_yd, y=neighbors)
x = np.linspace(0, max(abs_yd), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
# p-value in scientific notation, as in the other regression panels.
plt.title(f"r={r:.2f}; p={p:.1e}")
plt.legend([f"slope={slope:.2f}"])
plt.xlabel('Δyear')
plt.ylabel('cosine similarity');

# Panel 427: distribution of summed absolute weight differences with a normal fit.
mu, std = sp.stats.norm.fit(weight_diffs)
plt.subplot(427)
sns.distplot(weight_diffs)
x = np.linspace(min(weight_diffs), max(weight_diffs), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.legend([f"m={mu:.2f}; s={std:.2f}"])
plt.xlabel('Σ abs Δw_i')
plt.ylabel('distribution');

# Panel 428: summed absolute weight differences against |Δyear|.
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd, weight_diffs)
plt.subplot(428)
sns.scatterplot(x=abs_yd, y=weight_diffs)
x = np.linspace(0, max(abs_yd), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"r={r:.2f}; p={p:.1e}")
plt.legend([f"slope={slope:.1e}"])
plt.xlabel('Δyear')
plt.ylabel('Σ abs Δw_i');

#### Prior: weight distributions of nodes

In [None]:
# One row per stored tf-idf entry: its index and its weight.
weights_by_index = pd.DataFrame({'index': tfidf.indices,
                                 'weight': tfidf.data})
# Mean weight per index.
mean_weight_by_index = weights_by_index.groupby('index').mean().reset_index()

plt.figure(figsize=(16,6))

# Left panel: every weight, overlaid with the per-index average.
plt.subplot(121)
sns.scatterplot(x='index', y='weight', data=weights_by_index)
sns.scatterplot(x='index', y='weight', data=mean_weight_by_index)
plt.legend(['weights', 'averaged'])
plt.ylim([-.2,1.2])

# Right panel: log-log distribution of all weights.
plt.subplot(122)
plot_distribution(tfidf.data)

#### Prior: year distribution

In [None]:
bin_size=25
years = [graph.nodes[node]['year'] for node in graph.nodes]

plt.figure(figsize=(8,6))
sns.distplot(years, bins=bin_size, rug=True, kde=False)

# Fit counts ~ a * b**year, evaluated at the right edge of each bin.
hist, bin_edges = np.histogram(years, bins=bin_size)
right_edges = bin_edges[1:]
popt, pcov = sp.optimize.curve_fit(lambda x,a,b: a*pow(b,x), right_edges, hist)

# Overlay the fitted curve across the observed range of years.
x = np.linspace(min(years), max(years), 100)
fitted_counts = popt[0]*pow(popt[1],x)
sns.lineplot(x=x, y=fitted_counts)
plt.legend([f"a*b^x; a={popt[0]:.1e}, b={popt[1]:.2f}"])
plt.xlabel('year');

#### Prior: word weight vs title

In [None]:
stoplist=set('for a of the and to in'.split())  # NOTE(review): not used in this cell

# Materialise the node list once; column i of `tfidf` is paired with the
# i-th node (the list used to be rebuilt on every iteration).
graph_nodes = list(graph.nodes)
nodes = []
words = []
for i in range(tfidf.shape[1]):
    node = graph_nodes[i]
    column = tfidf[:,i]
    if column.data.size == 0:
        # Nodes with an empty vector are reported and left out of the table.
        print(node, column.data)
        continue
    top_words, idx = wiki.Model.find_top_words(column, dct, top_n=5)
    nodes += [node]
    words += [top_words]
pd.DataFrame(data={'Node': nodes, 'Top words': words})

### Static methods

#### Mutate

In [None]:
# Repeatedly mutate a copy of the first tf-idf column and track how far it
# drifts from the untouched original `x`.
x = tfidf[:,0].copy()
y = tfidf[:,0].copy()
T = 300  # number of mutation steps

sim = np.zeros(T)   # cosine similarity between x and the mutated vector
size = np.zeros(T)  # y.size at each step
mag = np.zeros(T)   # sum of the mutated vector's stored values
for i in range(T):
    # Metrics are recorded before mutating, so index 0 describes the original.
    sim[i] = smp.cosine_similarity(x.transpose(),y.transpose())[0,0]
    size[i] = y.size
    mag[i] = np.sum(y.data)
    y = wiki.Model.mutate(y, lambda n: fit.power_law.generate_random(n),
                          point=(1,.5), insert=(1,.3,None), delete=(1,.3))

plt.figure(figsize=(16,10))

# Similarity to the original over time.
plt.subplot(221)
sns.lineplot(x=range(sim.size), y=sim)
plt.ylabel('similarity')
plt.xlabel('years')

# Vector size over time.
plt.subplot(222)
sns.lineplot(x=range(sim.size), y=size)
plt.ylabel('size')
plt.xlabel('years')

# Weight distributions before and after mutation, log-log axes.
plt.subplot(223)
plot_distribution(x.data)
plot_distribution(y.data)
plt.xlabel('tf-idf values')
plt.legend(['before mutation', 'after mutation'])

# Same distributions on linear axes, zoomed in on small weights.
plt.subplot(224)
plot_distribution(x.data)
plot_distribution(y.data)
plt.xlabel('tf-idf values')
plt.yscale('linear')
plt.xscale('linear')
plt.ylim([0,.2])
plt.xlim([0,.1])
plt.legend(['before mutation','after mutation']);

#### Connect

In [None]:
# Build a wiki.Model from the selected topic's graph and the tf-idf matrix
# stored on that graph. `model.graph` is used by the next cell.
# NOTE(review): year_start=-500 presumably bounds the initial nodes by year — confirm in wiki.Model.
model = wiki.Model(graph_parent=networks[topic].graph,
                   vectors_parent=networks[topic].graph.graph['tfidf'],
                   year_start=-500)

In [None]:
# Map node -> tf-idf column once instead of searching the node list per lookup.
column_of = {node: k for k, node in enumerate(graph.nodes)}

# Work on a copy so the model's own graph is left untouched.
test_graph = model.graph.copy()
test_vector = sp.sparse.hstack([tfidf[:,column_of[n]] for n in test_graph.nodes])

seed = 'Meteorology'
seed_vector = tfidf[:,column_of[seed]]

# Show the graph before and after connecting the seed vector.
print('Nodes:', test_graph.nodes)
print('Edges:', test_graph.edges, '\n')
print(f"Seed: {seed}\n")
wiki.Model.connect(seed_vector, test_graph, test_vector, dct, match_n=3)
print('Nodes:', test_graph.nodes)
print('Edges:', test_graph.edges)

#### Crossover

In [None]:
# Map node -> tf-idf column once instead of searching the node list per lookup.
column_of = {node: k for k, node in enumerate(graph.nodes)}

nodes = list(graph.nodes)[:20]
# Two seed vectors per node. Each is sliced separately so the two entries are
# distinct objects, as in the original cell.
seeds = {node: [tfidf[:,column_of[node]],
                tfidf[:,column_of[node]]]
         for node in nodes}
print(nodes, '\n')

# Pairwise similarity of all seed vectors before crossover (upper triangle).
vectors = sp.sparse.hstack([v for node in nodes for v in seeds[node]])
print(np.triu(smp.cosine_similarity(vectors.transpose())))

wiki.Model.crossover_seeds(seeds, graph, tfidf, threshold=0.5)
print('\n----------------------------------------------------------\n')

# Same matrix after crossover, restricted to nodes still present in `seeds`.
vectors = sp.sparse.hstack([v for node in nodes if node in seeds for v in seeds[node]])
print(np.triu(smp.cosine_similarity(vectors.transpose())))

### Evolve

In [None]:
# Fresh model for the evolution run, built from the selected topic.
model = wiki.Model(
    graph_parent=networks[topic].graph,
    vectors_parent=networks[topic].graph.graph['tfidf'],
    year_start=-500,
)

# Summary of the starting nodes and the parameters estimated in the prior cells.
print(
    f"Topic: '{graph.name}'\n"
    f"Core nodes\n"
    f"   {list(model.graph.nodes)}\n"
    f"Parameters\n"
    f"   α (power law): {fit.alpha:.2f}\n"
    f"   p_insert/delete: {fit_r:.2f}/2\n"
    f"   neighbor_mu, std: {fit_mu:.2f}, {fit_std:.2f}\n"
    f"   crossover threshold: {fit_mu+3*fit_std:.2f}"
)

In [None]:
max_val = np.max(tfidf.data)
%lprun -f model.evolve model.evolve(year_end=2000,\
                                    n_seeds=2,\
                                    point=(1, 2*fit_std),\
                                    insert=(1, fit_r/2, list(set(tfidf.indices))),\
                                    delete=(1, fit_r/2),\
                                    rvs=lambda n: np.vectorize(lambda x: max_val if x>max_val else x,\
                                                               otypes=[np.float64])\
                                                              (fit.power_law.generate_random(n)),\
                                    dct=dct, create=lambda n: np.random.normal(loc=fit_mu+fit_std,\
                                                                               scale=fit_std, size=n),\
                                    crossover=fit_mu+3*fit_std)
model.record

In [None]:
def s(a, b):
    """Cosine similarity between two column vectors a and b."""
    return smp.cosine_similarity(a.transpose(), b.transpose())[0,0]

nodes = list(model.graph.nodes)
# Map node -> column of model.vectors once instead of nodes.index() per row.
column_of = {node: k for k, node in enumerate(nodes)}

# Similarity of each recorded seed vector to its parent's vector.
model.record['Similarity to parent'] = [
    s(seed_vec, model.vectors[:,column_of[parent]])
    for seed_vec, parent in zip(model.record['Seed vectors'], model.record['Parent'])
]
# Label combining the parent and the seed number; used as the plot hue.
model.record['Parent seed'] = model.record['Parent'] + ' ' + model.record['Seed number'].map(str)

In [None]:
# One line per 'Parent seed' label: similarity to the parent against year,
# read from the columns of model.record.
plt.figure(figsize=(16,10))
sns.lineplot(x='Year', y='Similarity to parent', hue='Parent seed', data=model.record);

#### Save/load graph

In [None]:
# NOTE(review): `subgraph` was not defined anywhere before this cell, and
# `vectors` still held the crossover demo's matrix. The evolved model's graph
# and vectors are assumed to be what should be saved — confirm.
subgraph = model.graph
vectors = model.vectors
subgraph.graph['tfidf'] = vectors
# The file name matches the one read back by the load cell. The previous
# f"graph{}.pickle" was a syntax error (empty f-string expression).
nx.write_gpickle(subgraph, 'graph.pickle')

In [None]:
# Reload a previously saved graph and the tf-idf matrix stored on it.
# The posterior cells read `subgraph` and `vectors`.
# NOTE: this unpickles the file, so only load files you trust.
subgraph = nx.read_gpickle('graph.pickle')
vectors = subgraph.graph['tfidf']

### Posteriors

In [None]:
# Similarities for the model graph. The prior values (`neighbors`,
# `non_neighbors`) come from the priors section.
neighbors_model = neighbor_similarity(subgraph, vectors)
non_neighbors_model = non_neighbor_similarity(subgraph, vectors)

plt.figure(figsize=(16,4))
similarity_panels = (
    (121, neighbors, non_neighbors, ' (prior)'),
    (122, neighbors_model, non_neighbors_model, ' (model)'),
)
for position, linked, unlinked, suffix in similarity_panels:
    plt.subplot(position)
    # Linked pairs with a normal fit, overlaid with unlinked pairs.
    sns.distplot(linked)
    x = np.linspace(min(linked), max(linked), 100)
    mu, std = sp.stats.norm.fit(linked)
    plt.plot(x, sp.stats.norm.pdf(x, mu, std))
    sns.distplot(unlinked)
    plt.title(topic + suffix)
    plt.legend([f"fit-neighbors (m={mu:.2f}; s={std:.2f})", 'neighbors', 'non-neighbors'])
    plt.xlabel('cos similarity')
    plt.xlim([-.2,1.2])

In [None]:
plt.figure(figsize=(16,4))
# (subplot position, graph, upper x-limit, title)
year_panels = (
    (121, graph, 2100, 'prior'),
    (122, subgraph, 3100, 'model'),
)
for position, panel_graph, upper_year, label in year_panels:
    plt.subplot(position)
    panel_years = [panel_graph.nodes[node]['year'] for node in panel_graph.nodes]
    sns.distplot(panel_years, rug=True)
    plt.xlim([-5000,upper_year])
    plt.title(label)
    plt.ylabel('discoveries')
    plt.xlabel('year')

In [None]:
# Power-law fit of the model's stored weights, to compare with the empirical `fit`.
fit_model = powerlaw.Fit(vectors.data)

plt.figure(figsize=(16,6))

# Left: empirical weights with the fitted power law.
plt.subplot(121)
fit.plot_pdf()
fit.power_law.plot_pdf()
plt.title(f"empirical xmin={fit.xmin:.1e}, α={fit.alpha:.1f}");

# Right: model weights with the fitted power law.
plt.subplot(122)
fit_model.plot_pdf()
fit_model.power_law.plot_pdf()
plt.title(f"model xmin={fit_model.xmin:.1e}, α={fit_model.alpha:.1f}");

In [None]:
# Year differences along the model graph's edges. The prior `yd` comes from
# the priors section.
yd_model = year_diffs(subgraph)

plt.figure(figsize=(16,4))
for position, diffs, suffix in ((121, yd, ' prior'), (122, yd_model, ' model')):
    plt.subplot(position)
    sns.distplot(diffs)
    plt.title(topic + suffix)
    plt.xlabel('year difference')

In [None]:
# Word distances for the model graph. The prior values (`yd`, `wd`) come from
# the priors section and `yd_model` from the year-difference cell.
wd_model = word_diffs(subgraph, vectors)
abs_yd = np.abs(yd)
abs_yd_model = np.abs(yd_model)

plt.figure(figsize=(16,10))

# Prior: word distance against |Δyear| with a linear fit.
plt.subplot(221)
sns.scatterplot(x=abs_yd, y=wd)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd, wd)
# The regression line spans the plotted |Δyear| range, not the signed one.
x = np.linspace(0, max(abs_yd), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (prior)")
plt.xlabel('year')
plt.ylabel('manhattan distance');

# Prior: distribution of word distances with a normal fit.
plt.subplot(222)
sns.distplot(wd)
mu, std = sp.stats.norm.fit(wd)
x = np.linspace(min(wd), max(wd), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.xlabel('manhattan distance')
plt.ylabel('probability distribution');
plt.title(f"μ={mu:.2}, σ={std:.2} (prior)")

# Model: word distance against |Δyear| with a linear fit.
plt.subplot(223)
sns.scatterplot(x=abs_yd_model, y=wd_model)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd_model, wd_model)
x = np.linspace(0, max(abs_yd_model), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (model)")
plt.xlabel('year')
plt.ylabel('manhattan distance');

# Model: distribution of word distances with a normal fit.
plt.subplot(224)
sns.distplot(wd_model)
mu, std = sp.stats.norm.fit(wd_model)
x = np.linspace(min(wd_model), max(wd_model), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.xlabel('manhattan distance')
plt.ylabel('probability distribution');
plt.title(f"μ={mu:.2}, σ={std:.2} (model)");

In [None]:
# Similarities along the model graph's edges. The prior values (`yd`,
# `neighbors`) come from the priors section and `yd_model` from the
# year-difference cell.
neighbors_model = neighbor_similarity(subgraph, vectors)
abs_yd = np.abs(yd)
abs_yd_model = np.abs(yd_model)

plt.figure(figsize=(16,6))

# Prior: cosine similarity against |Δyear| with a linear fit.
plt.subplot(121)
sns.scatterplot(x=abs_yd, y=neighbors)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd, neighbors)
# The regression line spans the plotted |Δyear| range, not the signed one.
x = np.linspace(0, max(abs_yd), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (prior)")
plt.xlabel('Δyear')
plt.ylabel('cosine similarity');

# Model: cosine similarity against |Δyear| with a linear fit.
plt.subplot(122)
sns.scatterplot(x=abs_yd_model, y=neighbors_model)
slope, intercept, r, p, stderr = sp.stats.linregress(abs_yd_model, neighbors_model)
x = np.linspace(0, max(abs_yd_model), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (model)")
plt.xlabel('Δyear')
plt.ylabel('cosine similarity');

In [None]:
# One row per stored entry of the model's vectors: its index and its weight.
model_weights = pd.DataFrame({'index': vectors.indices,
                              'weight': vectors.data})

plt.figure(figsize=(16,6))

# Left: every model weight by index.
plt.subplot(121)
sns.scatterplot(x='index', y='weight', data=model_weights)
plt.ylim([-.1,1.1]);

# Right: log-log distribution of the model weights.
plt.subplot(122)
plot_distribution(vectors.data)

In [None]:
# `start_year` was never defined in this notebook. It is set to the
# `year_start` value passed to wiki.Model in the earlier cells.
# NOTE(review): confirm this is the intended colour threshold.
start_year = -500

def year_colors(g):
    """Red for nodes dated before `start_year`, blue for all others."""
    return ['r' if g.nodes[n]['year']<start_year else 'b' for n in g.nodes]

plt.figure(figsize=(16,10))
plt.subplot(121)
nx.draw_networkx(graph, node_color=year_colors(graph))
plt.title('original graph')
plt.subplot(122)
nx.draw_networkx(subgraph, node_color=year_colors(subgraph))
plt.title('new graph');

### Discussion

The point of this model is that one can model knowledge discovery as incremental changes on existing knowledge.

The mutation model does not decrease similarity to the parent monotonically.