## Initialization

In [None]:
%reload_ext autoreload
%autoreload 2
%reload_ext cython
%reload_ext line_profiler
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp
import seaborn as sns
import cufflinks as cf
import matplotlib.pyplot as plt

In [None]:
# Topic names; each one is used below as a key of `networks` and as the
# base name of the pickled graph file loaded for that topic.
topics = ['anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
          'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
          'energy', 'optics', 'earth science', 'geology', 'meteorology',
          'philosophy of language', 'philosophy of law', 'philosophy of mind',
          'philosophy of science', 'economics', 'accounting', 'education',
          'linguistics', 'law', 'psychology', 'sociology', 'electronics',
          'software engineering', 'robotics',
          'calculus', 'geometry', 'abstract algebra',
          'Boolean algebra', 'commutative algebra', 'group theory', 'linear algebra',
          'number theory', 'dynamical systems and differential equations']

In [None]:
# Directory prefix for the per-topic graph pickles (`<path_saved><topic>.pickle`).
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

In [None]:
# Build one wiki.Net per topic and load its saved graph from `path_saved`.
networks = {}
for topic in topics:
    # Progress indicator: topic names are printed on a single line.
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    networks[topic].load_graph(path_saved + topic + '.pickle')

In [None]:
import pickle
# Dictionary object later passed as `dct` to wiki.Model and wiki.Model.find_top_words.
# NOTE: pickle.load can execute arbitrary code; only load model files you trust.
# The `with` block guarantees the file handle is closed after loading.
with open('/Users/harangju/Developer/data/wiki/models/' + 'dict.model', 'rb') as dict_file:
    dct = pickle.load(dict_file)

## Auxiliary methods

In [None]:
%%cython -f

import numpy as np
cimport numpy as np
from cython cimport floating,boundscheck,wraparound
from cython.parallel import prange

from libc.math cimport fabs

np.import_array()

@boundscheck(False)  # Deactivate bounds checking
@wraparound(False)   # Deactivate negative-index wrapping
def cython_manhattan(floating[::1] X_data, int[:] X_indices, int[:] X_indptr,
                     floating[::1] Y_data, int[:] Y_indices, int[:] Y_indptr,
                     double[:, ::1] D):
    """Pairwise L1 distances for CSR matrices.

    Writes into ``D[px, py]`` the Manhattan distance between row ``px`` of X
    and row ``py`` of Y, where both matrices are given by their CSR
    ``data`` / ``indices`` / ``indptr`` arrays. ``D`` is overwritten in place
    and nothing is returned.

    The rows are merged like two sorted lists, so the column indices inside
    each row must be in ascending order.

    Usage:
    >>> D = np.zeros((X.shape[0], Y.shape[0]))
    >>> cython_manhattan(X.data, X.indices, X.indptr,
    ...                  Y.data, Y.indices, Y.indptr,
    ...                  D)
    """
    cdef np.npy_intp px, py, i, j, ix, iy
    cdef double d = 0.0

    cdef int m = D.shape[0]
    cdef int n = D.shape[1]

    with nogil:
        for px in prange(m):
            for py in range(n):
                i = X_indptr[px]
                j = Y_indptr[py]
                # Plain assignments (d = d + ...) are used on purpose: an in-place
                # operator inside prange would turn d into a reduction variable.
                d = 0.0
                # Merge step: walk both rows while each still has stored entries.
                while i < X_indptr[px+1] and j < Y_indptr[py+1]:
                    ix = X_indices[i]
                    iy = Y_indices[j]

                    if ix==iy:
                        # Column stored in both rows.
                        d = d+fabs(X_data[i]-Y_data[j])
                        i = i+1
                        j = j+1

                    elif ix<iy:
                        # Column stored only in the X row.
                        d = d+fabs(X_data[i])
                        i = i+1
                    else:
                        # Column stored only in the Y row.
                        d = d+fabs(Y_data[j])
                        j = j+1

                # At most one of the two rows still has entries left; add them.
                while i < X_indptr[px+1]:
                    d = d+fabs(X_data[i])
                    i = i+1
                while j < Y_indptr[py+1]:
                    d = d+fabs(Y_data[j])
                    j = j+1

                D[px,py] = d

In [None]:
import sklearn.preprocessing as skp
import sklearn.metrics.pairwise as smp

def year_diffs(graph):
    """Return, for every edge node -> successor, year(node) - year(successor).

    Edges are visited in node iteration order, then successor order.
    """
    diffs = []
    for source in graph.nodes:
        for target in list(graph.successors(source)):
            source_year = graph.nodes[source]['year']
            target_year = graph.nodes[target]['year']
            diffs.append(source_year - target_year)
    return diffs

def neighbor_similarity(graph, tfidf):
    """Cosine similarity between each node and each of its successors.

    Column ``i`` of ``tfidf`` is taken to be the vector of the ``i``-th node
    in ``graph.nodes`` iteration order. Returns one similarity per edge, in
    node order then successor order.
    """
    nodes = list(graph.nodes)
    # Look up column positions once; list.index inside the edge loop is a
    # linear scan for every edge endpoint.
    column = {node: i for i, node in enumerate(nodes)}
    similarities = []
    for node in nodes:
        for neighbor in graph.successors(node):
            similarities.append(
                smp.cosine_similarity(tfidf[:, column[node]].transpose(),
                                      tfidf[:, column[neighbor]].transpose())[0, 0])
    return similarities

def sparse_manhattan(X,Y=None):
    """Pairwise Manhattan distances between the rows of X and the rows of Y.

    The inputs are validated with sklearn's check_pairwise_arrays, viewed as
    CSR matrices, and handed to the compiled cython_manhattan kernel, which
    fills the (X.shape[0], Y.shape[0]) result array.
    """
    X, Y = smp.check_pairwise_arrays(X, Y)
    rows_x = sp.sparse.csr_matrix(X, copy=False)
    rows_y = sp.sparse.csr_matrix(Y, copy=False)
    # Every cell is written by the kernel, so an uninitialised buffer is fine.
    distances = np.empty(shape=(rows_x.shape[0], rows_y.shape[0]))
    cython_manhattan(rows_x.data, rows_x.indices, rows_x.indptr,
                     rows_y.data, rows_y.indices, rows_y.indptr,
                     distances)
    return distances

def word_diffs(graph, tfidf):
    """Manhattan distance between binarized vectors for every edge.

    ``tfidf`` is binarized and transposed so that each row is one node, the
    full pairwise distance matrix is computed with sparse_manhattan, and the
    entry for each edge node -> successor is returned (node order, then
    successor order).
    """
    dists = sparse_manhattan(X=skp.binarize(tfidf).transpose())
    # Look up matrix positions once instead of scanning the node list with
    # list.index for every edge endpoint.
    position = {node: i for i, node in enumerate(graph.nodes)}
    return [dists[position[node], position[neighbor]]
            for node in position
            for neighbor in graph.successors(node)]

def sum_abs_weight_differences(graph, tfidf):
    """Sum of absolute weight differences, Σ|w_i(node) - w_i(successor)|, per edge.

    Column ``i`` of the sparse matrix ``tfidf`` is taken to be the vector of
    the ``i``-th node in ``graph.nodes`` order. Returns one value per edge,
    in node order then successor order.
    """
    column = {node: i for i, node in enumerate(graph.nodes)}
    diff = []
    for node in column:
        for neighbor in graph.successors(node):
            v1 = tfidf[:, column[node]]
            v2 = tfidf[:, column[neighbor]]
            # Summing over the whole difference vector counts each term once.
            # Concatenating the two index arrays would repeat every term that
            # is non-zero in both vectors and count its difference twice.
            diff.append(abs(v1 - v2).sum())
    return diff

def sum_weight_differences(graph, tfidf):
    """Sum of signed weight differences, Σ(w_i(node) - w_i(successor)), per edge.

    Column ``i`` of the sparse matrix ``tfidf`` is taken to be the vector of
    the ``i``-th node in ``graph.nodes`` order. Returns one value per edge,
    in node order then successor order.
    """
    column = {node: i for i, node in enumerate(graph.nodes)}
    diff = []
    for node in column:
        for neighbor in graph.successors(node):
            v1 = tfidf[:, column[node]]
            v2 = tfidf[:, column[neighbor]]
            # Summing over the whole difference vector counts each term once.
            # Concatenating the two index arrays would repeat every term that
            # is non-zero in both vectors and count its difference twice.
            diff.append((v1 - v2).sum())
    return diff

def bin_distribution(data, steps=30, scale='log'):
    """Histogram ``data`` on log- or linearly-spaced bin edges.

    Parameters
    ----------
    data : array-like of numbers; must be strictly positive for ``scale='log'``.
    steps : number of bin edges (the histogram has ``steps - 1`` bins).
    scale : ``'log'`` or ``'linear'`` spacing between min(data) and max(data).

    Returns
    -------
    (hist, edges, bins): the counts, the edges returned by np.histogram, and
    the edges that were passed in.

    Raises
    ------
    ValueError if ``scale`` is not ``'log'`` or ``'linear'``.
    """
    if scale=='log':
        bins = np.logspace(np.log10(np.min(data)), np.log10(np.max(data)), steps)
    elif scale=='linear':
        bins = np.linspace(np.min(data), np.max(data), num=steps)
    else:
        # Without this branch `bins` would be unbound and fail obscurely below.
        raise ValueError(f"scale must be 'log' or 'linear', got {scale!r}")
    hist, edges = np.histogram(data, bins=bins)
    return hist, edges, bins

def plot_distribution(data):
    """Show and return a log-log scatter of the binned distribution of ``data``.

    The data are binned with bin_distribution (default settings); each marker
    sits at a bin's left edge with height count / len(data).
    """
    counts, _, bin_edges = bin_distribution(data)
    # Alternative normalisation by bin width (not used):
    #     hist_norm = hist/(bins[1:] - bins[:-1])
    markers = go.Scatter(x=bin_edges[:-1],
                         y=counts/len(data),
                         mode='markers')
    x_axis = {'type': 'log', 'title': 'x'}
    y_axis = {'type': 'log', 'title': 'P(x)'}
    figure = go.Figure()
    figure.add_trace(markers)
    figure.update_layout(template='plotly_white', xaxis=x_axis, yaxis=y_axis)
    figure.show()
    return figure

## Priors

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
# Root folder for exported figures; figures are written only when save_fig is True.
# NOTE(review): absolute, machine-specific path.
path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'
save_fig = False

In [None]:
import os

# Topic analysed in the "Priors" cells and the sub-folder its figures go to.
topic = 'anatomy'
path_plot = '3 model ex prior'

# makedirs also creates missing parent folders (os.mkdir fails when
# path_fig/path_plot does not exist yet) and is a no-op if the folder exists.
os.makedirs(os.path.join(path_fig, path_plot, topic), exist_ok=True)

In [None]:
# Distribution of the stored tf-idf weights of the selected topic.
fig = plot_distribution(networks[topic].graph.graph['tfidf'].data)
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'tf_idf_distribution.pdf'))

In [None]:
# Per-edge year difference and Manhattan distance, with a linear fit of
# distance against |Δyear|.
yd = year_diffs(networks[topic].graph)
wd = word_diffs(networks[topic].graph, networks[topic].graph.graph['tfidf'])
a, b, fit_r, p, stderr = sp.stats.linregress(np.abs(yd), wd)
fig = go.Figure()
# The regression and the scatter use |yd|, so the fitted line has to span
# [0, max |yd|]; max(yd) alone ignores large negative differences.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=np.abs(yd), y=wd,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1f} x + {b:.1f}"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r:.2f}, p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'manhattan distance'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'manhattan.pdf'))

In [None]:
# Per-edge signed sum of weight differences, with a linear fit against |Δyear|.
sum_weight_diffs = sum_weight_differences(networks[topic].graph,
                                          networks[topic].graph.graph['tfidf'])
a, b, fit_r_sum_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_weight_diffs)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.abs(yd),
                         y=sum_weight_diffs,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
# The regression and the scatter use |yd|, so the fitted line has to span
# [0, max |yd|]; max(yd) alone ignores large negative differences.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1e} x + {b:.1f}\n"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r_sum_weight:.2f}; p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'Σ Δw_i'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_diff_weights.pdf'))

The distribution of word weights does not change significantly over time.

In [None]:
import plotly.figure_factory as ff
# Distribution of the per-edge signed weight-difference sums, overlaid with a
# normal density that uses the sample mean and standard deviation.
# NOTE(review): this linregress call repeats an earlier cell and its results
# are not used in this cell.
a, b, fit_r_sum_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_weight_diffs)
mu_swd, std_swd = np.mean(sum_weight_diffs), np.std(sum_weight_diffs)
fig = ff.create_distplot([sum_weight_diffs], ['edges'], bin_size=1)
x = np.linspace(min(sum_weight_diffs), max(sum_weight_diffs), 100)
fig.add_trace(go.Scatter(x=x, y=sp.stats.norm.pdf(x, mu_swd, std_swd),
                         name='normal fit'))
fig.update_layout(template='plotly_white', title=topic,
                  xaxis={'title': 'Σ Δw_i'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_diff_weights_dstr.pdf'))

In [None]:
# Per-edge sum of absolute weight differences, with a linear fit against |Δyear|.
sum_abs_weight_diffs = sum_abs_weight_differences(networks[topic].graph,
                                                  networks[topic].graph.graph['tfidf'])
a, b, fit_r_sum_abs_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_abs_weight_diffs)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.abs(yd),
                         y=sum_abs_weight_diffs,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
# The regression and the scatter use |yd|, so the fitted line has to span
# [0, max |yd|]; max(yd) alone ignores large negative differences.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1e} x + {b:.1f}\n"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r_sum_abs_weight:.2f}; p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'Σ |Δw_i|'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_abs_diff_weights.pdf'))

In [None]:
# Cosine similarity for every edge of the selected topic, and the parameters
# of a normal distribution fitted to those similarities.
neighbors = neighbor_similarity(networks[topic].graph, networks[topic].graph.graph['tfidf'])
mu_n, std_n = sp.stats.norm.fit(neighbors)
mu_n, std_n

In [None]:
import plotly.figure_factory as ff
# Distribution of edge cosine similarities with the fitted normal density.
fig = ff.create_distplot([neighbors], ['edges'], bin_size=.05)
x = np.linspace(min(neighbors), max(neighbors), 100)
fig.add_trace(go.Scatter(x=x, y=sp.stats.norm.pdf(x, mu_n, std_n),
                         name='normal fit'))
fig.update_layout(template='plotly_white', title=topic,
                  xaxis={'title': 'cosine similarity'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'cosine_distribution.pdf'))

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
# Table of the five top-weighted words for every node that has a non-empty
# tf-idf column.
# NOTE(review): `stoplist` is not used anywhere in the visible notebook.
stoplist=set('for a of the and to in'.split())
nodes = []
words = []
graph = networks[topic].graph
tfidf = networks[topic].graph.graph['tfidf']
# Materialise the node list once instead of rebuilding it on every iteration.
node_names = list(graph.nodes)
for i in range(tfidf.shape[1]):
    node = node_names[i]
    if tfidf[:,i].data.size == 0:
#         print(node, tfidf[:,i].data)
        continue
    top_words, idx = wiki.Model.find_top_words(tfidf[:,i], dct, top_n=5)
    nodes += [node]
    words += [top_words]
pd.DataFrame(data={'Node': nodes, 'Top words': words})

In [None]:
# Sanity check: Monte Carlo mean of the summed absolute difference between two
# groups of k uniform random integers in [0, n), displayed next to the closed
# form on the last line.
# NOTE(review): no random seed is set, so the sampled value changes between runs.
n=10
k=4
x = np.sum(np.abs(np.random.randint(0,n,(k,100000))-np.random.randint(0,n,(k,100000))), axis=0)
# np.mean(x), k*2*np.sum(np.arange(1,n)*np.flip(np.arange(1,n))) * ((1/n)**2)
np.mean(x), k * np.sum( ((1/n)**2) * np.sum(np.abs(np.array([np.arange(n)]).transpose()-np.arange(n)),
                                            axis=0) )
# k * Σ_i P(x_i) * Σ_j |x_i-x_j|

In [None]:
# Empirical mean absolute difference between two weights drawn at random
# (with replacement) from the stored tf-idf values.
tfidf = networks[topic].graph.graph['tfidf']
# rvs(n) samples stored tf-idf weights; n is the output shape.
rvs = lambda n: tfidf.data[np.random.choice(tfidf.data.size, size=n)]
emp = np.mean(np.sum(np.abs(rvs((1,100000))-rvs((1,100000))), axis=0))

In [None]:
# Histogram-based ("theoretical") estimate of the same quantity as `emp`.
h,e,_ = bin_distribution(tfidf.data, 100, 'linear')
p_x = h/len(tfidf.data)
# Bin centres as a 1 x n_bins row vector.
x = np.array([np.average([e[:-1],e[1:]], axis=0)])
# E|X - Y| for independent X, Y ~ p_x:  Σ_j p_j Σ_i p_i |x_i - x_j|.
# The probabilities weight the absolute differences; they must not scale the
# bin centres inside the absolute value.
the = np.sum(p_x * np.sum(p_x[:, np.newaxis] * np.abs(x.transpose() - x), axis=0))
emp, the

## Run models

### Initialize

In [None]:
from IPython.display import display, HTML
import os
import dill
import datetime

In [None]:
# One (initially empty) list of models per topic, plus an empty frame that the
# run loop appends per-topic prior statistics to.
# NOTE: the loop rebinds the globals `topic` and `network`, leaving them at the
# last entry of `networks`.
models = {}
stats = pd.DataFrame()
for topic, network in networks.items():
    models[topic] = []

In [None]:
# Index (into the ascending list of parent-node years) of the cutoff year.
first_n_nodes = 10


def start_condition(m):
    """Return the parent-graph nodes whose year is <= the cutoff year.

    The cutoff is the year at position ``first_n_nodes`` in the ascending list
    of all parent-node years. It is computed once per call rather than being
    re-sorted for every node. Raises IndexError when the parent graph has
    nodes but no more than ``first_n_nodes`` of them.
    """
    parent_nodes = m.graph_parent.nodes
    years = sorted(parent_nodes[n]['year'] for n in parent_nodes)
    if not years:
        return []
    year_cutoff = years[first_n_nodes]
    return [n for n in parent_nodes if parent_nodes[n]['year'] <= year_cutoff]


def end_condition(m):
    """Stop once the model graph has at least as many nodes as its parent graph, or after year 2500."""
    return (len(m.graph.nodes) >= len(m.graph_parent.nodes)) or \
           (m.year > 2500)


n_seeds = 2
n_models = 1

In [None]:
# Folder the fitted models are dumped to when save_models is True.
# NOTE(review): absolute, machine-specific path.
base_dir = os.path.join('/', 'Users', 'harangju', 'Developer', 'data', 'wiki', 'simulations')
save_models = True
base_dir

### Run models

In [None]:
import copy
# Work on a deep copy of the loaded networks with 'anatomy' and 'biochemistry'
# removed; the run loop below iterates over this reduced dictionary.
_networks = copy.deepcopy(networks)
_networks.pop('anatomy', None)
_networks.pop('biochemistry', None)
_networks

In [None]:
# For every topic: estimate prior statistics from the empirical graph, derive
# the mutation probabilities, run n_models simulations, and checkpoint all
# models after each topic.
for topic, network in _networks.items():
    print(topic)
    print('\tAnalyzing priors...')
    tfidf = network.graph.graph['tfidf']
    yd = year_diffs(network.graph)
    # Manhattan distance vs |Δyear|.
    md = word_diffs(network.graph, tfidf)
    a_md, b_md, r_md, p_md, stderr = sp.stats.linregress(np.abs(yd), md)
    # Sum of absolute weight differences vs |Δyear|.
    swd = sum_abs_weight_differences(network.graph, tfidf)
    a_swd, b_swd, r_swd, p_swd, stderr = sp.stats.linregress(np.abs(yd), swd)
    # Bind this topic's weights as a default argument: a plain closure would
    # look up the global `tfidf` at call time and silently switch to another
    # topic's weights once the loop has moved on.
    rvs = lambda n, data=tfidf.data: data[np.random.choice(data.size, size=n)]
    mu_sawd = np.mean(np.sum(np.abs(rvs((1,100000))-rvs((1,100000))), axis=0))
    nb = neighbor_similarity(network.graph, tfidf)
    mu_nb, std_nb = sp.stats.norm.fit(nb)
    p_point, p_insert, p_delete = a_swd/mu_sawd, a_md/2, a_md/2
    new_stats = pd.DataFrame([[p_point,p_insert,p_delete,
                               a_md,b_md,r_md,p_md,
                               a_swd,b_swd,r_swd,p_swd,
                               mu_sawd,mu_nb,std_nb]],
                             columns=['p_pt', 'p_in', 'p_de',
                                      'a (man)', 'b (man)', 'r (man)', 'p (man)',
                                      'a (swd)', 'b (swd)', 'r (swd)', 'p (swd)',
                                      'mu (sawd)', 'mu (nei)', 'std (nei)'
                                     ])
    display(HTML(new_stats.to_html()))
    stats = pd.concat([stats, new_stats], ignore_index=True)
    for i in range(n_models):
        print(f"\tRunning model {i}...")
        model = wiki.Model(graph_parent=network.graph,
                           vectors_parent=tfidf,
                           year_start=sorted(list(nx.get_node_attributes(network.graph, 'year')\
                                                    .values()))[first_n_nodes],
                           start_nodes=start_condition,
                           n_seeds=n_seeds,
                           dct=dct,
                           point=(1, p_point),
                           insert=(1, p_insert, list(set(tfidf.indices))),
                           delete=(1, p_delete),
                           rvs=rvs,
                           # Same late-binding protection as for `rvs`.
                           create=lambda n, mu=mu_nb, sigma=std_nb:
                               np.random.normal(loc=mu, scale=sigma, size=n))
        models[topic].append(model)
        model.evolve(until=end_condition)
    if save_models:
        now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
        # The context manager closes (and flushes) the checkpoint file.
        with open(os.path.join(base_dir, f"models_{now}.pickle"), 'wb') as models_file:
            dill.dump(models, models_file)

### Save models

### Load models

In [None]:
import os

# Absolute path to the simulations folder. Without the leading '/' component
# the path is relative to the current working directory and does not match
# the folder the models were saved to.
base_dir = os.path.join('/', 'Users', 'harangju', 'Developer', 'data', 'wiki', 'simulations')


### Compute similarity

In [None]:
# Add a 'Similarity (parent)' column to model.record: cosine similarity between
# each record's 'Seed vectors' entry and the vector of its 'Parent' node.
# NOTE(review): `model` is whatever the run loop assigned last.
sim = lambda a,b: smp.cosine_similarity(a.transpose(), b.transpose())[0,0]
nodes = list(model.graph.nodes)
model.record['Similarity (parent)'] = [sim(model.record.iloc[i]['Seed vectors'],
                                           model.vectors[:,nodes.index(
                                               model.record.iloc[i]['Parent'])])
                                       for i in range(len(model.record.index))]
model.record

### Interesting thought
If it weren't for the Middle Ages, we would have an amount of knowledge in the 16th Century that is similar to what we have now. But if we run the model after the Dark Ages, the model is accurate (?).

In [None]:
# Same computation as the 'Similarity (parent)' column, stored under
# 'Similarity to parent', plus a 'Parent seed' label (parent name + seed number).
s = lambda a,b: smp.cosine_similarity(a.transpose(), b.transpose())[0,0]
nodes = list(model.graph.nodes)
model.record['Similarity to parent'] = [s(model.record.iloc[i]['Seed vectors'],
                                          model.vectors[:,nodes.index(model.record.iloc[i]['Parent'])])
                                        for i in range(len(model.record.index))]
model.record['Parent seed'] = model.record['Parent'] + ' ' + model.record['Seed number'].map(str)

In [None]:
# One line per parent/seed pair: similarity to the parent over simulated years.
plt.figure(figsize=(16,10))
ax = sns.lineplot(x='Year', y='Similarity to parent', hue='Parent seed', legend=False,
                  data=model.record)
plt.ylim([0,1.1]);

## Posteriors

In [None]:
import cufflinks as cf
cf.go_offline()
from ipywidgets import interact, widgets, Layout
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from IPython.display import display

### Interactive plots
[FigureWidgets](https://plot.ly/python/v3/figurewidget-app/)

In [None]:
# Demo data from cufflinks for the FigureWidget example below, re-indexed 0..N-1.
df = cf.datagen.lines(5,1000).reset_index(drop=True)
df

In [None]:
def update(change):
    """Slider callback: show only the rows of `df` whose index is above the slider value.

    Reads the global `fig`, which is created further down in this cell.
    """
    with fig.batch_update():
        fig.data[0].x = df.index[df.index > change.new]
        fig.data[0].y = df[df.columns[0]][df.index > change.new]
min_idx = min(df.index)
max_idx = max(df.index)

# Integer slider over the index range; `update` runs on every value change.
slider = widgets.IntSlider(value=0, min=min_idx, max=max_idx,
                           step=1, description='Year', continuous_update=True,
                           layout=Layout(width='auto'))
slider.observe(update, names='value')
display(slider)

# Figure showing the first column of `df`; the callback edits its trace in place.
fig = go.FigureWidget()
fig.add_trace(go.Scatter(x=df.index, y=df[df.columns[0]], name=df.columns[0], mode='lines'))
fig.update_layout(title='Title', xaxis_title='Index', yaxis_title='Y', template='plotly_white')
fig


### Plots

#### Degree distribution

**Interpretation**

There are too many connections. Similarity to the parent isn't actually changing that much.

In [None]:
# Degree histograms of the empirical graph (`graph`) and the simulated graph.
# NOTE(review): `graph` and `model` come from earlier cells and may belong to
# different topics; confirm they match before comparing.
fig = go.Figure()
fig.add_trace(go.Histogram(x=[d for _,d in graph.degree], nbinsx=30, name='empirical'))
fig.add_trace(go.Histogram(x=[d for _,d in model.graph.degree], nbinsx=30, name='model'))
# Each histogram sample is one node's degree, so the y-axis counts nodes.
fig.update_layout(title='Degree distribution', template='plotly_white',
                  xaxis_title='degree', yaxis_title='number of nodes')

#### Network growth

In [None]:
# Node years of the simulated graph in ascending order, with a running count
# ('Year (cumsum)') of nodes up to and including each row.
years = pd.DataFrame([model.graph.nodes[node]['year'] for node in model.graph.nodes],
                     columns=['Year'])\
          .sort_values(by='Year')\
          .reset_index(drop=True)
# Helper column of ones whose cumulative sum gives the running node count.
years['count'] = 1
years['Year (cumsum)'] = years['count'].cumsum()
years = years.drop(columns='count')
years

In [None]:
# 2-D Kamada-Kawai layout of the whole simulated graph, flattened into node
# coordinate lists (Xn, Yn) and edge segment lists (Xe, Ye). Each edge adds
# its two endpoints followed by None to the segment lists.
nodes = list(model.graph.nodes)
layout = nx.kamada_kawai_layout(model.graph, dim=2)
# layout = nx.spring_layout(model.graph, dim=3)
# Stack positions into an array with one row per node, in `nodes` order.
layout = np.vstack([layout[node] for node in nodes])
Xn = [layout[k][0] for k in range(len(nodes))]
Yn = [layout[k][1] for k in range(len(nodes))]
# Zn = [layout[k][2] for k in range(len(nodes))]
Xe = []
Ye = []
# Ze = []
for e in model.graph.edges:
    Xe += [layout[nodes.index(e[0])][0], layout[nodes.index(e[1])][0], None]
    Ye += [layout[nodes.index(e[0])][1], layout[nodes.index(e[1])][1], None]
#     Ze += [layout[nodes.index(e[0])][2], layout[nodes.index(e[1])][2], None]

In [None]:
def graph_layout(graph, nodes):
    """Plot coordinates for the sub-graph of ``graph`` induced by ``nodes``.

    The Kamada-Kawai layout is computed on the full ``graph`` and only the
    selected nodes and the edges among them are returned.

    Returns ``((Xn, Yn), (Xe, Ye))``: node coordinates, and edge segment
    coordinates where each edge adds its two endpoints followed by ``None``.
    """
    # Use the graph that was passed in; the previous version silently read the
    # global `model.graph` here.
    subgraph = graph.subgraph(nodes)
    layout = nx.kamada_kawai_layout(graph, dim=2)
    Xn = [layout[n][0] for n in subgraph.nodes]
    Yn = [layout[n][1] for n in subgraph.nodes]
    Xe = []
    Ye = []
    for e in subgraph.edges:
        Xe += [layout[e[0]][0], layout[e[1]][0], None]
        Ye += [layout[e[0]][1], layout[e[1]][1], None]
    return (Xn, Yn), (Xe, Ye)

In [None]:
# Ascending node years of the empirical graph and the matching running count
# 1, 2, ..., N (cumulative sum of ones).
years_emp = np.array(sorted([graph.nodes[n]['year'] for n in graph.nodes]))
years_emp_dist = np.cumsum(np.ones(shape=len(years_emp)))
len(years_emp), len(years_emp_dist)

In [None]:
# Cumulative number of nodes over time: empirical graph vs simulated graph.
fig = go.Figure()
fig.add_trace(go.Scatter(x=years_emp,
                         y=years_emp_dist,
                         name='empirical'))
fig.add_trace(go.Scatter(x=years.Year,
                         y=years['Year (cumsum)'],
                         name='model'))
fig.update_layout(title='Discoveries',
                  xaxis_title='Year',
                  yaxis_title='Number of discoveries',
                  template='plotly_white')

In [None]:
# fig.write_image('fig1.svg');

In [None]:
def update_network(change):
    """Slider callback: redraw the network with the nodes whose year is <= the slider value.

    Reads the globals `fig` and `model`. The layout is recomputed by
    graph_layout on every call.
    """
    with fig.batch_update():
        (Xn, Yn), (Xe, Ye) = graph_layout(model.graph,
                                          [n for n in model.graph.nodes
                                           if model.graph.nodes[n]['year']<=change.new])
        # Trace 0 holds the edges, trace 1 the nodes.
        fig.data[0].x = Xe
        fig.data[0].y = Ye
        fig.data[1].x = Xn
        fig.data[1].y = Yn
        fig.layout.title = model.graph.name + ', year: ' + str(change.new)
        fig.update_xaxes(range=[-1.2,1.2])
        fig.update_yaxes(range=[-1.2,1.2])

# Interactive network view: a year slider plus a FigureWidget that initially
# shows only the nodes born in the earliest year.
nodes = list(model.graph.nodes)

min_year = min([model.graph.nodes[n]['year'] for n in nodes])
max_year = max([model.graph.nodes[n]['year'] for n in nodes])
slider_network = widgets.IntSlider(value=min_year, min=min_year, max=max_year,
                                   step=1, description='Year', continuous_update=True,
                                   layout=Layout(width='auto'))
slider_network.observe(update_network, names='value')
display(slider_network)

(Xn, Yn), (Xe, Ye) = graph_layout(model.graph,
                                  [n for n in model.graph.nodes
                                   if model.graph.nodes[n]['year']==min_year])

# Edge trace (index 0) and node trace (index 1); update_network relies on this order.
trace1 = go.Scatter(x=Xe, y=Ye,# z=Ze,
                    mode='lines', line=dict(color='gray', width=.5),
                    hoverinfo='none')
# NOTE(review): `text=nodes` lists every node of the graph while Xn/Yn only
# cover the displayed subset, and the callback never updates it; hover labels
# may not line up with the markers. Confirm.
trace2 = go.Scatter(x=Xn, y=Yn,# z=Zn,
                      mode='markers',
                      marker=dict(symbol='circle', size=6,
#                                color=group,
                                  colorscale='Viridis',
                                  line=dict(color='rgb(50,50,50)', width=0.5)),
                      text=nodes, hoverinfo='text')
axis = dict(showbackground=False,
            showline=False,
            zeroline=False,
            showgrid=False,
            showticklabels=False,
            title='')
fig = go.Figure(data=[trace1, trace2],
                layout=go.Layout(title=topic + ', year: ' + str(min_year),
                                 width=600,#1000,
                                 height=600,
                                 showlegend=False,
                                 scene=dict(xaxis=dict(axis),
                                            yaxis=dict(axis),
                                            zaxis=dict(axis),),
                                 hovermode='closest',
                                 template='plotly_white'))
fig = go.FigureWidget(fig)
fig.update_xaxes(range=[-1.2,1.2])
fig.update_yaxes(range=[-1.2,1.2])
fig

**Comments**

Too many connections in new nodes. So, try
* restricting title words to uncommon words?

#### Similarity

##### Cosine similarity

In [None]:
import itertools as it

# Cosine similarity between two column vectors.
sim = lambda a,b: smp.cosine_similarity(a.transpose(), b.transpose())[0,0]

# One row per simulated node, ordered by birth year. 'Similarity (neighbor)'
# holds, for each node, the list of similarities to those successors and
# predecessors whose year is not later than the node's own year.
nodes = list(model.graph.nodes)
births = pd.DataFrame({'Node': nodes,
                       'Year': [model.graph.nodes[n]['year'] for n in nodes]})\
           .sort_values(by=['Year'])\
           .reset_index(drop=True)
births['Similarity (neighbor)'] = [[sim(model.vectors[:,nodes.index(births.iloc[i].Node)],
                                        model.vectors[:,nodes.index(neighbor)])
                                    for neighbor in it.chain(model.graph.successors(births.iloc[i].Node),
                                                             model.graph.predecessors(births.iloc[i].Node))
                                    if model.graph.nodes[neighbor]['year'] <= births.iloc[i].Year]
                                   for i in births.index]
births

In [None]:
# Fixed upper limit of the histogram y-axis.
max_y = 180

def update_similarity(change):
    """Slider callback: refresh the two model histograms for the selected year.

    Trace 1 gets the neighbour similarities of all nodes born up to the year;
    trace 2 gets the parent similarities recorded exactly in that year.
    """
    with fig.batch_update():
        fig.data[1].x = [j for i in births[births.Year<=change.new]['Similarity (neighbor)']
                         for j in i]
        fig.data[2].x = model.record['Similarity (parent)'][model.record.Year == change.new]
        fig.update_xaxes(range=[0,1])
        fig.update_yaxes(range=[0,max_y])

min_year = min(model.record.Year)
max_year = max(model.record.Year)
slider = widgets.IntSlider(value=min_year, min=min_year, max=max_year,
                           step=1, description='Year', continuous_update=True,
                           layout=Layout(width='auto'))
slider.observe(update_similarity, names='value')
display(slider)

# Trace 0: empirical edge similarities (`neighbors` from the Priors section);
# traces 1 and 2: model neighbour and parent similarities.
# NOTE(review): the initial neighbour histogram uses min_year+50 while the
# slider starts at min_year; confirm this offset is intended.
fig = go.FigureWidget()
fig.add_trace(go.Histogram(x=neighbors,
                           name='empirical'))
fig.add_trace(go.Histogram(x=[j for i in births[births.Year<=min_year+50]['Similarity (neighbor)']
                              for j in i],
                           name='model (neighbor)'))
fig.add_trace(go.Histogram(x=model.record[model.record.Year==min_year]['Similarity (parent)'],
                           name='model (parent)'))
fig.update_layout(title='Cosine similarity', template='plotly_white',
                  xaxis_title='cosine similarity', yaxis_title='number of edges')
fig.update_xaxes(range=[0,1])
fig.update_yaxes(range=[0,max_y])
fig


##### Manhattan distance

#### Something

In [None]:
# Cosine similarity of neighbours vs non-neighbours: empirical graph on the
# left, simulated graph on the right, each with a normal fit to the neighbours.
# NOTE(review): `non_neighbors` and `non_neighbor_similarity` are not defined
# in the visible part of this notebook; this cell fails on a fresh kernel
# unless they are defined elsewhere.
plt.figure(figsize=(16,4))
plt.subplot(121)
sns.distplot(neighbors)
x = np.linspace(min(neighbors), max(neighbors), 100)
mu, std = sp.stats.norm.fit(neighbors)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
sns.distplot(non_neighbors)
plt.title(topic + ' (prior)')
plt.legend([f"fit-neighbors (m={mu:.2f}; s={std:.2f})", 'neighbors', 'non-neighbors'])
plt.xlabel('cos similarity');
plt.xlim([-.2,1.2])
plt.subplot(122)
neighbors_model = neighbor_similarity(model.graph, model.vectors)
non_neighbors_model = non_neighbor_similarity(model.graph, model.vectors)
sns.distplot(neighbors_model)
x = np.linspace(min(neighbors_model), max(neighbors_model), 100)
mu, std = sp.stats.norm.fit(neighbors_model)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
sns.distplot(non_neighbors_model)
plt.title(topic + ' (model)')
plt.legend([f"fit-neighbors (m={mu:.2f}; s={std:.2f})", 'neighbors', 'non-neighbors'])
plt.xlabel('cos similarity')
plt.xlim([-.2,1.2]);

In [None]:
# First figure: histogram of node years with an exponential fit a*b^x to the
# bin counts, for the empirical graph (left) and the simulated graph (right).
plt.figure(figsize=(16,4))
plt.subplot(121)
bin_size=25
years = [graph.nodes[node]['year'] for node in graph.nodes]
sns.distplot(years, bins=bin_size, rug=True, kde=False)
hist, bin_edges = np.histogram(years, bins=bin_size)
# The counts are fitted against the right edge of each bin.
popt, pcov = sp.optimize.curve_fit(lambda x,a,b: a*pow(b,x), bin_edges[1:], hist)
x = np.linspace(min(years), max(years), 100)
sns.lineplot(x=x, y=popt[0]*pow(popt[1],x))
plt.legend([f"a*b^x; a={popt[0]:.1e}, b={popt[1]:.4f}"])
plt.title('prior')
plt.ylabel('discoveries')
plt.xlabel('year');

plt.subplot(122)
years = [model.graph.nodes[node]['year'] for node in model.graph.nodes]
sns.distplot(years, bins=bin_size, rug=True, kde=False)
hist, bin_edges = np.histogram(years, bins=bin_size)
popt, pcov = sp.optimize.curve_fit(lambda x,a,b: a*pow(b,x), bin_edges[1:], hist)
x = np.linspace(min(years), max(years), 100)
sns.lineplot(x=x, y=popt[0]*pow(popt[1],x))
plt.legend([f"a*b^x; a={popt[0]:.1e}, b={popt[1]:.4f}"])
plt.title('model')
plt.ylabel('discoveries')
plt.xlabel('year');

# Second figure: for each sorted year, the number of nodes with a strictly
# earlier year, for the empirical and the simulated graph.
plt.figure(figsize=(16,4))
bin_size=25
years = [graph.nodes[node]['year'] for node in graph.nodes]
sns.distplot(years, bins=bin_size, rug=True, kde=False, hist=False)
# hist, bin_edges = np.histogram(years, bins=bin_size)
# popt, pcov = sp.optimize.curve_fit(lambda x,a,b: a*pow(b,x), bin_edges[1:], hist)
# x = np.linspace(min(years), max(years), 100)
# sns.lineplot(x=x, y=popt[0]*pow(popt[1],x))
sns.lineplot(x=sorted(years),
             y=np.sum(np.array([sorted(years)]).transpose() < np.array([sorted(years)]), axis=0))

years = [model.graph.nodes[node]['year'] for node in model.graph.nodes]
sns.distplot(years, bins=bin_size, rug=True, kde=False, hist=False)
hist, bin_edges = np.histogram(years, bins=bin_size)
# popt_model, pcov = sp.optimize.curve_fit(lambda x,a,b: a*pow(b,x), bin_edges[1:], hist)
# x = np.linspace(min(years), max(years), 100)
# sns.lineplot(x=x, y=popt_model[0]*pow(popt_model[1],x))
sns.lineplot(x=sorted(years),
             y=np.sum(np.array([sorted(years)]).transpose() < np.array([sorted(years)]), axis=0))

plt.legend([#f"prior: a*b^x; a={popt[0]:.1e}, b={popt[1]:.4f}",
            f"prior: count",
            #f"model: a*b^x; a={popt_model[0]:.1e}, b={popt_model[1]:.4f}",
            f"model: count"])
plt.ylabel('discoveries')
plt.xlabel('year');

In [None]:
# Power-law fits of the weight distributions: empirical (left) vs model (right).
# NOTE(review): `fit` and the `powerlaw` module are not defined or imported in
# the visible part of this notebook; this cell fails on a fresh kernel unless
# they are provided elsewhere.
plt.figure(figsize=(16,6))
plt.subplot(121)
fit.plot_pdf()
fit.power_law.plot_pdf()
plt.title(f"empirical xmin={fit.xmin:.1e}, α={fit.alpha:.1f}");
plt.subplot(122)
fit_model = powerlaw.Fit(model.vectors.data)
fit_model.plot_pdf()
fit_model.power_law.plot_pdf()
plt.title(f"model xmin={fit_model.xmin:.1e}, α={fit_model.alpha:.1f}");

In [None]:
# Joint regression plot of |Δyear| against Manhattan distance, using whatever
# `yd` and `wd` currently hold from earlier cells.
sns.jointplot(x=np.abs(yd), y=wd, kind='reg',
              marginal_kws=dict(bins=15, rug=True))
plt.xlabel('Δyear')
plt.ylabel('manhattan distance');

In [None]:
# 4 x 2 grid comparing the empirical graph (prior) with the simulated graph
# (model): year differences, Manhattan distance vs |Δyear| with its
# distribution, and cosine similarity vs |Δyear|.
n_rows = 4
plt.figure(figsize=(16,n_rows*6))

# wd = word_diffs(graph, tfidf)
# yd = year_diffs(graph)

plt.subplot(n_rows,2,1)
sns.distplot(yd)
plt.title(topic + ' prior')
plt.xlabel('year difference')

plt.subplot(n_rows,2,2)
yd_model = year_diffs(model.graph)
sns.distplot(yd_model)
plt.title(topic + ' model')
plt.xlabel('year difference');

plt.subplot(n_rows,2,3)
sns.scatterplot(x=np.abs(yd), y=wd)
slope, intercept, r, p, stderr = sp.stats.linregress(np.abs(yd), wd)
# The fit is on |yd|, so the line spans [0, max |yd|], as in the model panels.
x = np.linspace(0, max(np.abs(yd)), 100)
# x and y are passed by keyword: seaborn no longer accepts them positionally.
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (prior)")
plt.xlabel('year')
plt.ylabel('manhattan distance');

plt.subplot(n_rows,2,4)
sns.distplot(wd)
mu, std = sp.stats.norm.fit(wd)
x = np.linspace(min(wd), max(wd), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.xlabel('manhattan distance')
plt.ylabel('probability distribution');
plt.title(f"μ={mu:.2}, σ={std:.2} (prior)")

wd_model = word_diffs(model.graph, model.vectors)
yd_model = year_diffs(model.graph)
neighbors_model = neighbor_similarity(model.graph, model.vectors)

plt.subplot(n_rows,2,5)
sns.scatterplot(x=np.abs(yd_model), y=wd_model)
slope, intercept, r, p, stderr = sp.stats.linregress(np.abs(yd_model), wd_model)
x = np.linspace(0, max(np.abs(yd_model)), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (model)")
plt.xlabel('year')
plt.ylabel('manhattan distance');

plt.subplot(n_rows,2,6)
sns.distplot(wd_model)
mu, std = sp.stats.norm.fit(wd_model)
x = np.linspace(min(wd_model), max(wd_model), 100)
plt.plot(x, sp.stats.norm.pdf(x, mu, std))
plt.xlabel('manhattan distance')
plt.ylabel('probability distribution');
plt.title(f"μ={mu:.2}, σ={std:.2} (model)");

plt.subplot(n_rows,2,7)
sns.scatterplot(x=np.abs(yd), y=neighbors)
slope, intercept, r, p, stderr = sp.stats.linregress(np.abs(yd), neighbors)
x = np.linspace(0, max(np.abs(yd)), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (prior)")
plt.xlabel('Δyear')
plt.ylabel('cosine similarity');

plt.subplot(n_rows,2,8)
sns.scatterplot(x=np.abs(yd_model), y=neighbors_model)
slope, intercept, r, p, stderr = sp.stats.linregress(np.abs(yd_model), neighbors_model)
x = np.linspace(0, max(np.abs(yd_model)), 100)
sns.lineplot(x=x, y=np.multiply(slope, x) + intercept)
plt.title(f"slope={slope:.2f}; r={r:.2f}; p={p:.1e} (model)")
plt.xlabel('Δyear')
plt.ylabel('cosine similarity');

In [None]:
# Left: simulated weights against their stored index. Right: their distribution.
# NOTE(review): plot_distribution builds and shows a plotly figure, so it does
# not draw into the matplotlib subplot selected just before the call.
plt.figure(figsize=(16,6))

plt.subplot(121)
sns.scatterplot(x='index', y='weight',
                data=pd.DataFrame({'index': model.vectors.indices,
                                   'weight': model.vectors.data}))
plt.ylim([-.1,1.1]);

plt.subplot(122)
plot_distribution(model.vectors.data)

In [None]:
# Empirical graph (left) and simulated graph (right); nodes with a year before
# -500 are drawn in red, all others in blue.
plt.figure(figsize=(16,6))
plt.subplot(121)
nx.draw_networkx(graph, node_color=['r' if graph.nodes[n]['year']<-500 else 'b'
                                    for n in graph.nodes])
plt.title('original graph')
plt.subplot(122)
nx.draw_networkx(model.graph, node_color=['r' if model.graph.nodes[n]['year']<-500 else 'b'
                                          for n in model.graph.nodes])
plt.title('new graph');

In [None]:
# Overlaid degree distributions of the empirical ('prior') and simulated
# ('model') graphs; the x-axis is limited to [-10, 110].
plt.figure(figsize=(16,6))
sns.distplot([d for _,d in graph.degree], bins=30)
sns.distplot([d for _,d in model.graph.degree], bins=30)
plt.legend(['prior', 'model'])
plt.xlim([-10,110]);

### Discussion

The point of this model is that one can model knowledge discovery as incremental changes on existing knowledge.

Under the mutation model, similarity to the parent does not decrease monotonically.