# Initialization

In [None]:
%reload_ext autoreload
%autoreload 2
%reload_ext cython
%reload_ext line_profiler
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp
import seaborn as sns
import cufflinks as cf
import matplotlib.pyplot as plt

In [None]:
# Topic names used throughout the notebook; each one is also the stem of the
# pickled graph file that is loaded for it.
topics = [
    'anatomy',
    'biochemistry',
    'cognitive science',
    'evolutionary biology',
    'genetics',
    'immunology',
    'molecular biology',
    'chemistry',
    'biophysics',
    'energy',
    'optics',
    'earth science',
    'geology',
    'meteorology',
    'philosophy of language',
    'philosophy of law',
    'philosophy of mind',
    'philosophy of science',
    'economics',
    'accounting',
    'education',
    'linguistics',
    'law',
    'psychology',
    'sociology',
    'electronics',
    'software engineering',
    'robotics',
    'calculus',
    'geometry',
    'abstract algebra',
    'Boolean algebra',
    'commutative algebra',
    'group theory',
    'linear algebra',
    'number theory',
    'dynamical systems and differential equations',
]

In [None]:
# Directory prefix of the per-topic graph pickles.
# NOTE(review): machine-specific absolute path. The trailing slash is required because
# the loading cell builds file names by plain concatenation (path_saved + topic + '.pickle').
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

In [None]:
# Build one wiki.Net per topic and load its graph from '<path_saved><topic>.pickle'.
# Topic names are echoed on a single line as a progress indicator.
networks = dict()
for topic in topics:
    print(topic, end=' ')
    networks[topic] = wiki.Net()
    networks[topic].load_graph(f"{path_saved}{topic}.pickle")

In [None]:
import pickle

# Load the pickled `dct` object that later cells pass to wiki.Model
# (presumably a gensim dictionary mapping word ids to words -- TODO confirm).
# The file is opened in a `with` block so the handle is closed deterministically.
# NOTE(security): pickle.load can execute arbitrary code; only load files you trust.
with open('/Users/harangju/Developer/data/wiki/models/' + 'dict.model', 'rb') as dict_file:
    dct = pickle.load(dict_file)

# Auxiliary methods

In [None]:
%%cython -f

import numpy as np
cimport numpy as np
from cython cimport floating,boundscheck,wraparound
from cython.parallel import prange

from libc.math cimport fabs

np.import_array()

@boundscheck(False)  # Deactivate bounds checking
@wraparound(False)
def cython_manhattan(floating[::1] X_data, int[:] X_indices, int[:] X_indptr,
                     floating[::1] Y_data, int[:] Y_indices, int[:] Y_indptr,
                     double[:, ::1] D):
    """Pairwise L1 (Manhattan) distances between the rows of two CSR matrices.

    For every row ``px`` of X and every row ``py`` of Y the function writes
    ``sum_k |X[px, k] - Y[py, k]|`` into ``D[px, py]``. Nothing is returned;
    ``D`` is filled in place and every entry of it is overwritten.

    Parameters
    ----------
    X_data, X_indices, X_indptr :
        The ``data`` / ``indices`` / ``indptr`` arrays of the first CSR matrix.
        ``data`` must be a contiguous float or double buffer, ``indices`` and
        ``indptr`` must be C ``int`` buffers.
    Y_data, Y_indices, Y_indptr :
        Same three arrays for the second CSR matrix. ``Y_data`` shares the fused
        ``floating`` type with ``X_data``, so both must have the same dtype.
    D :
        C-contiguous ``double`` output buffer; its shape determines how many rows
        of X (``D.shape[0]``) and of Y (``D.shape[1]``) are visited.

    Notes
    -----
    The merge below compares column indices pairwise and advances the smaller
    one, so it assumes the column indices inside each row are sorted in
    ascending order -- TODO confirm callers pass canonical CSR matrices.
    Bounds checking and negative-index wraparound are disabled by the decorators,
    so inconsistent ``indptr``/``indices``/``D`` shapes are not detected.

    Usage:
    >>> D = np.zeros((X.shape[0], Y.shape[0]))
    >>> cython_manhattan(X.data, X.indices, X.indptr,
    ...                  Y.data, Y.indices, Y.indptr,
    ...                  D)
    """
    cdef np.npy_intp px, py, i, j, ix, iy
    cdef double d = 0.0
    
    cdef int m = D.shape[0]
    cdef int n = D.shape[1]
    
    # The GIL is released; the outer loop over rows of X uses cython.parallel.prange
    # (it only actually runs multi-threaded when the extension is built with OpenMP).
    with nogil:                          
        for px in prange(m):
            for py in range(n):
                # i / j walk the stored entries of row px of X / row py of Y.
                i = X_indptr[px]
                j = Y_indptr[py]
                d = 0.0
                # Merge phase: both rows still have stored entries left.
                while i < X_indptr[px+1] and j < Y_indptr[py+1]:
                    # These two guards are always true inside the loop (same tests as
                    # the while condition); they just load the current column indices.
                    if i < X_indptr[px+1]: ix = X_indices[i]
                    if j < Y_indptr[py+1]: iy = Y_indices[j]
                    
                    if ix==iy:
                        # Column stored in both rows: add |x - y| and advance both.
                        d = d+fabs(X_data[i]-Y_data[j])
                        i = i+1
                        j = j+1
                    
                    elif ix<iy:
                        # Column stored only in X (Y is implicitly 0 there): add |x|.
                        d = d+fabs(X_data[i])
                        i = i+1
                    else:
                        # Column stored only in Y (X is implicitly 0 there): add |y|.
                        d = d+fabs(Y_data[j])
                        j = j+1
                
                # Drain phase: one row is exhausted, the rest of the other row is
                # compared against implicit zeros.
                if i== X_indptr[px+1]:
                    while j < Y_indptr[py+1]:
                        iy = Y_indices[j]
                        d = d+fabs(Y_data[j])
                        j = j+1                                            
                else:
                    while i < X_indptr[px+1]:
                        ix = X_indices[i]
                        d = d+fabs(X_data[i])
                        i = i+1
                        
                D[px,py] = d

In [None]:
import sklearn.preprocessing as skp
import sklearn.metrics.pairwise as smp

def year_diffs(graph):
    """Return the signed year gap of every directed edge in ``graph``.

    For each node (in ``graph.nodes`` order) and each of its successors the
    value ``year(node) - year(successor)`` is appended, where the year is read
    from the ``'year'`` node attribute.

    Returns:
        list: one entry per edge, in node order then successor order.
    """
    gaps = []
    for source in graph.nodes:
        for target in list(graph.successors(source)):
            source_year = graph.nodes[source]['year']
            target_year = graph.nodes[target]['year']
            gaps.append(source_year - target_year)
    return gaps

def neighbor_similarity(graph, tfidf):
    """Cosine similarity between the tf-idf vectors of every edge's endpoints.

    Column ``k`` of ``tfidf`` is taken to be the vector of the ``k``-th node in
    ``graph.nodes`` order. For every node and each of its successors the cosine
    similarity of the two columns is computed with
    ``sklearn.metrics.pairwise.cosine_similarity``.

    Args:
        graph: directed graph exposing ``nodes`` and ``successors(node)``.
        tfidf: 2-D matrix supporting ``tfidf[:, k].transpose()``
            (presumably a scipy sparse matrix, words x documents -- TODO confirm).

    Returns:
        list: one similarity per edge, in node order then successor order.
    """
    # Node -> column lookup built once; the previous list.index() calls were a
    # linear scan for every edge endpoint.
    column_of = {node: col for col, node in enumerate(graph.nodes)}
    similarities = []
    for node, col in column_of.items():
        # The source vector is the same for all successors of `node`.
        node_vec = tfidf[:, col].transpose()
        for neighbor in graph.successors(node):
            neighbor_vec = tfidf[:, column_of[neighbor]].transpose()
            similarities.append(smp.cosine_similarity(node_vec, neighbor_vec)[0, 0])
    return similarities

def sparse_manhattan(X, Y=None):
    """Pairwise L1 distances between the rows of X and the rows of Y.

    The inputs are validated with ``check_pairwise_arrays`` (``Y=None`` is handled
    by that helper), viewed as CSR matrices without forcing a copy, and handed to
    the compiled ``cython_manhattan`` kernel, which fills the output in place.

    Returns:
        numpy.ndarray: array of shape ``(X.shape[0], Y.shape[0])``.
    """
    X, Y = smp.check_pairwise_arrays(X, Y)
    x_csr, y_csr = (sp.sparse.csr_matrix(mat, copy=False) for mat in (X, Y))
    # Uninitialised on purpose: the kernel writes every entry.
    distances = np.empty(shape=(x_csr.shape[0], y_csr.shape[0]))
    csr_parts = (x_csr.data, x_csr.indices, x_csr.indptr,
                 y_csr.data, y_csr.indices, y_csr.indptr)
    cython_manhattan(*csr_parts, distances)
    return distances

def word_diffs(graph, tfidf):
    """Manhattan distance between the binarized tf-idf columns of every edge.

    ``tfidf`` is binarized, transposed so that each row is one node, and the full
    pairwise L1 distance matrix is computed with ``sparse_manhattan``. The entry
    for each (node, successor) pair is then read out; on binarized vectors this
    is the number of positions in which the two vectors differ.

    Args:
        graph: directed graph exposing ``nodes`` and ``successors(node)``.
        tfidf: matrix whose ``k``-th column belongs to the ``k``-th node of
            ``graph.nodes``.

    Returns:
        list: one distance per edge, in node order then successor order.
    """
    dists = sparse_manhattan(X=skp.binarize(tfidf).transpose())
    # Node -> row/column lookup built once instead of a list.index() scan per edge.
    position = {node: k for k, node in enumerate(graph.nodes)}
    return [dists[position[node], position[neighbor]]
            for node in position
            for neighbor in graph.successors(node)]

def sum_abs_weight_differences(graph, tfidf):
    """Sum of absolute tf-idf weight differences, sum_i |w1_i - w2_i|, per edge.

    Column ``k`` of ``tfidf`` is taken to be the weight vector of the ``k``-th
    node in ``graph.nodes`` order. For every (node, successor) pair the two
    columns are subtracted and the absolute values of the result are summed.

    The earlier implementation indexed both vectors with the *concatenation* of
    their stored indices, so every word present in both documents was counted
    twice. Subtracting the full columns counts each word exactly once and no
    longer relies on the matrix being in CSC format.

    Args:
        graph: directed graph exposing ``nodes`` and ``successors(node)``.
        tfidf: scipy sparse matrix (words x nodes).

    Returns:
        list: one value per edge, in node order then successor order.
    """
    # Node -> column lookup built once instead of a list.index() scan per edge.
    column_of = {node: col for col, node in enumerate(graph.nodes)}
    diff = []
    for node, col in column_of.items():
        node_vec = tfidf[:, col]
        for neighbor in graph.successors(node):
            delta = node_vec - tfidf[:, column_of[neighbor]]
            diff.append(abs(delta).sum())
    return diff

def sum_weight_differences(graph, tfidf):
    """Signed sum of tf-idf weight differences, sum_i (w1_i - w2_i), per edge.

    Column ``k`` of ``tfidf`` is taken to be the weight vector of the ``k``-th
    node in ``graph.nodes`` order. For every (node, successor) pair the successor
    column is subtracted from the node column and the result is summed.

    The earlier implementation indexed both vectors with the *concatenation* of
    their stored indices, so every word present in both documents contributed
    twice. Subtracting the full columns counts each word exactly once and no
    longer relies on the matrix being in CSC format.

    Args:
        graph: directed graph exposing ``nodes`` and ``successors(node)``.
        tfidf: scipy sparse matrix (words x nodes).

    Returns:
        list: one value per edge, in node order then successor order.
    """
    # Node -> column lookup built once instead of a list.index() scan per edge.
    column_of = {node: col for col, node in enumerate(graph.nodes)}
    diff = []
    for node, col in column_of.items():
        node_vec = tfidf[:, col]
        for neighbor in graph.successors(node):
            delta = node_vec - tfidf[:, column_of[neighbor]]
            diff.append(delta.sum())
    return diff

def bin_distribution(data, steps=30, scale='log'):
    """Histogram ``data`` on logarithmically or linearly spaced bins.

    Args:
        data: array-like of numbers. For ``scale='log'`` all values must be
            strictly positive.
        steps: number of bin *edges* generated between min(data) and max(data),
            i.e. the histogram has ``steps - 1`` bins.
        scale: ``'log'`` (edges from ``np.logspace``) or ``'linear'``
            (edges from ``np.linspace``).

    Returns:
        tuple: ``(hist, edges, bins)`` -- the counts and edges returned by
        ``np.histogram`` followed by the edges that were passed to it.

    Raises:
        ValueError: if ``scale`` is not ``'log'`` or ``'linear'`` (previously an
            UnboundLocalError on ``bins``), or if ``scale='log'`` and the data
            contain non-positive values.
    """
    if scale not in ('log', 'linear'):
        raise ValueError(f"scale must be 'log' or 'linear', got {scale!r}")
    lowest, highest = np.min(data), np.max(data)
    if scale == 'log':
        if lowest <= 0:
            # log10 of a non-positive value would yield -inf/nan bin edges.
            raise ValueError("log-scaled bins require strictly positive data")
        bins = np.logspace(np.log10(lowest), np.log10(highest), steps)
    else:
        bins = np.linspace(lowest, highest, num=steps)
    hist, edges = np.histogram(data, bins=bins)
    return hist, edges, bins

def plot_distribution(data):
    """Show a log-log scatter plot of the binned distribution of ``data``.

    ``data`` is binned with ``bin_distribution`` using its default arguments.
    Each marker is placed at the left edge of a bin, and its height is the
    fraction of all samples that fall into that bin (count / len(data)).

    Args:
        data: sized array-like of numbers.

    Returns:
        The plotly figure, after it has been displayed with ``fig.show()``.
    """
    # Imported locally: this function is defined in a cell that precedes the
    # notebook cell importing plotly, so relying on the global `go` made the
    # function fail with NameError when called before that cell had been run.
    import plotly.graph_objects as go

    hist, edges, bins = bin_distribution(data)
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=bins[:-1],
                             y=hist/len(data),
                             mode='markers'))
    fig.update_layout(template='plotly_white',
                      xaxis={'type': 'log',
                             'title': 'x'},
                      yaxis={'type': 'log',
                             'title': 'P(x)'})
    fig.show()
    return fig

# Priors - example

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
# Root directory under which the figure-saving cells write their PDFs.
# NOTE(review): machine-specific absolute path.
path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'
# Figures are only written to disk when this flag is True; they are always displayed.
save_fig = False

In [None]:
import os

# Topic inspected by the example cells below, and the sub-folder (under path_fig)
# that receives its figures.
topic = 'anatomy'
path_plot = '3 model ex prior'

# makedirs also creates the intermediate '<path_fig>/<path_plot>' folder when it is
# missing (os.mkdir raised FileNotFoundError in that case), and exist_ok=True
# replaces the separate exists() check.
os.makedirs(os.path.join(path_fig, path_plot, topic), exist_ok=True)

In [None]:
# Plot the distribution of the stored tf-idf weights of the selected topic
# (`.data` of the matrix kept in the graph-level 'tfidf' attribute) and
# optionally save the figure as a PDF.
fig = plot_distribution(networks[topic].graph.graph['tfidf'].data)
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'tf_idf_distribution.pdf'))

In [None]:
# Regress the per-edge Manhattan distance (binarized tf-idf) on the absolute year gap.
yd = year_diffs(networks[topic].graph)
wd = word_diffs(networks[topic].graph, networks[topic].graph.graph['tfidf'])
a, b, fit_r, p, stderr = sp.stats.linregress(np.abs(yd), wd)
fig = go.Figure()
# The x-axis shows |Δyear|, so the fitted line must span max(|yd|). Using max(yd)
# truncated the line (or produced an empty range) when the largest gaps were negative.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=np.abs(yd), y=wd,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1f} x + {b:.1f}"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r:.2f}, p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'manhattan distance'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'manhattan.pdf'))

In [None]:
# Regress the signed per-edge sum of tf-idf weight differences on the absolute year gap.
# `yd` comes from the Manhattan-distance cell above.
sum_weight_diffs = sum_weight_differences(networks[topic].graph,
                                          networks[topic].graph.graph['tfidf'])
a, b, fit_r_sum_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_weight_diffs)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.abs(yd),
                         y=sum_weight_diffs,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
# The x-axis shows |Δyear|, so the fitted line must span max(|yd|). Using max(yd)
# truncated the line (or produced an empty range) when the largest gaps were negative.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1e} x + {b:.1f}\n"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r_sum_weight:.2f}; p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'Σ Δw_i'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_diff_weights.pdf'))

The distribution of word weights does not change significantly over time.

In [None]:
import plotly.figure_factory as ff
# NOTE(review): this refit repeats the regression of the previous cell; none of its
# results are used in the rest of this cell.
a, b, fit_r_sum_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_weight_diffs)
# Mean and standard deviation (np.std default, i.e. ddof=0) of the per-edge sums.
mu_swd, std_swd = np.mean(sum_weight_diffs), np.std(sum_weight_diffs)
# Histogram/density plot of the per-edge sums with a bin width of 1 ...
fig = ff.create_distplot([sum_weight_diffs], ['edges'], bin_size=1)
x = np.linspace(min(sum_weight_diffs), max(sum_weight_diffs), 100)
# ... overlaid with the normal pdf that has the same mean and standard deviation.
fig.add_trace(go.Scatter(x=x, y=sp.stats.norm.pdf(x, mu_swd, std_swd),
                         name='normal fit'))
fig.update_layout(template='plotly_white', title=topic,
                  xaxis={'title': 'Σ Δw_i'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_diff_weights_dstr.pdf'))

In [None]:
# Regress the per-edge sum of absolute tf-idf weight differences on the absolute year gap.
# `yd` comes from the Manhattan-distance cell above.
sum_abs_weight_diffs = sum_abs_weight_differences(networks[topic].graph,
                                                  networks[topic].graph.graph['tfidf'])
a, b, fit_r_sum_abs_weight, p, stderr = sp.stats.linregress(np.abs(yd), sum_abs_weight_diffs)
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.abs(yd),
                         y=sum_abs_weight_diffs,
                         mode='markers',
                         marker={'size': 3},
                         name='edges'))
# The x-axis shows |Δyear|, so the fitted line must span max(|yd|). Using max(yd)
# truncated the line (or produced an empty range) when the largest gaps were negative.
x = np.linspace(0, np.max(np.abs(yd)), 100)
fig.add_trace(go.Scatter(x=x, y=np.multiply(a, x) + b,
                         name=f"y = {a:.1e} x + {b:.1f}\n"))
fig.update_layout(template='plotly_white',
                  title=f"{topic} (r = {fit_r_sum_abs_weight:.2f}; p = {p:.1e})",
                  xaxis={'title': 'Δyear'},
                  yaxis={'title': 'Σ |Δw_i|'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'sum_abs_diff_weights.pdf'))

In [None]:
# Cosine similarity of the tf-idf vectors at both ends of every edge of the selected
# topic, followed by a normal fit (scipy.stats.norm.fit) of those similarities.
neighbors = neighbor_similarity(networks[topic].graph, networks[topic].graph.graph['tfidf'])
mu_n, std_n = sp.stats.norm.fit(neighbors)
# Last expression of the cell: displays the fitted (mean, std) pair.
mu_n, std_n

In [None]:
import plotly.figure_factory as ff
# Histogram/density plot of the per-edge cosine similarities (bin width 0.05) ...
fig = ff.create_distplot([neighbors], ['edges'], bin_size=.05)
x = np.linspace(min(neighbors), max(neighbors), 100)
# ... overlaid with the normal pdf fitted in the previous cell (mu_n, std_n).
fig.add_trace(go.Scatter(x=x, y=sp.stats.norm.pdf(x, mu_n, std_n),
                         name='normal fit'))
fig.update_layout(template='plotly_white', title=topic,
                  xaxis={'title': 'cosine similarity'})
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_fig, path_plot, topic, 'cosine_distribution.pdf'))

In [None]:
from gensim.parsing.preprocessing import remove_stopwords
# Table of the five top-weighted words of every node whose tf-idf column is non-empty.
# NOTE(review): `stoplist` is not used by any code in this cell.
stoplist=set('for a of the and to in'.split())
nodes = []
words = []
graph = networks[topic].graph
tfidf = networks[topic].graph.graph['tfidf']
# Materialise the node list once; it used to be rebuilt on every loop iteration.
node_names = list(graph.nodes)
for i in range(tfidf.shape[1]):
    node = node_names[i]
    column = tfidf[:,i]  # slice the column once and reuse it below
    if column.data.size == 0:
        # Nodes without any stored tf-idf weight are skipped.
        continue
    top_words, idx = wiki.Model.find_top_words(column, dct, top_n=5)
    nodes += [node]
    words += [top_words]
pd.DataFrame(data={'Node': nodes, 'Top words': words})

In [None]:
# Sanity check on a toy case: draw two independent (k, 100000) arrays of integers
# uniform on 0..n-1, sum |difference| over the k rows, and compare the sample mean
# with the closed-form expectation computed on the last line.
n=10
k=4
x = np.sum(np.abs(np.random.randint(0,n,(k,100000))-np.random.randint(0,n,(k,100000))), axis=0)
# np.mean(x), k*2*np.sum(np.arange(1,n)*np.flip(np.arange(1,n))) * ((1/n)**2)
# Displayed pair: (empirical mean, k * sum over all (i, j) of (1/n)^2 * |i - j|).
np.mean(x), k * np.sum( ((1/n)**2) * np.sum(np.abs(np.array([np.arange(n)]).transpose()-np.arange(n)),
                                            axis=0) )
# k * Σ_i P(x_i) * Σ_j |x_i-x_j|

In [None]:
# Empirical mean absolute difference between two tf-idf weights of the selected topic.
tfidf = networks[topic].graph.graph['tfidf']
# rvs(n) resamples stored tf-idf weights uniformly with replacement; n is passed to
# np.random.choice as `size`, so it may be an int or a shape tuple.
# NOTE(review): no random seed is set, so `emp` varies between runs.
rvs = lambda n: tfidf.data[np.random.choice(tfidf.data.size, size=n)]
# Two independent (1, 100000) samples; summing over axis 0 leaves one |difference|
# per column, and the mean over those 100000 values is the estimate.
emp = np.mean(np.sum(np.abs(rvs((1,100000))-rvs((1,100000))), axis=0))

In [None]:
# Histogram-based ("theoretical") counterpart of `emp`: E|X - Y| for two independent
# draws from the binned tf-idf weight distribution.
h,e,_ = bin_distribution(tfidf.data, 100, 'linear')
p_x = h/len(tfidf.data)                              # probability mass of each bin
x = np.array([np.average([e[:-1],e[1:]], axis=0)])   # bin centres, shape (1, n_bins)
# E|X - Y| = sum_i sum_j p_i * p_j * |x_i - x_j|.
# The previous expression multiplied p_x into x *inside* the absolute value
# (|p_j * x_i - x_j|), which is not the expected absolute difference.
the = np.sum(np.outer(p_x, p_x) * np.abs(x.transpose() - x))
emp, the

# Run models

## Initialize

In [None]:
from IPython.display import display, HTML
import os
import dill
import datetime

In [None]:
# Simulation settings used by the "Run models" cells.
first_n_nodes = 10   # index into the sorted node years that defines the start year
n_seeds = 2          # forwarded to wiki.Model(n_seeds=...)
n_models = 3         # number of models simulated per topic


def start_condition(m):
    """Return the parent-graph nodes a model starts from.

    The cutoff is the year found at position ``first_n_nodes`` of the ascending
    list of all node years; every node of ``m.graph_parent`` whose ``'year'`` is
    less than or equal to that cutoff is returned.

    The cutoff is computed once per call. The former lambda re-sorted the whole
    year list for every node inside the comprehension.

    Args:
        m: model exposing ``graph_parent`` (a graph with ``'year'`` node attributes).

    Returns:
        list: the selected nodes, in ``m.graph_parent.nodes`` order.

    Raises:
        IndexError: if the parent graph is non-empty but has at most
            ``first_n_nodes`` dated nodes (same as before).
    """
    candidates = list(m.graph_parent.nodes)
    if not candidates:
        # Nothing to select; the former comprehension also returned [] here.
        return []
    years = sorted(nx.get_node_attributes(m.graph_parent, 'year').values())
    cutoff = years[first_n_nodes]
    return [n for n in candidates
            if m.graph_parent.nodes[n]['year'] <= cutoff]


def end_condition(m):
    """Return True once the model has as many nodes as its parent graph, or
    once the simulated year exceeds 2200."""
    return (len(m.graph.nodes) >= len(m.graph_parent.nodes)) or \
           (m.year > 2200)

In [None]:
# Directory under which each simulation session gets its own time-stamped folder.
# NOTE(review): machine-specific absolute path.
base_dir = os.path.join('/','Users','harangju','Developer','data','wiki','simulations')
# When True, every simulated model is serialised with dill in the run loop.
save_models = True
# Last expression of the cell: displays the resolved path.
base_dir

## Run models

In [None]:
# Session folder named after the current time with minute resolution (YYYYMMDD_HHMM).
now = datetime.datetime.now().strftime('%Y%m%d_%H%M')
# makedirs also creates base_dir when it is missing (os.mkdir raised FileNotFoundError
# in that case). exist_ok is deliberately left at its default so that re-running the
# cell within the same minute still fails instead of silently reusing a session folder
# whose model files would then be overwritten.
os.makedirs(os.path.join(base_dir, now))

In [None]:
# Single-topic subset of `networks`.
# NOTE(review): not referenced by any cell visible here -- the run loop below iterates
# over `networks`; presumably meant for quick test runs.
_networks = {'earth science': networks['earth science']}

In [None]:
# Accumulator for the per-topic prior statistics; the run loop appends one row per topic.
# Re-running this cell resets it to an empty frame.
stats = pd.DataFrame()

In [None]:
# For every topic: estimate the model priors from the real graph, show and record
# them, then simulate `n_models` models and (optionally) save each one with dill.
for topic, network in networks.items():
    print(topic)
    print('Analyzing priors...')
    tfidf = network.graph.graph['tfidf']
    yd = year_diffs(network.graph)
    # Linear fit of the Manhattan distance (binarized tf-idf) against |Δyear|.
    md = word_diffs(network.graph, tfidf)
    a_md, b_md, r_md, p_md, stderr = sp.stats.linregress(np.abs(yd), md)
    # Linear fit of Σ|Δw_i| against |Δyear|.
    swd = sum_abs_weight_differences(network.graph, tfidf)
    a_swd, b_swd, r_swd, p_swd, stderr = sp.stats.linregress(np.abs(yd), swd)
    # Sampler for tf-idf weights (uniform resampling with replacement) and the
    # empirical mean absolute difference between two such weights.
    rvs = lambda n: tfidf.data[np.random.choice(tfidf.data.size, size=n)]
    mu_sawd = np.mean(np.sum(np.abs(rvs((1,100000))-rvs((1,100000))), axis=0))
    # Normal fit of the cosine similarity between neighbouring nodes.
    nb = neighbor_similarity(network.graph, tfidf)
    mu_nb, std_nb = sp.stats.norm.fit(nb)
    # Mutation probabilities derived from the two regression slopes.
    p_point, p_insert, p_delete = a_swd/mu_sawd, a_md/2, a_md/2
    # NOTE(review): the row carries no topic label; rows follow the iteration
    # order of `networks`.
    new_stats = pd.DataFrame([[p_point,p_insert,p_delete,
                               a_md,b_md,r_md,p_md,
                               a_swd,b_swd,r_swd,p_swd,
                               mu_sawd,mu_nb,std_nb]],
                             columns=['p_pt', 'p_in', 'p_de',
                                      'a (man)', 'b (man)', 'r (man)', 'p (man)',
                                      'a (swd)', 'b (swd)', 'r (swd)', 'p (swd)',
                                      'mu (sawd)', 'mu (nei)', 'std (nei)'
                                     ])
    display(HTML(new_stats.to_html()))
    stats = pd.concat([stats, new_stats], ignore_index=True)
    # The start year depends only on the topic's graph, so it is computed once per
    # topic instead of being re-sorted for every model.
    year_start = sorted(nx.get_node_attributes(network.graph, 'year').values())[first_n_nodes]
    for i in range(n_models):
        print(f"Running model {i}...")
        model = wiki.Model(graph_parent=network.graph,
                           vectors_parent=tfidf,
                           year_start=year_start,
                           start_nodes=start_condition,
                           n_seeds=n_seeds,
                           dct=dct,
                           point=(1, p_point),
                           insert=(1, p_insert, list(set(tfidf.indices))),
                           delete=(1, p_delete),
                           rvs=rvs,
                           create=lambda n: np.random.normal(loc=mu_nb, scale=std_nb, size=n))
        model.evolve(until=end_condition)
        if save_models:
            # `with` closes the file handle, which the bare open() call never did.
            model_file_path = os.path.join(base_dir, now, f"model_{topic}_{i}.pickle")
            with open(model_file_path, 'wb') as model_file:
                dill.dump(model, model_file)
    print('')
# Persist the accumulated per-topic statistics next to the models.
with open(os.path.join(base_dir, now, 'stats.pickle'), 'wb') as stats_file:
    pickle.dump(stats, stats_file)

# Network stats

In [None]:
import dill

In [None]:
# Name of the session folder to analyse; it matches the '%Y%m%d_%H%M' time-stamp
# format that the run cells use for their output folder.
simulation = '20200422_1318'
# NOTE(review): same machine-specific path as the base_dir defined in the
# "Run models" section.
base_dir = os.path.join('/','Users','harangju','Developer','data','wiki','simulations')

In [None]:
# Folder that holds the saved models of the selected simulation session.
session_dir = os.path.join(base_dir, simulation)

In [None]:
# Discover the saved models of the session and group their paths by topic.
# Model files are named 'model_<topic>_<i>.pickle' by the run loop.
filenames = sorted(os.listdir(session_dir))
# Select model files by name. The former `filenames[:-1]` assumed that 'stats.pickle'
# exists and sorts last; it dropped a model when that file was missing and parsed
# stray files (e.g. '.DS_Store') as if they were models.
model_files = [filename for filename in filenames
               if filename.startswith('model_') and filename.endswith('.pickle')]
model_paths = {}
for filename in model_files:
    # The topic is the second '_'-separated field of the file name.
    model_paths.setdefault(filename.split('_')[1], []).append(
        os.path.join(session_dir, filename))
model_topics = list(model_paths)

In [None]:
# For every topic, load each saved model, annotate its graph with core-periphery and
# community structure, and print the resulting coreness and modularity values.
# NOTE(security): dill.load can execute arbitrary code; only load files you trust.
for topic in topics:
    print(topic)
    # .get(): a session that was run on a subset of topics has no entry for the
    # others; they are listed with no models instead of raising KeyError.
    for i, model_path in enumerate(model_paths.get(topic, [])):
        print(i, end=' ')
        # `with` closes the file handle, which the bare open() call never did.
        with open(model_path, 'rb') as model_file:
            model = dill.load(model_file)
        wiki.Net.assign_core_periphery(model.graph)
        wiki.Net.assign_communities(model.graph)
        print(model.graph.graph['coreness_be'], model.graph.graph['modularity'])
    print('')