# Load networks

In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import numpy as np
import pandas as pd
import networkx as nx
import scipy as sp

In [None]:
# Wikipedia topics analysed in this notebook; each name is also the file stem
# used when the saved graphs are loaded.
topics = [
    'anatomy', 'biochemistry', 'cognitive science',
    'evolutionary biology', 'genetics', 'immunology',
    'molecular biology', 'chemistry', 'biophysics',
    'energy', 'optics', 'earth science',
    'geology', 'meteorology', 'philosophy of language',
    'philosophy of law', 'philosophy of mind', 'philosophy of science',
    'economics', 'accounting', 'education',
    'linguistics', 'law', 'psychology',
    'sociology', 'electronics', 'software engineering',
    'robotics', 'calculus', 'geometry',
    'abstract algebra', 'Boolean algebra', 'commutative algebra',
    'group theory', 'linear algebra', 'number theory',
    'dynamical systems and differential equations',
]

In [None]:
# Directory prefix for the saved per-topic graph files; file names are appended
# to it by plain string concatenation, so the trailing '/' is required.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

In [None]:
# topics = ['earth science']

In [None]:
import wiki

# Build one wiki.Net per topic from '<path_saved><topic>.pickle' and the
# matching '<path_saved><topic>.barcode'; topic names are echoed as progress.
networks = dict()
for topic in topics:
    print(topic, end=' ')
    stem = path_saved + topic
    networks[topic] = wiki.Net(
        path_graph=stem + '.pickle',
        path_barcodes=stem + '.barcode',
    )

In [None]:
# Load `num_nulls` null networks per topic from the 'null-target' directory.
# Files are named '<topic>-null-<i>.pickle' / '<topic>-null-<i>.barcode'.
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-target/'
num_nulls = 10
null_targets = {}
for topic in topics:
    print(topic, end=' ')
    null_targets[topic] = []
    for i in range(num_nulls):
        stem = f"{path_null}{topic}-null-{i}"
        null_targets[topic].append(
            wiki.Net(path_graph=stem + '.pickle',
                     path_barcodes=stem + '.barcode')
        )

In [None]:
# Load `num_nulls` null networks per topic from the 'null-year' directory.
# Files are named '<topic>-null-<i>.pickle' / '<topic>-null-<i>.barcode'.
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-year/'
num_nulls = 10
null_years = {}
for topic in topics:
    print(topic, end=' ')
    null_years[topic] = []
    for i in range(num_nulls):
        stem = f"{path_null}{topic}-null-{i}"
        null_years[topic].append(
            wiki.Net(path_graph=stem + '.pickle',
                     path_barcodes=stem + '.barcode')
        )

# Network statistics

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

In [None]:
# path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'
# Switch for writing figures to disk. In this notebook it is only referenced by
# commented-out `fig.write_image(...)` snippets, so it currently has no effect.
save_fig = False

In [None]:
import pickle
import pandas as pd
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Load the precomputed statistics tables. The files are opened with context
# managers so the handles are closed as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced.
with open(path_analysis+'stats.pickle', 'rb') as stats_file:
    df = pickle.load(stats_file)
with open(path_analysis+'stats_expand.pickle', 'rb') as stats_file:
    df_expand = pickle.load(stats_file)
# Cast the label columns of both tables to plain object dtype
# (presumably they are stored as categoricals -- confirm against the writer).
for stats_frame in (df, df_expand):
    for label_col in ('topic', 'measure'):
        stats_frame[label_col] = stats_frame[label_col].astype('object')

In [None]:
# Mean of `value` for every (topic, measure) pair, reshaped to one row per
# topic and one column per measure. Only `value` is aggregated: averaging every
# remaining column raises on non-numeric columns in recent pandas versions, and
# the pivot below uses `value` alone anyway.
df_mean = (
    df_expand
    .groupby(['topic', 'measure'], as_index=False)['value']
    .mean()
    .pivot(index='topic', columns='measure', values='value')
    .reset_index()
)

In [None]:
# Scatter of the 'coreness' column against the 'coreness-null-target' column,
# one labelled marker per topic, with a dashed 1:1 reference line.
unit_range = [0, 1]
topic_points = go.Scatter(
    x=df_mean['coreness-null-target'],
    y=df_mean['coreness'],
    mode='markers+text',
    name='coreness',
    text=df_mean['topic'],
    textposition='top left',
)
identity_line = go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line={'dash': 'dash'},
    name='1:1',
)
fig = go.Figure(data=[topic_points, identity_line])
fig.update_layout(
    template='plotly_white',
    title='coreness',
    width=900, height=900,
    xaxis={'title': 'null', 'range': unit_range},
    # lock the y axis to the x axis so the 1:1 line is drawn at 45 degrees
    yaxis={'title': 'real', 'range': unit_range,
           'scaleanchor': 'x', 'scaleratio': 1},
)
fig.show()
# if save_fig:
#     fig.write_image(f"{path_fig}/{path_plot}/coreness.pdf")

In [None]:
# Scatter of the 'modularity' column against the 'modularity-null-target'
# column, one labelled marker per topic, with a dashed 1:1 reference line.
unit_range = [0, 1]
topic_points = go.Scatter(
    x=df_mean['modularity-null-target'],
    y=df_mean['modularity'],
    mode='markers+text',
    name='modularity',
    text=df_mean['topic'],
    textposition='bottom right',
)
identity_line = go.Scatter(
    x=[0, 1], y=[0, 1],
    mode='lines',
    line={'dash': 'dash'},
    name='1:1',
)
fig = go.Figure(data=[topic_points, identity_line])
fig.update_layout(
    template='plotly_white',
    title='modularity',
    width=900, height=900,
    xaxis={'title': 'null', 'range': unit_range},
    # lock the y axis to the x axis so the 1:1 line is drawn at 45 degrees
    yaxis={'title': 'real', 'range': unit_range,
           'scaleanchor': 'x', 'scaleratio': 1},
)
fig.show()
# if save_fig:
#     fig.write_image(f"{path_fig}/{path_plot}/modularity.pdf")

In [None]:
# path_big = os.path.join('/','Users','harangju','Box Sync','Research','my papers','wikipedia',
#                         'results','0 graphs','gexf','big_graph.gexf')
# big_net = wiki.Net(path_graph=path_big)
# big_net

In [None]:
# wiki.Net.assign_core_periphery(big_net.graph)

# Cavity statistics

Are connections denser in the harder sciences?

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
# path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'

In [None]:
# Stack every barcode table into one frame, tagging each row with its topic,
# its source ('real', 'targets' or 'years') and the index of the null network
# it came from (always 0 for the real networks).
barcode_sources = [
    ('real', {name: [net] for name, net in networks.items()}),
    ('targets', null_targets),
    ('years', null_years),
]
barcodes = pd.concat(
    [net.barcodes.assign(topic=name, type=kind, null=idx)
     for kind, groups in barcode_sources
     for name, nets in groups.items()
     for idx, net in enumerate(nets)],
    ignore_index=True, sort=False)

In [None]:
# Keep the DataFrame previews in the following cells short (pandas truncates
# any display longer than this many rows).
pd.options.display.max_rows = 5

In [None]:
# Largest `null` index per (type, topic, dim) plus one, i.e. a network count
# under the assumption that null indices start at 0.
null_count = (
    barcodes
    .groupby(['type', 'topic', 'dim'], as_index=False)['null']
    .max()
    .assign(null=lambda counts: counts['null'] + 1)
)
null_count

In [None]:
# Number of barcode rows per (type, topic, dim), ordered by `type`.
rows_per_dim = (
    barcodes
    .assign(count=1)
    .groupby(['type', 'topic', 'dim'], as_index=False)['count']
    .sum()
)
dim_count = rows_per_dim.sort_values('type', axis=0, ascending=True)
dim_count

In [None]:
# Join the per-dimension row counts with the network counts and normalise the
# former by the latter, giving a count per network in `dim_count_norm`.
group_keys = ['type', 'topic', 'dim']
dims = (
    dim_count
    .merge(null_count, how='left', on=group_keys)
    .sort_values(by=group_keys)
    .reset_index(drop=True)
    .rename(columns={'count': 'dim_count', 'null': 'null_count'})
    .assign(dim_count_norm=lambda merged: merged['dim_count'] / merged['null_count'])
)
dims

In [None]:
# Box plot of the normalised counts per dimension for the real networks, plus
# one per-topic line that stays hidden until it is enabled from the legend.
real_dims = dims[dims.type == 'real']
fig = px.box(real_dims, x='dim', y='dim_count_norm')
for topic in pd.unique(dims['topic']):
    data = real_dims[real_dims['topic'] == topic].sort_values(by='dim')
    topic_line = go.Scatter(
        x=data['dim'],
        y=data['dim_count_norm'],
        name=topic,
        mode='markers+lines',
        visible='legendonly',
    )
    fig.add_trace(topic_line)
fig.update_layout(
    template='plotly_white',
    title_text='Dimensionality',
    yaxis={'range': [0, 10000]},
)
fig.update_traces(marker={'size': 3}, line={'width': 1})
fig.show()
# fig.write_image(os.path.join(path_fig, path_plot, 'dimensionality_real.pdf'))

In [None]:
pd.options.display.max_rows = 10
# For every (type, topic), find the row of `dims` with the largest normalised
# count and record its `dim` as the modal dimension.
# `idxmax` returns index *labels*, so the lookup uses label-based `.loc`; the
# previous positional `.iloc` was only right while `dims` had a 0..n-1 index.
mode_labels = dims.groupby(['type', 'topic'])['dim_count_norm'].idxmax()
dim_stats = (
    mode_labels
    .rename('dim_mode_idx')
    .reset_index()
    .assign(mode=lambda stats: dims.loc[stats['dim_mode_idx'], 'dim'].to_numpy())
    .drop(columns='dim_mode_idx')
)
dim_stats

In [None]:
# Count-weighted mean dimension for every (type, topic):
#     mean = sum(dim * dim_count_norm) / sum(dim_count_norm)
# The grouping must include `type`. Grouping by `topic` alone pooled the real
# and null-model counts into one figure and, because `dim_stats` has one row
# per (type, topic), left every row beyond the first block as NaN.
# Columns are selected with a list; tuple selection on a groupby is no longer
# accepted by current pandas.
avg = (
    dims
    .assign(dimXcount=dims['dim'] * dims['dim_count_norm'])
    .groupby(['type', 'topic'], as_index=False)[['dim_count_norm', 'dimXcount']]
    .sum()
    .rename(columns={'dim_count_norm': 'dim_count_norm_sum'})
)
avg['mean'] = avg['dimXcount'] / avg['dim_count_norm_sum']
# Attach by key rather than by row position; dropping an existing `mean`
# column first keeps the cell safe to re-run.
dim_stats = (
    dim_stats
    .drop(columns='mean', errors='ignore')
    .merge(avg[['type', 'topic', 'mean']], on=['type', 'topic'], how='left')
)

In [None]:
pd.options.display.max_rows = 111
# Rows for the real networks only, ordered by mean dimension and renumbered.
is_real = dim_stats['type'] == 'real'
(
    dim_stats.loc[is_real]
    .sort_values(['mean'])
    .drop(columns='type')
    .reset_index(drop=True)
)

Can we say that the dimensionality of cavities reveals the complexity of the information?

# Big network

In [None]:
# Load the combined network stored as 'big_network.pickle' under `path_saved`
# (no barcode file is passed for this one).
big_net = wiki.Net(path_graph=os.path.join(path_saved, 'big_network.pickle'))

In [None]:
# Sparse adjacency matrix of the combined graph.
# NOTE(review): the sparse container type returned by networkx depends on its
# version -- confirm it is the type that gramians() tests for.
A = nx.adjacency_matrix(big_net.graph)
A

In [None]:
# Spot check: data stored on the edge 'Commercial law' -> 'Unfair competition'
# (raises KeyError if that edge does not exist).
big_net.graph['Commercial law']['Unfair competition']

In [None]:
# Spot check: adjacency entry (neighbours) of 'Unfair competition'.
big_net.graph['Unfair competition']

In [None]:
# Positions of the two spot-check pages in the graph's node ordering.
node_order = list(big_net.graph.nodes)
tuple(node_order.index(title) for title in ('Commercial law', 'Unfair competition'))

In [None]:
# Adjacency entries between the two spot-check pages, in both directions.
# The earlier version indexed a name `a` that is never defined in this notebook
# and used hard-coded positions 0 and 1; the entries are now read from `A` at
# the positions looked up by page title.
# NOTE(review): assumes `a` was meant to be the adjacency matrix `A` -- confirm.
node_order = list(big_net.graph.nodes)
idx_commercial = node_order.index('Commercial law')
idx_unfair = node_order.index('Unfair competition')
A[idx_commercial, idx_unfair], A[idx_unfair, idx_commercial]

# Communicability

In [None]:
def communicability(A):
    '''
    Communicability matrix of a network: the matrix exponential of ``A``.

    Entry ``[i,j]`` equals ``sum_k (A^k)[i,j] / k!``, i.e. walks of every
    length between ``i`` and ``j`` weighted by the inverse factorial of their
    length.
    NOTE(review): this is the standard exponential form; confirm it is the
    variant intended for this analysis (e.g. no degree normalisation).

    Parameters
    ----------
    A: scipy.sparse matrix or array-like (square)
        adjacency matrix

    Returns
    -------
    G: same kind as the input
        sparse in, sparse out; array-like in, numpy.ndarray out.
        The exponential is generally dense even for a sparse ``A``.
    '''
    # local imports: make sure the scipy submodules used here are loaded
    import scipy.linalg
    import scipy.sparse
    import scipy.sparse.linalg

    if scipy.sparse.issparse(A):
        return scipy.sparse.linalg.expm(A.tocsc().astype(np.float64))
    return scipy.linalg.expm(np.asarray(A, dtype=np.float64))

# Controllability

Is there a spectrum of controllability in nodes & topics (as summarized over nodes) from "pure" to "applied" fields?

Make sure to check outdegree.

Notation: `a[i,j]` means that `i` points to `j` and that page `i` is linked to from page `j` in Wikipedia

In [None]:
import scipy as sp
import control

In [None]:
def gramians(A, M):
    '''
    Finite-horizon controllability and observability Gramians.

    The input is scaled to ``Anorm = A / (1 + rho)``, where ``rho`` is the
    largest eigenvalue magnitude of ``A``, and the Gramians are the sums

        CG = sum_{m=0..M} Anorm^m (Anorm^m)^T
        OG = sum_{m=0..M} (Anorm^m)^T Anorm^m

    Parameters
    ----------
    A: scipy.sparse.csc_matrix or csr_matrix (square)
        A[i,j] should have j->i.
        A csr_matrix is transposed on entry, which turns it into a csc_matrix.
        NOTE(review): the transpose flips the edge direction as well as the
        storage format; presumably this converts a CSR i->j adjacency to the
        j->i convention -- confirm against the caller.
    M: int
        largest matrix power included in the sums (M <= 0 gives the identity)

    Returns
    -------
    CG, OG: scipy.sparse.csc_matrix
        controllability & observability Gramians
    '''
    # local import: make sure the `sp.sparse.linalg` submodule is loaded
    import scipy.sparse.linalg

    if isinstance(A, sp.sparse.csr_matrix):
        A = A.transpose()
    # ARPACK only accepts floating-point matrices
    A = A.astype(np.float64)
    n = A.shape[0]

    # Spectral radius of the *argument* (the previous code read an undefined
    # global `a`). The magnitude of the largest eigenvalue is used because
    # `eigs` returns complex values in no guaranteed order, so `val[0]` was
    # neither real nor necessarily the largest.
    num_eigs = 6
    if n - 1 <= num_eigs:
        # `eigs` requires k < n - 1; small matrices use the dense solver
        eigenvalues = np.linalg.eigvals(A.toarray())
    else:
        eigenvalues = sp.sparse.linalg.eigs(A, k=num_eigs, return_eigenvectors=False)
    radius = float(np.abs(eigenvalues).max()) if eigenvalues.size else 0.0

    Anorm = (A / (1.0 + radius)).tocsc()
    identity = sp.sparse.identity(n, dtype=np.float64, format='csc')

    # Accumulate the sums while carrying a single running power Anorm^m;
    # (Anorm^T)^m is its transpose, so no list of powers has to be stored.
    # `@` is used so the product is a matrix product for every sparse type.
    CG = identity.copy()
    OG = identity.copy()
    power = identity
    for m in range(1, M + 1):
        print('G ' + str(m))
        power = (power @ Anorm).tocsc()
        power_T = power.transpose().tocsc()
        CG = CG + power @ power_T
        OG = OG + power_T @ power
    return CG.tocsc(), OG.tocsc()

In [None]:
# One-step Gramians; report how many diagonal entries of CG differ from 1 and
# at which positions.
CG, OG = gramians(A, 1)
cg_diag = CG.diagonal()
non_unit = np.nonzero(cg_diag - 1)[0]
len(cg_diag[non_unit]), non_unit

In [None]:
# Diagonals of the controllability (CG) and observability (OG) Gramians for
# every horizon m = 1..M, collected as one column per (Gramian, m) pair.
M = 3
nodes = list(big_net.graph.nodes)
gram_columns = {'node': nodes}
for m in range(1, M + 1):
    print(f"m={m}")
    CG, OG = gramians(A, m)
    gram_columns[f"CG_{m}"] = CG.diagonal()
    gram_columns[f"OG_{m}"] = OG.diagonal()
grams = pd.DataFrame(gram_columns)

In [None]:
# Drop the references to the last pair of Gramian matrices so their memory can
# be reclaimed; the notebook only uses their diagonals from here on.
del CG, OG

In [None]:
# Preview the table of Gramian diagonals.
grams

In [None]:
# Index the table by node name. Guarded so the cell can be re-run: once 'node'
# has become the index, a second set_index('node') would raise KeyError.
if 'node' in grams.columns:
    grams = grams.set_index('node')

In [None]:
import pickle
# Directory in which the Gramian table is cached as 'grams.pickle'.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_save = os.path.join('/','Users','harangju','Developer','data','wiki','analysis')

In [None]:
# Cache the Gramian table. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open(os.path.join(path_save, 'grams.pickle'), 'wb') as grams_file:
    pickle.dump(grams, grams_file)

In [None]:
# Reload the cached Gramian table; the handle is closed once loading is done.
# NOTE: pickle.load can execute arbitrary code -- only load files you produced.
with open(os.path.join(path_save, 'grams.pickle'), 'rb') as grams_file:
    grams = pickle.load(grams_file)

In [None]:
# Allow up to 100 rows before pandas truncates the display, then preview the table.
pd.options.display.max_rows = 100
grams

In [None]:
# pd.options.display.max_rows = 100
# The `num_rows` index labels with the largest OG value, one column per horizon.
num_rows = 20
og_columns = ['OG_1', 'OG_2', 'OG_3']
pd.DataFrame({
    column: grams.sort_values(column, ascending=False).index.values[:num_rows]
    for column in og_columns
})

In [None]:
# pd.options.display.max_rows = 100
# The `num_rows` index labels with the largest CG value, one column per horizon.
num_rows = 20
cg_columns = ['CG_1', 'CG_2', 'CG_3']
pd.DataFrame({
    column: grams.sort_values(column, ascending=False).index.values[:num_rows]
    for column in cg_columns
})

In [None]:
# Mean of every Gramian column over the nodes of each topic's network, as one
# row per topic (index named 'topic').
# Each topic's rows are selected once and averaged column-wise, replacing the
# per-node, per-column `.loc` lookups and the row-by-row `pd.concat`, both of
# which scale badly. `skipna=False` keeps NaN propagation identical to the
# `np.mean` used before.
grams_topics = pd.DataFrame({
    topic_name: grams.loc[list(networks[topic_name].graph.nodes)].mean(skipna=False)
    for topic_name in topics
}).T.rename_axis('topic')

In [None]:
# Topics ordered by their mean OG_1 value, largest first.
grams_topics.sort_values('OG_1', ascending=False)

# Flow analysis