# Load

## Import

In [None]:
%reload_ext autoreload
%autoreload 2
%reload_ext cython
%reload_ext line_profiler
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))
import wiki
import dill
import scipy as sp
import numpy as np
import pandas as pd
import networkx as nx

In [None]:
from ipywidgets import interact, widgets, Layout
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff

## Networks

In [None]:
# Wikipedia subject areas analysed throughout this notebook.
# The order matters: several later cells iterate over this list.
topics = [
    'anatomy',
    'biochemistry',
    'cognitive science',
    'evolutionary biology',
    'genetics',
    'immunology',
    'molecular biology',
    'chemistry',
    'biophysics',
    'energy',
    'optics',
    'earth science',
    'geology',
    'meteorology',
    'philosophy of language',
    'philosophy of law',
    'philosophy of mind',
    'philosophy of science',
    'economics',
    'accounting',
    'education',
    'linguistics',
    'law',
    'psychology',
    'sociology',
    'electronics',
    'software engineering',
    'robotics',
    'calculus',
    'geometry',
    'abstract algebra',
    'Boolean algebra',
    'commutative algebra',
    'group theory',
    'linear algebra',
    'number theory',
    'dynamical systems and differential equations',
]

In [None]:
import wiki

# Folder containing "<topic>.pickle" (graph) and "<topic>.barcode" files.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated'

# Load one wiki.Net per topic, keyed by topic name.
networks = {}
for topic in topics:
    print(topic, end=' ')  # progress indicator on a single line
    graph_file = os.path.join(path_saved, f"{topic}.pickle")
    barcode_file = os.path.join(path_saved, f"{topic}.barcode")
    networks[topic] = wiki.Net(path_graph=graph_file, path_barcodes=barcode_file)

## Models

| Run |       ID      | Notes |
|:---:|:--------------|:------|
|  1  | 20200422_1318 |  |
|  2  | 20200520_2057 |  |
|  3  | 20200708_1221 | 10 runs per subject |

In [None]:
# ID of the simulation session analysed below.
# Earlier sessions that this cell used to select: '20200422_1318', '20200520_2057'.
# NOTE(review): the ID below is not listed in the runs table above
# (run 3 is recorded there as 20200708_1221) -- confirm which one is correct.
simulation = '20200820_1919'

In [None]:
# Root folder of all simulation sessions, then the folder of the selected one.
_base_parts = ['Users', 'harangju', 'Developer', 'data', 'wiki', 'simulations']
base_dir = os.path.join('/', *_base_parts)
session_dir = os.path.join(base_dir, simulation)

In [None]:
# Every entry of the session directory, sorted by name; preview the first three.
filenames = sorted(os.listdir(session_dir))
filenames[:3]

In [None]:
# Preview the last three entries of the sorted listing.
filenames[-3:]

In [None]:
# Topics that have at least one model file, i.e. a file whose first
# underscore-separated field is 'model' and whose second field is the topic.
# Sorted so that the order (and therefore the iteration order of everything
# derived from it) is the same on every kernel restart; a bare set of strings
# is ordered by randomised hashes.
model_topics = sorted({
    filename.split('_')[1]
    for filename in filenames
    if filename.split('_')[0] == 'model'
})
model_topics[:3]

In [None]:
# Map each topic to the paths of its model files (models are loaded lazily
# later, one at a time, instead of being held in memory all at once).
# NOTE(review): the last sorted filename is deliberately skipped here
# (`[:-1]`); if that entry is itself a model file it is silently left out --
# confirm this is intended.
candidate_files = filenames[:-1]
model_paths = {
    topic: [
        os.path.join(session_dir, name)
        for name in candidate_files
        if name.split('_')[0] == 'model' and name.split('_')[1] == topic
    ]
    for topic in model_topics
}

In [None]:
# Show the model file paths of the first topic only, as a sanity check.
{topic: model_paths[topic] for topic in model_topics[:1]}

# Basic network statistics

In [None]:
import bct
import pickle
from networkx.algorithms.cluster import clustering
from networkx.algorithms import betweenness_centrality
from networkx.convert_matrix import to_numpy_array

## Model

In [None]:
# Network statistics computed per graph, keyed by measure name.
# Node-level measures return one value per node; 'modularity' and 'coreness'
# are read from attributes already stored on the graph object.
measures = dict(
    indegree=lambda g: [deg for _node, deg in g.in_degree],
    outdegree=lambda g: [deg for _node, deg in g.out_degree],
    clustering=lambda g: [*clustering(g).values()],
    centrality=lambda g: [*betweenness_centrality(g).values()],
    modularity=lambda g: g.graph['modularity'],
    coreness=lambda g: g.graph['coreness_be'],
)

In [None]:
# Compute every measure for every saved model of every topic.
# Rows are collected in a list and turned into a DataFrame once at the end;
# concatenating inside the loop copies the whole frame on every iteration.
model_records = []
for topic, paths in model_paths.items():
    print(topic)
    for i, model_path in enumerate(paths):
        print(i, end=' ')
        # Close the file as soon as the model is unpickled.
        with open(model_path, 'rb') as f:
            model = dill.load(f)
        model_records.extend(
            (topic, i, measure, func(model.graph))
            for measure, func in measures.items()
        )
    print('')
# One row per (topic, model index, measure); 'value' holds a list for
# node-level measures and a scalar for graph-level ones.
df_model = pd.DataFrame(model_records, columns=['topic','model','measure','value'])

In [None]:
# Display the per-model statistics table.
df_model

## Save

In [None]:
# Persist the model statistics, one file per simulation session.
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# The context manager guarantees the pickle is flushed and closed.
with open(os.path.join(path_analysis, f"stats_model_{simulation}.pickle"), 'wb') as f:
    pickle.dump(df_model, f)

## Load

In [None]:
# Imports repeated here so the "Load" section can be started without
# re-running the computation cells above.
import pickle
import pandas as pd
# Folder holding the pickled analysis tables (note the trailing slash).
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'

In [None]:
# Load the pickled statistics table 'stats.pickle'.
# os.path.join works whether or not path_analysis ends with a slash (it is
# redefined without one further down), and the context manager closes the file.
with open(os.path.join(path_analysis, 'stats.pickle'), 'rb') as f:
    df = pickle.load(f)

In [None]:
# Load the expanded statistics table (one row per individual value).
with open(os.path.join(path_analysis, 'stats_expand.pickle'), 'rb') as f:
    df_expand = pickle.load(f)
# Cast the key columns to plain object dtype before grouping.
df.topic = df.topic.astype('object')
df.measure = df.measure.astype('object')
df_expand.topic = df_expand.topic.astype('object')
df_expand.measure = df_expand.measure.astype('object')
# Mean of each measure per topic, reshaped to one row per topic and one
# column per measure. Only 'value' is averaged: it is the only column the
# pivot uses, and this keeps non-numeric columns out of the mean.
df_mean = df_expand\
    .groupby(['topic', 'measure'], as_index=False)['value']\
    .mean()\
    .pivot(index='topic', columns='measure', values='value')\
    .reset_index()

In [None]:
# Reload the model statistics saved for the selected simulation session.
with open(os.path.join(path_analysis, f"stats_model_{simulation}.pickle"), 'rb') as f:
    df_model = pickle.load(f)

In [None]:
# Expand the list-valued 'value' column into one row per individual value.
# The run index ('model') must be removed from the frame that is merged back
# in: otherwise melt() treats it as just another value column and the run
# indices (0, 1, 2, ...) are averaged together with the real measurements.
df_model_expand = (
    df_model['value']
    .apply(pd.Series)                       # one column per list element
    .merge(df_model.drop('model', axis=1),  # bring back topic / measure only
           left_index=True, right_index=True)
    .drop('value', axis=1)
    .melt(id_vars=['topic','measure'])
    .drop('variable', axis=1)
    .dropna()                               # padding from shorter lists
)
# Mean of each measure per topic: one row per topic, one column per measure.
df_model_mean = (
    df_model_expand
    .groupby(['topic', 'measure'], as_index=False)['value']
    .mean()
    .pivot(index='topic', columns='measure', values='value')
)

## Plot

In [None]:
import plotly.subplots as sb
import plotly.graph_objects as go
import plotly.express as px
# Build the figure showing plotly's qualitative colour palettes; displaying
# it is disabled below.
fig = px.colors.qualitative.swatches()
# fig.show()

In [None]:
# Root folder under which all figures of this notebook are written.
path_result = os.path.join(
    '/','Users','harangju','Box Sync','Research','my papers','wikipedia','results'
)

### Growth

In [None]:
# NOTE(review): first_n_nodes is not referenced anywhere in the visible part
# of this notebook -- confirm whether it is still needed.
first_n_nodes = 10
# x position of the dashed "model start" line drawn in the growth plots.
start_date = 0

In [None]:
# Sub-folder of path_result that receives the growth figures.
path_plot = '3 model growth'

In [None]:
# Create the figure folder; makedirs also creates missing parents and
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot), exist_ok=True)

In [None]:
# Create the per-simulation figure folder (and any missing parent);
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot, simulation), exist_ok=True)

In [None]:
# When True, the plotting cells below also write each figure to disk.
save_fig = True

In [None]:
# For each topic: degree histogram (left) and cumulative node count over
# years (right), empirical network versus every simulated model.
# Model traces cycle through the palette after its first colour (reserved for
# the empirical network). Indexing with 1+i directly runs off the end of the
# palette once a topic has more runs than there are remaining colours.
model_colors = px.colors.qualitative.Plotly[1:]
for topic in model_paths.keys():
    fig = sb.make_subplots(1, 2)
    network = networks[topic]
    # Height of the "model start" line: largest node count seen so far.
    max_nodes = len(network.graph.nodes)
    fig.add_trace(
        go.Histogram(
            x=[d for _,d in network.graph.degree], nbinsx=30, name='empirical'
        ),
        row=1, col=1
    )
    # A histogram over per-node degrees counts nodes, not edges.
    fig.update_yaxes(title_text='number of nodes', row=1, col=1)
    fig.update_xaxes(title_text='degree', row=1, col=1)
    fig.add_trace(
        go.Scatter(
            x=sorted([network.graph.nodes[n]['year'] for n in network.graph.nodes]),
            y=list(range(1,len(network.graph.nodes)+1)),
            mode='lines', name='empirical', showlegend=False,
            line={'color': px.colors.qualitative.Plotly[0]}
        ),
        row=1, col=2
    )
    fig.update_yaxes(title_text='number of nodes', row=1, col=2)
    fig.update_xaxes(title_text='year', row=1, col=2)
    fig.update_layout(title=topic, template='plotly_white')
    for i, model_path in enumerate(model_paths[topic]):
        # Close each model file right after unpickling.
        with open(model_path, 'rb') as f:
            model = dill.load(f)
        color = model_colors[i % len(model_colors)]
        max_nodes = max(max_nodes, len(model.graph.nodes))
        fig.add_trace(
            go.Histogram(
                x=[d for _,d in model.graph.degree], nbinsx=30,
                name=f"model {i}", marker_color=color
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Scatter(
                x=sorted([model.graph.nodes[n]['year'] for n in model.graph.nodes]),
                y=list(range(1,len(model.graph.nodes)+1)),
                mode='lines', name=f"model {i}", showlegend=False,
                line={'color': color}
            ),
            row=1, col=2
        )
    # Vertical dashed line at start_date. Its height uses the maximum over
    # the empirical network and all models of this topic, so it neither
    # depends on which model was loaded last nor breaks when a topic has no
    # model at all.
    fig.add_trace(
        go.Scatter(
            x=[start_date, start_date],
            y=[0, max_nodes],
            mode='lines',
            name='model start',
            line={'color': 'black', 'dash': 'dash'}
        ),
        row=1, col=2
    )
    fig.show()
    if save_fig:
        fig.write_image(os.path.join(path_result, path_plot, simulation, topic+'.pdf'))

### Static

In [None]:
# Sub-folder of path_result that receives the static-measure figures.
path_plot = '3 model static'
# When True, the plotting cell below also writes each figure to disk.
save_fig = True

In [None]:
# Create the figure folder; makedirs also creates missing parents and
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot), exist_ok=True)

In [None]:
# Create the per-simulation figure folder (and any missing parent);
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot, simulation), exist_ok=True)

In [None]:
# Put both tables in the same topic order so they can be plotted against
# each other row by row.
df_mean = df_mean.sort_values('topic', ascending=True, ignore_index=True)
# df_model_mean may carry 'topic' as its index rather than as a column.
# Sorting with ignore_index=True would then throw the topic labels away (and
# a second run of this cell would fail), so move it into a column first.
if 'topic' not in df_model_mean.columns:
    df_model_mean = df_model_mean.reset_index()
df_model_mean = df_model_mean.sort_values('topic', ascending=True, ignore_index=True)

In [None]:
# Axis settings per measure for the real-vs-model scatter plots:
# measure -> ([axis min, axis max], tick spacing).
_axis_settings = {
    'clustering': ([0, 0.3], 0.1),
    'centrality': ([0, 0.04], 0.01),
    'indegree':   ([0, 10], 1),
    'outdegree':  ([0, 10], 1),
    'coreness':   ([0, 2], .5),
    'modularity': ([0, 1], .5),
}
ranges = {name: list(limits) for name, (limits, _step) in _axis_settings.items()}
dticks = {name: step for name, (_limits, step) in _axis_settings.items()}

In [None]:
# One scatter plot per measure: empirical mean (x) against model mean (y),
# one point per topic, with a dashed identity line for reference.
# The two tables are paired row by row, so they must be in the same order.
measure_names = ['clustering','centrality','indegree','outdegree','coreness','modularity']
for measure in measure_names:
    axis_range = ranges[measure]
    axis_dtick = dticks[measure]
    points = go.Scatter(
        x=df_mean[measure],
        y=df_model_mean[measure],
        mode='markers',
        marker=dict(color='#2A3F5F'),
        hovertext=df_mean.topic,
        showlegend=False,
    )
    identity_line = go.Scatter(
        x=axis_range,
        y=axis_range,
        mode='lines',
        line=dict(dash='dash', color='#2A3F5F'),
        showlegend=False,
    )
    fig = go.Figure()
    fig.add_trace(points)
    fig.add_trace(identity_line)
    fig.update_layout(
        template='plotly_white',
        width=500, height=500,
        title=measure,
        xaxis=dict(title='real', range=axis_range, dtick=axis_dtick),
        yaxis=dict(title='model', range=axis_range, dtick=axis_dtick),
    )
    fig.show()
    if save_fig:
        out_file = os.path.join(path_result, path_plot, simulation, f"summary_{measure}.pdf")
        fig.write_image(out_file)

# Persistent homology

## Real networks

In [None]:
# Stack the barcodes of all empirical networks into one table, tagging each
# row with its topic, type 'real' and null index 0.
real_frames = [
    network.barcodes.assign(topic=topic).assign(type='real').assign(null=0)
    for topic, network in networks.items()
]
barcodes = pd.concat(real_frames, ignore_index=True, sort=False)

## Model

In [None]:
# Stack the barcodes of every saved model into one table, tagging each row
# with its topic, type 'model' and the model's index within the topic.
# Frames are collected first and concatenated once; concatenating inside the
# loop copies the growing table on every iteration.
model_frames = []
for topic, paths in model_paths.items():
    print(topic)
    for i, model_path in enumerate(paths):
        print(i, end=' ')
        # Close the file as soon as the model is unpickled.
        with open(model_path, 'rb') as f:
            model = dill.load(f)
        model_frames.append(
            model.barcodes.assign(topic=topic, type='model', model=i)
        )
    print('')
# pd.concat rejects an empty list, so fall back to an empty frame.
barcodes_models = (pd.concat(model_frames, ignore_index=True)
                   if model_frames else pd.DataFrame())

## Save

In [None]:
# Persist the model barcodes, one file per simulation session.
path_analysis = os.path.join('/','Users','harangju','Developer','data','wiki','analysis')
# The context manager guarantees the pickle is flushed and closed.
with open(os.path.join(path_analysis, f"barcodes_models_{simulation}.pickle"), 'wb') as f:
    pickle.dump(barcodes_models, f)

## Load

In [None]:
import pickle
# Folder holding the pickled analysis tables (built without a trailing slash).
path_analysis = os.path.join('/','Users','harangju','Developer','data','wiki','analysis')

In [None]:
# Load the pickled barcodes table 'barcodes.pickle'; the file is closed on exit.
with open(os.path.join(path_analysis, 'barcodes.pickle'), 'rb') as f:
    barcodes = pickle.load(f)

In [None]:
# Load the model barcodes of the selected simulation session. The save cell
# writes "barcodes_models_<simulation>.pickle", so the same name is read here;
# a bare 'barcodes_models.pickle' would silently pick up some other run.
with open(os.path.join(path_analysis, f"barcodes_models_{simulation}.pickle"), 'rb') as f:
    barcodes_models = pickle.load(f)

## Compute

In [None]:
# Discard bars with zero lifetime.
has_lifetime = barcodes_models.lifetime != 0
barcodes_models = barcodes_models[has_lifetime]

## Plot

In [None]:
# When True, the plotting cells below also write each figure to disk.
save_fig = True
# Root folder under which all figures of this notebook are written.
path_result = os.path.join(
    '/','Users','harangju','Box Sync','Research','my papers','wikipedia','results'
)

### Finite lifetimes

In [None]:
# `import scipy` alone does not guarantee that the scipy.stats submodule is
# loaded, so import it explicitly.
from scipy import stats

# Per topic, two-sample t-test between the finite, non-zero lifetimes of the
# empirical network and those of its models.
# Only rows of type 'real' are used on the empirical side, consistent with
# the other cells that compare `barcodes` against the models.
finite_real = (
    (barcodes.type=='real') &
    (barcodes.lifetime!=np.inf) &
    (barcodes.lifetime!=0)
)
finite_model = (
    (barcodes_models.lifetime!=np.inf) &
    (barcodes_models.lifetime!=0)
)
ttest_rows = []
for topic in topics:
    t_models, p_models = stats.ttest_ind(
        barcodes.loc[finite_real & (barcodes.topic==topic), 'lifetime'].values,
        barcodes_models.loc[
            finite_model & (barcodes_models.topic==topic), 'lifetime'].values,
    )
    ttest_rows.append([t_models, p_models, topic])
# Built once from the collected rows. The trailing 'topic' column identifies
# each row; the first two columns keep their original names and positions.
lifetime = pd.DataFrame(ttest_rows, columns=['t (targets)','p (targets)','topic'])

In [None]:
# Mean finite, non-zero lifetime per topic and type.
# numeric_only=True keeps the behaviour of older pandas, which silently left
# non-numeric columns out of the mean; pandas >= 2.0 raises on them instead.
barcodes_mean = barcodes[
    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)]\
    .groupby(['topic', 'type'], as_index=False)\
    .mean(numeric_only=True)\
    .drop(['dim','birth','death','null'], axis=1)\
    .sort_values('topic')
barcodes_mean

In [None]:
# Mean finite, non-zero lifetime per topic, pooled over all models.
# numeric_only=True is required on pandas >= 2.0: the string column 'type'
# is not a grouping key here and would otherwise make mean() raise. Older
# pandas dropped such columns silently, so the result is unchanged there.
barcodes_models_mean = barcodes_models[
    (barcodes_models.lifetime!=np.inf) & (barcodes_models.lifetime!=0)]\
    .groupby(['topic'], as_index=False)\
    .mean(numeric_only=True)\
    .drop(['dim','birth','death','model'], axis=1)\
    .sort_values('topic')
barcodes_models_mean

In [None]:
# Sub-folder of path_result that receives the lifetime figures.
path_plot = '3 model lifetimes'

In [None]:
# Create the figure folder; makedirs also creates missing parents and
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot), exist_ok=True)

In [None]:
# Create the per-simulation figure folder (and any missing parent);
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot, simulation), exist_ok=True)

In [None]:
# Scatter of mean finite lifetime per topic: models (x) against the empirical
# network (y), with a dashed 1:1 reference line.
# Pair the two tables by topic instead of by row position, so that a topic
# missing from either table cannot shift every other point onto the wrong
# partner.
lifetimes_paired = pd.merge(
    barcodes_models_mean[['topic', 'lifetime']],
    barcodes_mean.loc[barcodes_mean.type=='real', ['topic', 'lifetime']],
    on='topic', suffixes=('_model', '_real')
)
fig = go.Figure()
max_lifetime = max(np.max(barcodes_mean.lifetime),
                   np.max(barcodes_models_mean.lifetime)) + 10
fig.add_trace(
    go.Scatter(
        x=[0,max_lifetime],
        y=[0,max_lifetime],
        mode='lines',
        line=dict(dash='dash'),
        name='1:1'
    )
)
fig.add_trace(
    go.Scatter(
        x=lifetimes_paired.lifetime_model,
        y=lifetimes_paired.lifetime_real,
        mode='markers',
        name='model',
        hovertext=lifetimes_paired.topic
    )
)
fig.update_layout(
    template='plotly_white',
    title='Lifetimes (finite)',
    width=500, height=500,
    # The x values are the model means, so label the axis accordingly.
    xaxis={'title': 'years (model)',
           'range': [0,max_lifetime+100],
           'dtick': 1000},
    yaxis={'title': 'years (real)',
           'range': [0,max_lifetime+100],
           'scaleanchor': 'x',
           'scaleratio': 1,
           'dtick': 1000}
)
fig.show()
if save_fig:
    fig.write_image(os.path.join(path_result, path_plot, simulation, 'finite.pdf'))

### Infinite lifetimes

In [None]:
# `import scipy` alone does not guarantee that the scipy.stats submodule is
# loaded, so import it explicitly.
from scipy import stats

# Number of infinite-lifetime bars per topic, for the empirical networks
# (`reals`) and pooled over each topic's models (`models`).
reals = []
models = []
for topic in topics:
    reals.append(barcodes[(barcodes.lifetime==np.inf) &
                          (barcodes.topic==topic) &
                          (barcodes.type=='real')].shape[0])
    models.append(barcodes_models[(barcodes_models.lifetime==np.inf) &
                                  (barcodes_models.topic==topic)].shape[0])
# Two-sample t-test between the two lists of per-topic counts.
t_models, p_models = stats.ttest_ind(reals, models)
t_models, p_models

In [None]:
import plotly.figure_factory as ff

In [None]:
import os

path_plot = '3 model lifetimes'

# The figure below is written to <path_result>/<path_plot>/<simulation>/, so
# create that whole chain rather than only its parent folder.
os.makedirs(os.path.join(path_result, path_plot, simulation), exist_ok=True)

In [None]:
# Histogram of the per-topic infinite-lifetime counts, models versus the
# empirical networks.
hist_series = [models, reals]
hist_labels = ['models', 'real']
fig = ff.create_distplot(
    hist_series,
    hist_labels,
    bin_size=300,
    show_curve=False,
    colors=['#d62728','#1f77b4'],
)
fig.update_layout(
    template='plotly_white',
    title_text='Lifetimes (infinite)',
    xaxis=dict(title='count'),
    yaxis=dict(title='probability'),
)
fig.show()
if save_fig:
    out_file = os.path.join(path_result, path_plot, simulation, 'infinite.pdf')
    fig.write_image(out_file)

### Dimensionality

In [None]:
# Stack the empirical bars and the model bars into one table. The model
# index is stored in the 'null' column so both parts share the same columns.
real_bars = barcodes[barcodes.type=='real']
model_bars = barcodes_models.assign(null=barcodes_models.model).drop('model', axis=1)
combined = pd.concat([real_bars, model_bars], ignore_index=True)

In [None]:
# Number of non-zero-lifetime bars per (type, topic, dimension).
nonzero_bars = combined[combined.lifetime != 0]
counts = (
    nonzero_bars
    .assign(count=1)
    .groupby(['type','topic','dim'], as_index=False)['count']
    .sum()
    .sort_values('type', axis=0, ascending=True)
)
counts

In [None]:
# Number of models per topic, taken as the largest model index among bars
# with non-zero lifetime, plus one (indices start at 0).
nulls = (
    barcodes_models[barcodes_models.lifetime != 0]
    .groupby(['topic'], as_index=False)['model']
    .max()
)
nulls['model'] = nulls['model'] + 1
nulls

In [None]:
# Attach the per-(type, topic, dim) bar counts to the per-topic model count.
nulls = pd.merge(nulls, counts,
                 how='left', left_on=['topic'], right_on=['topic'])
# Model counts are summed over all models of a topic, so divide them by the
# number of models to get an average per model. Rows of type 'real' describe
# a single empirical network and must stay as they are; dividing them too
# would shrink the real counts by the number of model runs.
nulls['count'] = nulls['count'].where(
    nulls['type'] == 'real',
    nulls['count'] / nulls['model']
)
nulls

In [None]:
# Sub-folder of path_result that receives the dimensionality figure.
path_plot = '3 model dimensionality'

In [None]:
# Create the figure folder; makedirs also creates missing parents and
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot), exist_ok=True)

In [None]:
# Create the per-simulation figure folder (and any missing parent);
# exist_ok makes the cell safe to re-run.
os.makedirs(os.path.join(path_result, path_plot, simulation), exist_ok=True)

In [None]:
# Box plot of bar counts per homology dimension, one box per type.
fig = px.box(nulls, x='dim', y='count', color='type')
fig.update_layout(
    template='plotly_white',
    title_text='Dimensionality',
    yaxis=dict(range=[0,2000]),
)
fig.update_traces(marker=dict(size=4))
fig.show()
if save_fig:
    out_file = os.path.join(path_result, path_plot, simulation, 'dimensionality.pdf')
    fig.write_image(out_file)