Persistent homology examples

* Ripser [paper](https://www.theoj.org/joss-papers/joss.00925/10.21105.joss.00925.pdf) [code](https://github.com/scikit-tda/ripser.py) (fast)
* Dionysus 2 [code](https://mrzv.org/software/dionysus2/) (representative examples)
* Nico's [code](https://github.com/nhchristianson/Math-text-semantic-networks)
* Ann's [code](https://github.com/asizemore/PH_tutorial/blob/master/Tutorial_day1.ipynb)

# Load data

## Load networks

In [None]:
%reload_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))

In [None]:
# Topic names analysed in this notebook. Each name is used verbatim to build
# the graph/barcode file names loaded in the cells below.
topics = [
    'anatomy', 'biochemistry', 'cognitive science', 'evolutionary biology',
    'genetics', 'immunology', 'molecular biology', 'chemistry', 'biophysics',
    'energy', 'optics', 'earth science', 'geology', 'meteorology',
    'philosophy of language', 'philosophy of law', 'philosophy of mind',
    'philosophy of science', 'economics', 'accounting', 'education',
    'linguistics', 'law', 'psychology', 'sociology', 'electronics',
    'software engineering', 'robotics',
    'calculus', 'geometry', 'abstract algebra',
    'Boolean algebra', 'commutative algebra', 'group theory', 'linear algebra',
    'number theory', 'dynamical systems and differential equations'
]

In [None]:
import wiki

path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# Build one wiki.Net per topic from its '<topic>.pickle' graph path and
# '<topic>.barcode' barcode path; topic names are echoed as progress output.
networks = {}
for topic in topics:
    print(topic, end=' ')
    stem = path_saved + topic
    networks[topic] = wiki.Net(
        path_graph=stem + '.pickle',
        path_barcodes=stem + '.barcode',
    )

In [None]:
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-target/'
num_nulls = 10
# For every topic, load `num_nulls` networks from the null-target folder
# (files '<topic>-null-<k>.pickle' / '.barcode').
null_targets = {}
for topic in topics:
    print(topic, end=' ')
    null_targets[topic] = [
        wiki.Net(
            path_graph=f"{path_null}{topic}-null-{k}.pickle",
            path_barcodes=f"{path_null}{topic}-null-{k}.barcode",
        )
        for k in range(num_nulls)
    ]

In [None]:
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-year/'
num_nulls = 10
# For every topic, load `num_nulls` networks from the null-year folder
# (files '<topic>-null-<k>.pickle' / '.barcode').
null_years = {}
for topic in topics:
    print(topic, end=' ')
    null_years[topic] = [
        wiki.Net(
            path_graph=f"{path_null}{topic}-null-{k}.pickle",
            path_barcodes=f"{path_null}{topic}-null-{k}.barcode",
        )
        for k in range(num_nulls)
    ]

## Load models

In [None]:
# Name of the simulation session folder to analyse. The variable is assigned
# three times in a row, so only the LAST value ('20200820_1919') is in effect
# when the whole cell runs; the earlier lines are kept as a record of other
# session names.
simulation = '20200422_1318'
simulation = '20200520_2057'
simulation = '20200820_1919'

In [None]:
base_dir = os.path.join('/', 'Users', 'harangju', 'Developer', 'data', 'wiki', 'simulations')
session_dir = os.path.join(base_dir, simulation)

In [None]:
filenames = sorted(os.listdir(session_dir))
filenames[:3]

In [None]:
filenames[-3:]

In [None]:
# Unique second '_'-separated field of every file whose first field is 'model'.
model_topics = list({
    parts[1]
    for parts in (name.split('_') for name in filenames)
    if parts[0] == 'model'
})
model_topics[:3]

In [None]:
# topic -> full paths of its 'model_<topic>_...' files, in sorted filename order.
# NOTE(review): the last entry of `filenames` is deliberately skipped
# (`[:-1]`), as in the original — confirm what that file is.
model_paths = {topic: [] for topic in model_topics}
for name in filenames[:-1]:
    parts = name.split('_')
    if parts[0] == 'model' and parts[1] in model_paths:
        model_paths[parts[1]].append(os.path.join(session_dir, name))

In [None]:
{topic: model_paths[topic] for topic in model_topics[:1]}

# Compute barcodes

To reuse previously computed results instead of recomputing them, skip to the "Load" section below, which loads `barcodes` from disk.

In [None]:
import pandas as pd
pd.options.display.max_rows = 12

In [None]:
import dill


def _tagged(frame, topic, kind, null):
    """Return a copy of `frame` with `topic`, `type` and `null` label columns added."""
    return frame.assign(topic=topic).assign(type=kind).assign(null=null)


def _model_barcodes(path):
    """Load a dill-serialised model from `path` and return its `barcodes` attribute.

    The file is opened in a `with` block so the handle is closed right after
    loading instead of being left to the garbage collector.
    NOTE: dill/pickle can execute arbitrary code; only load files you produced.
    """
    with open(path, 'rb') as handle:
        return dill.load(handle).barcodes


# Stack the barcodes of the empirical networks, both families of null networks
# and the simulated models into one long table, labelled by topic / type / null index.
barcodes = pd.concat(
    [
        _tagged(network.barcodes, topic, 'real', 0)
        for topic, network in networks.items()
    ] +
    [
        _tagged(network.barcodes, topic, 'null_targets', i)
        for topic, nulls in null_targets.items()
        for i, network in enumerate(nulls)
    ] +
    [
        _tagged(network.barcodes, topic, 'null_years', i)
        for topic, nulls in null_years.items()
        for i, network in enumerate(nulls)
    ] +
    [
        _tagged(_model_barcodes(path), topic, 'null_genetic', i)
        for topic, paths in model_paths.items()
        for i, path in enumerate(paths)
    ],
    ignore_index=True, sort=False)

In [None]:
barcodes = barcodes[barcodes.lifetime!=0]

In [None]:
barcodes

## Save

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Write inside a `with` block so the file is flushed and closed even if
# pickling raises part-way through.
with open(os.path.join(path_analysis, f"barcodes_{simulation}.pickle"), 'wb') as handle:
    pickle.dump(barcodes, handle)

## Load

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Read inside a `with` block so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code; only load files you produced.
with open(os.path.join(path_analysis, f"barcodes_{simulation}.pickle"), 'rb') as handle:
    barcodes = pickle.load(handle)

# Plotting functions

In [None]:
import os
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
# path_fig = '/Users/harangju/Box Sync/Research/my papers/wikipedia/results/'
path_fig = os.path.join(
    '/', 'Users', 'harangju', 'Library', 'Mobile Documents', 'com~apple~CloudDocs', 
    'Documents', 'research', 'wikipedia', 'results'
)

In [None]:
import numpy as np
import pandas as pd
import plotly.offline as po
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
po.init_notebook_mode(connected=True)

def plot_barcodes(barcodes, title, inf_death=2050, show_deaths=False):
    """Draw a barcode plot with one horizontal bar per row of `barcodes`.

    Args:
        barcodes: DataFrame with `dim`, `birth` and `death` columns. Each
            row's index label is used as the bar's vertical position.
        title: Figure title.
        inf_death: x value at which bars with `death == np.inf` are ended.
            The x axis is fixed to [0, 2040], so with the default of 2050
            such bars run off the right edge of the plot.
        show_deaths: If True, add a marker trace at the death of every
            finite bar. Off by default.

    Returns:
        The plotly figure; it is also displayed via `fig.show()`.
    """
    fig = go.Figure()
    dims = pd.unique(barcodes.dim)
    x = {dim: [] for dim in dims}
    y = {dim: [] for dim in dims}
    deaths_x, deaths_y = [], []
    for i, row in barcodes.iterrows():
        dim = row['dim']
        finite = row['death'] != np.inf
        death = row['death'] if finite else inf_death
        # The trailing None breaks the line so each bar is a separate segment.
        x[dim].extend([row['birth'], death, None])
        y[dim].extend([i, i, None])
        if finite and show_deaths:
            deaths_x.append(death)
            deaths_y.append(i)
    for dim in dims:
        fig.add_trace(go.Scatter(x=x[dim], y=y[dim],
                                 mode='lines',
                                 name=f"dim={dim}"))
    if show_deaths:
        fig.add_trace(go.Scatter(x=deaths_x, y=deaths_y, mode='markers',
                                 marker={'color': 'black', 'size': 1},
                                 name='deaths'))
    fig.update_layout(template='plotly_white',
                      width=600, height=500,
                      title_text=f"{title}",
                      xaxis={'title': 'year', 'range': [0, 2040]},
                      yaxis={'title': '', 'tickvals': []})
    fig.show()
    return fig

In [None]:
def plot_persistence_diagram(barcodes):
    """Plot death against birth for every bar, one point series per dimension.

    Bars with `death == np.inf` are drawn at death = 2030, and a dashed
    horizontal line at 2030 marks that level. The x extent of that line
    ([min birth, max finite death]) is also printed.

    Args:
        barcodes: DataFrame with `dim`, `birth` and `death` columns. It is
            not modified.
    """
    # NOTE(review): `plt` is not imported in the visible part of this
    # notebook; presumably matplotlib.pyplot — confirm it is in scope.
    inf_death = 2030
    plt.figure(figsize=(10,10))
    for dim in set(barcodes['dim']):
        # Work on a copy: assigning into a slice of `barcodes` raises
        # SettingWithCopyWarning and may or may not touch the caller's frame.
        data = barcodes.loc[barcodes['dim']==dim].copy()
        data.loc[data['death']==np.inf,'death'] = inf_death
        plt.plot(data['birth'], data['death'], '.')
    x = [barcodes['birth'].min(),
         barcodes.loc[barcodes['death']!=np.inf,'death'].max()]
    print(x)
    plt.plot(x, [inf_death, inf_death], '--')

In [None]:
def plot_betti(barcodes, title):
    """Plot Betti curves: the number of bars alive in each year, per dimension.

    A bar counts as alive in the half-open interval [birth, death), with
    births and deaths truncated to integers. Bars with `death == np.inf` are
    treated as alive until the largest finite death.

    Args:
        barcodes: DataFrame with `dim`, `birth` and `death` columns. Any
            index is accepted; rows are addressed by position.
        title: Figure title.

    Returns:
        The plotly figure; it is also displayed via `fig.show()`.
    """
    fig = go.Figure()
    year_min = int(np.min(barcodes.birth))
    year_max = int(np.max(barcodes[barcodes.death!=np.inf].death))
    n_bars = len(barcodes.index)
    counts = np.zeros((n_bars, year_max-year_min))
    dims = np.zeros(n_bars)
    # Enumerate positions rather than using the index label as a row number:
    # labels are only 0..n-1 when the caller has reset the index.
    for pos, (_, row) in enumerate(barcodes.iterrows()):
        birth = int(row['birth'])
        death = int(row['death']) if row['death']!=np.inf else year_max
        counts[pos,birth-year_min:death-year_min] = 1
        dims[pos] = row['dim']
    # x is plotted as years elapsed since the earliest birth, starting at 1.
    years = np.arange(year_min, year_max) - year_min + 1
    for dim in pd.unique(barcodes.dim):
        betti = np.sum(counts[dims==dim,:], axis=0)
        fig.add_trace(go.Scatter(x=years,
                                 y=betti,
                                 mode='lines',
                                 name=f"dim={dim}"))
    fig.update_layout(template='plotly_white',
                      title_text=f"{title}",
                      xaxis={'title': 'year',
                             'type': 'linear'},
                      yaxis={'title': 'count',
                             'type': 'log'})
    fig.show()
    return fig

# Barcodes

In [None]:
import os

path_plot = '2 barcodes'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
for topic in ['biophysics']:#topics:
    # Drop zero-lifetime bars and renumber rows so bars are stacked 0..n-1.
    topic_bars = networks[topic].barcodes
    nonzero = topic_bars[topic_bars.lifetime!=0]
    nonzero = nonzero.reset_index().drop('index', axis=1)
    figure = plot_barcodes(nonzero, f"Topic: {topic} (empirical)")
    figure.write_image(os.path.join(path_fig, path_plot, f"{topic}_empirical.pdf"))
#     plot_barcodes(null_targets[topic][0].barcodes[null_targets[topic][0].barcodes.lifetime!=0],
#                   f"Topic: {topic} (target-rewired)")#\
#         .write_image(os.path.join(path_fig, path_plot, f"{topic}_target.pdf"))
#     plot_barcodes(null_years[topic][0].barcodes[null_years[topic][0].barcodes.lifetime!=0],
#                   f"Topic: {topic} (year-reordered)")#\
#         .write_image(os.path.join(path_fig, path_plot, f"{topic}_year.pdf"))

# Betti curves

In [None]:
import os

path_plot = '2 betti'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
for topic in ['biochemistry']:#topics:
    # Empirical network: plotted and saved to PDF.
    b = networks[topic].barcodes
    figure = plot_betti(b[b.lifetime!=0].reset_index(), f"{topic} (empirical)")
    figure.write_image(os.path.join(path_fig, path_plot, f"{topic}_empirical.pdf"))
    # First target-rewired null: plotted only (saving is disabled).
    b = null_targets[topic][0].barcodes
    plot_betti(b[b.lifetime!=0].reset_index(), f"{topic} (target-rewired)")
    # First year-reordered null: plotted only (saving is disabled).
    b = null_years[topic][0].barcodes
    plot_betti(b[b.lifetime!=0].reset_index(), f"{topic} (year-reordered)")

# Lifetime (finite)

In [None]:
from scipy import stats

In [None]:
# Per topic, compare finite non-zero lifetimes of the real network against
# each null family with an independent two-sample t-test.
# Rows are collected in a list and turned into a DataFrame once, instead of
# re-concatenating onto an initially empty frame on every iteration.
null_labels = [
    ('null_targets', 'targets'),
    ('null_years', 'years'),
    ('null_genetic', 'genetic'),
]
lifetime_rows = []
for topic in topics:
    data = barcodes[barcodes.topic==topic]
    data = data[(data.lifetime!=np.inf) & (data.lifetime!=0)]
    real_lifetimes = data[data.type=='real']['lifetime'].values
    row = {'topic': topic}
    for null_type, label in null_labels:
        t_stat, p_value = stats.ttest_ind(
            real_lifetimes,
            data[data.type==null_type]['lifetime'].values
        )
        row[f"t ({label})"] = t_stat
        row[f"p ({label})"] = p_value
    lifetime_rows.append(row)
lifetime = pd.DataFrame(
    lifetime_rows,
    columns=[
        'topic', 't (targets)', 'p (targets)',
        't (years)', 'p (years)',
        't (genetic)', 'p (genetic)'
    ]
)

In [None]:
pd.options.display.max_rows = 37
lifetime

In [None]:
# Mean of the numeric columns per (topic, type) over finite, non-zero bars.
# numeric_only=True is required on pandas >= 2.0, where GroupBy.mean() raises
# a TypeError if the frame contains non-numeric columns.
barcodes_mean = barcodes[
    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)]\
    .groupby(['topic', 'type'], as_index=False)\
    .mean(numeric_only=True)\
    .drop(['dim','birth','death','null'], axis=1)
barcodes_mean

In [None]:
import os

path_plot = '2 lifetimes'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
null_types = [
    'null_targets', 'null_genetic'
]

In [None]:
# Scatter of mean finite lifetime per topic: null (x) against real (y),
# with a dashed 1:1 reference line.
fig = go.Figure()
max_lifetime = np.max(barcodes_mean.lifetime) + 10
fig.add_trace(
    go.Scatter(
        x=[0,max_lifetime],
        y=[0,max_lifetime],
        mode='lines',
        line=dict(dash='dash'),
        name='1:1'
    )
)
real_mean = barcodes_mean[barcodes_mean.type=='real'][['topic', 'lifetime']]
for null_type in null_types:
    null_mean = barcodes_mean[barcodes_mean.type==null_type][['topic', 'lifetime']]
    # Pair real and null values by topic. Pairing by row position silently
    # mismatches points (or lengths) when a null family lacks some topics.
    paired = pd.merge(real_mean, null_mean, on='topic',
                      suffixes=('_real', '_null'))
    fig.add_trace(
        go.Scatter(
            x=paired['lifetime_null'],
            y=paired['lifetime_real'],
            mode='markers',
            name=null_type,
            hovertext=paired['topic']
        )
    )
fig.update_layout(template='plotly_white',
                  title='Lifetimes (finite)',
                  width=500, height=500,
                  xaxis={'title': 'years (null)',
                         'range': [0,max_lifetime+100],
                         'dtick': 1000},
                  yaxis={'title': 'years (real)',
                         'range': [0,max_lifetime+100],
                         'scaleanchor': 'x',
                         'scaleratio': 1,
                         'dtick': 1000})
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'finite.pdf'))

In [None]:
import scipy as sp
# Pooled over all topics: compare finite non-zero lifetimes of the real
# networks against each null family (two-sided KS test and equal-variance t-test).
finite_bars = barcodes[(barcodes.lifetime!=0) & (barcodes.lifetime!=np.inf)]
real_lifetimes = finite_bars[finite_bars.type=='real'].lifetime
for null_type in ['null_targets', 'null_genetic']:
    null_lifetimes = finite_bars[finite_bars.type==null_type].lifetime
    ks, p_ks = sp.stats.ks_2samp(
        real_lifetimes, null_lifetimes, alternative='two-sided'
    )
    t, p_t = sp.stats.ttest_ind(
        real_lifetimes, null_lifetimes, equal_var=True
    )
    print(null_type, f"ks={ks}, p={p_ks}; t={t}, p={p_t}")

In [None]:
# Violin plot of FINITE lifetimes per type. Infinite lifetimes must be
# excluded as well as zero ones; the original filter only dropped zeros, so
# np.inf values were passed to a plot titled 'Lifetimes (finite)'.
fig = px.violin(
    barcodes[
        (barcodes.lifetime!=0) &
        (barcodes.lifetime!=np.inf) &
        (barcodes.type!='null_years')
    ],
    x='type', y='lifetime'
)
fig.update_layout(
    height=400, width=460,
    template='plotly_white',
    title_text='Lifetimes (finite)',
    xaxis={'title': ''},
    yaxis={
        'title': 'number',
        'type': '-',
    }
)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'finite_violin.pdf'))

In [None]:
# Empirical cumulative frequency of finite, non-zero lifetimes per type:
# for each grid value, the fraction of bars whose lifetime is strictly smaller.
lifetimes = barcodes[
    (barcodes.lifetime!=0) &
    (barcodes.lifetime!=np.inf)
]
null_types = [
    'real', 'null_targets', 'null_genetic'
]
lifetime_range = np.arange(
    np.min(lifetimes.lifetime),
    np.max(lifetimes.lifetime)
)
cum_freq = {}
for null_type in null_types:
    print(null_type)
    # Sort once and count with searchsorted instead of re-filtering the frame
    # for every grid value. This also avoids a loop variable named `lifetime`,
    # which overwrote the `lifetime` statistics table computed earlier.
    sorted_values = np.sort(
        lifetimes[lifetimes.type==null_type].lifetime.to_numpy()
    )
    # side='left' gives the number of values strictly below each grid point.
    cum_freq[null_type] = np.searchsorted(
        sorted_values, lifetime_range, side='left'
    ) / sorted_values.size

In [None]:
# One cumulative-frequency curve per entry of `null_types`, log-scaled x axis.
curves = [
    go.Scatter(x=lifetime_range, y=cum_freq[kind], name=kind)
    for kind in null_types
]
fig = go.Figure(data=curves)
layout_options = {
    'width': 400,
    'height': 400,
    'template': 'plotly_white',
    'yaxis': {'title': 'cumulative frequency'},
    'xaxis': {'title': 'finite lifetime', 'type': 'log'},
    'legend': {'x': 0, 'y': 1},
}
fig.update_layout(**layout_options)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'cum_freq_fin.pdf'))

# Lifetime (infinite)

In [None]:
barcodes[
    (barcodes.lifetime==np.inf) &
    (barcodes.topic=='biochemistry') &
    (barcodes.type=='real')
].shape

In [None]:
import scipy as sp
# Per topic, count bars with infinite lifetime for each barcode type.
infinite_bars = barcodes[barcodes.lifetime==np.inf]
reals, targets, years, genetics = [], [], [], []
counts_by_type = {
    'real': reals,
    'null_targets': targets,
    'null_years': years,
    'null_genetic': genetics,
}
for topic in topics:
    in_topic = infinite_bars[infinite_bars.topic==topic]
    for kind, bucket in counts_by_type.items():
        bucket.append(in_topic[in_topic.type==kind].shape[0])

In [None]:
t_targets, p_targets = sp.stats.ttest_ind(reals, targets)
t_years, p_years = sp.stats.ttest_ind(reals, years)
t_genetic, p_genetic = sp.stats.ttest_ind(reals, genetics)
t_targets, p_targets, t_years, p_years, t_genetic, p_genetic

In [None]:
ks_targets, p_targets = sp.stats.ks_2samp(reals, targets, alternative='two-sided')
ks_years, p_years = sp.stats.ks_2samp(reals, years, alternative='two-sided')
ks_genetic, p_genetic = sp.stats.ks_2samp(reals, genetics, alternative='two-sided')
ks_targets, p_targets, ks_years, p_years, ks_genetic, p_genetic

In [None]:
import plotly.figure_factory as ff

In [None]:
import os

path_plot = '2 lifetimes'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
# Histogram (no KDE curve) of the per-topic counts of infinite-lifetime bars
# for the target-rewired nulls, the genetic-model nulls and the real networks.
fig = ff.create_distplot(
    [targets, genetics, reals],
    ['null targets', 'null genetics', 'real'],
    # NOTE(review): this evaluates to [0.5**10, 1.5**10, ..., 7.5**10]
    # (8 values for 3 datasets). Log-spaced bins such as
    # `10 ** np.arange(.5, 8)` may have been intended — confirm.
    bin_size=np.arange(.5, 8) ** 10, #1000,
    show_curve=False,
    colors=['#2ca02c', '#d62728', '#1f77b4']
)
fig.update_layout(
    width=600,
    template='plotly_white',
    title_text='Lifetimes (infinite)',
    xaxis={'title': 'count', 'type': 'log'},
    yaxis={'title': 'probability'}
)
fig.show()
# Saved next to the other lifetime figures under path_fig/path_plot.
fig.write_image(os.path.join(path_fig, path_plot, 'infinite.pdf'))

In [None]:
fig = px.violin(
    pd.DataFrame({
        'value': reals + targets + genetics,
        'type': len(reals)*['real'] + len(targets)*['target']\
            + len(genetics)*['genetic']
    }).replace(0, 0.01),
    x='type', y='value'
)
fig.update_layout(
    height=400, width=460,
    template='plotly_white',
    title_text='Lifetimes (infinite)',
    xaxis={'title': ''},
    yaxis={
        'title': 'number',
        'type': '-',
#         'range': [-1, 6]
    }
)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'infinite_violin.pdf'))

In [None]:
# Cumulative frequency of the per-topic infinite-bar counts: for each grid
# value, the fraction of topics whose count is strictly smaller.
inf_lifetimes = {
    'real': reals,
    'null_targets': targets,
    'null_genetic': genetics
}
null_types = [
    'real', 'null_targets', 'null_genetic'
]
inf_lifetime_range = np.arange(
    0, np.max(reals + targets + genetics)
)
cum_freq = {}
for null_type in null_types:
    print(null_type)
    observed = inf_lifetimes[null_type]
    # `observed` is a plain list; comparing it with a numpy integer from the
    # grid yields an element-wise boolean array.
    cum_freq[null_type] = [
        np.sum(observed < threshold) / len(observed)
        for threshold in inf_lifetime_range
    ]

In [None]:
# One cumulative-frequency curve per entry of `null_types`, log-scaled x axis.
curves = [
    go.Scatter(x=inf_lifetime_range, y=cum_freq[kind], name=kind)
    for kind in null_types
]
fig = go.Figure(data=curves)
layout_options = {
    'width': 400,
    'height': 400,
    'template': 'plotly_white',
    'yaxis': {'title': 'cumulative frequency'},
    'xaxis': {'title': 'infinite lifetimes', 'type': 'log'},
    'legend': {'x': .5, 'y': .2},
}
fig.update_layout(**layout_options)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'cum_freq_inf.pdf'))

# Dimensionality

## Compute

In [None]:
pd.options.display.max_rows = 5

In [None]:
counts = barcodes[barcodes.lifetime!=0]\
    .assign(count=1)\
    .groupby(['type','topic','dim'], as_index=False)['count']\
    .sum()\
    .sort_values('type', axis=0, ascending=True)
counts

In [None]:
nulls = barcodes[barcodes.lifetime!=0]\
    .groupby(['type','topic','dim'], as_index=False)['null'].max()
nulls.null = nulls.null + 1
nulls

In [None]:
# Attach the bar counts to the per-(type, topic, dim) table of replicate
# numbers and shorten the type labels.
nulls = pd.merge(
    nulls, counts,
    how='left', left_on=['type','topic','dim'],
    right_on=['type','topic','dim']
).replace(
    ['null_targets','null_years', 'null_genetic'],
    ['targets', 'years', 'genetics']
).sort_values(by='type')
# `nulls.null` holds (largest null index + 1) for the group, so this turns the
# summed count into an average per replicate (for 'real' rows the divisor is 1).
nulls['count'] = nulls['count'] / nulls.null
nulls

## Statistics

In [None]:
import scipy as sp
# Per homology dimension, compare the per-topic bar counts of the real
# networks with each null family (t-test and two-sided KS test).
null_labels = ['years', 'targets', 'genetics']
dim_rows = []
for dim in sorted(pd.unique(nulls.dim)):
    nulls_dim = nulls[nulls.dim==dim]
    real_counts = nulls_dim[nulls_dim.type=='real']['count'].values
    row = {'dim': dim}
    for label in null_labels:
        null_counts = nulls_dim[nulls_dim.type==label]['count'].values
        row[f"t_{label}"], row[f"p_{label}"] = sp.stats.ttest_ind(
            real_counts, null_counts
        )
        # ks_2samp raises on an empty sample; (-1, -1) marks "not computed".
        # Both samples are checked (the original only checked the null one).
        if len(real_counts) > 0 and len(null_counts) > 0:
            row[f"ks_{label}"], row[f"ks_p_{label}"] = sp.stats.ks_2samp(
                real_counts, null_counts, alternative='two-sided'
            )
        else:
            row[f"ks_{label}"], row[f"ks_p_{label}"] = (-1, -1)
    dim_rows.append(row)
# Build the table once, so rows get a 0..n-1 index. The original concatenated
# without ignore_index and every row ended up with index 0.
dim_stat = pd.DataFrame(
    dim_rows,
    columns=[
        'dim',
        't_years', 'p_years', 't_targets', 'p_targets', 't_genetics', 'p_genetics',
        'ks_years', 'ks_p_years', 'ks_targets', 'ks_p_targets',
        'ks_genetics', 'ks_p_genetics'
    ]
)

In [None]:
dim_stat

## Plot

In [None]:
import os

path_plot = '2 dimensionality'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
# Box plot of per-topic bar counts by dimension for the real, target-rewired
# and genetic-model networks.
# A row has exactly one type, so the three conditions must be combined with
# OR (isin). ANDing them, as the original did, selects no rows at all and
# produces an empty plot.
fig = px.box(
    nulls[nulls['type'].isin(['real', 'targets', 'genetics'])],
    x='dim', y='count', color='type'
)
fig.update_layout(
    template='plotly_white',
    title_text='Dimensionality',
    yaxis={'type': 'log'}
)
fig.update_traces(marker={'size': 4})
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'dimensionality.pdf'))

## Mean dimensionality

In [None]:
barcodes

In [None]:
counts = barcodes[barcodes.lifetime!=0]\
    .assign(count=1)\
    .groupby(['type', 'topic', 'dim'], as_index=False)['count']\
    .sum()
counts

In [None]:
# Pooled over all topics: compare the dimensions of non-zero-lifetime bars of
# the real networks with each null family.
barcodes = barcodes[barcodes.lifetime!=0]
real_dims = barcodes[barcodes.type=='real'].dim
for null_type in ['null_targets', 'null_genetic']:
    null_dims = barcodes[barcodes.type==null_type].dim
    ks, p_ks = sp.stats.ks_2samp(
        real_dims, null_dims, alternative='two-sided'
    )
    t, p_t = sp.stats.ttest_ind(
        real_dims, null_dims, equal_var=True
    )
    print(null_type, f"ks={ks}, p={p_ks}, t={t}, p={p_t}")

In [None]:
fig = px.violin(
    barcodes[barcodes.type!='null_years'], x='type', y='dim'
)
fig.update_layout(
    height=400, width=460,
    template='plotly_white',
    title_text='Dimensionality'
)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'dimensionality_violin.pdf'))

In [None]:
# Cumulative frequency of bar dimension per type: for each observed
# dimension, the fraction of bars with a strictly smaller dimension.
dims = barcodes[(barcodes.lifetime!=0)]
null_types = ['real', 'null_targets', 'null_genetic']
dim_range = np.unique(dims.dim)
cum_freq = {}
for null_type in null_types:
    print(null_type)
    of_type = dims[dims.type==null_type]
    cum_freq[null_type] = [
        np.sum(of_type.dim < threshold) / len(of_type.index)
        for threshold in dim_range
    ]

In [None]:
# One cumulative-frequency line per entry of `null_types`.
curves = [
    go.Scatter(x=dim_range, y=cum_freq[kind], name=kind, mode='lines')
    for kind in null_types
]
fig = go.Figure(data=curves)
layout_options = {
    'width': 400,
    'height': 400,
    'template': 'plotly_white',
    'yaxis': {'title': 'cumulative frequency'},
    'xaxis': {'title': 'dim'},
    'legend': {'x': .5, 'y': .2},
}
fig.update_layout(**layout_options)
fig.show()
fig.write_image(os.path.join(path_fig, path_plot, 'cum_freq_dim.pdf'))

# Lifetime vs Cavity volume

## Compute cavity volume

Useful resources
* [Computational topology](https://books.google.com/books?id=MDXa6gFRZuIC&printsec=frontcover#v=onepage&q=%22persistent%20homology%22&f=true)
* [tutorial](http://pages.cs.wisc.edu/~jerryzhu/pub/cvrghomology.pdf)

In [None]:
import pickle
import numpy as np
import gensim.utils as gu
import gensim.matutils as gmat
import sklearn.metrics.pairwise as smp

In [None]:
# For every bar, compute the mean cosine distance between the tf-idf columns
# of its homology nodes and their centroid, stored in 'average distance'.

# Barcode type -> how to find the network the bar was computed from.
# Types without an entry (e.g. 'null_genetic', whose networks are not loaded
# in this notebook) are skipped and keep NaN. The original sent every such row
# to `null_years`, i.e. to the wrong network.
network_for_type = {
    'real': lambda topic, k: networks[topic],
    'null_targets': lambda topic, k: null_targets[topic][k],
    'null_years': lambda topic, k: null_years[topic][k],
}
# Work on an independent copy: `barcodes` may be a filtered slice, and writing
# into a slice triggers SettingWithCopyWarning.
barcodes = barcodes.copy()
barcodes['average distance'] = np.nan
n_rows = len(barcodes.index)
# `step` is a running counter; index labels are not contiguous after filtering.
for step, (i, row) in enumerate(barcodes.iterrows(), start=1):
    sys.stdout.write("\rindex: " + str(step) + '/' + str(n_rows))
    sys.stdout.flush()
    find_network = network_for_type.get(row['type'])
    if find_network is None:
        continue
    network = find_network(row['topic'], int(row['null']))
    tfidf = network.graph.graph['tfidf']
    indices = [network.nodes.index(n) for n in row['homology nodes']]
    if indices:
        members = tfidf[:,indices]
        centroid = members.mean(axis=1)
        distances = smp.cosine_distances(X=members.transpose(),
                                         Y=centroid.transpose())
    else:
        # A bar without nodes gets distance 0, as in the original.
        distances = [0]
    barcodes.loc[i,'average distance'] = np.mean(distances)

In [None]:
barcodes

## Save

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Write inside a `with` block so the file is flushed and closed even if
# pickling raises part-way through.
with open(f"{path_analysis}/barcode_volume.pickle",'wb') as handle:
    pickle.dump(barcodes, handle)

## Load

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Read inside a `with` block so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code; only load files you produced.
with open(f"{path_analysis}/barcode_volume.pickle",'rb') as handle:
    barcodes = pickle.load(handle)

In [None]:
barcodes

## Compute regression

In [None]:
import scipy as sp
import scipy.stats

# Per topic, Pearson correlation between 'average distance' and lifetime for
# the finite, non-zero bars of the real network.
reg_rows = []
for topic in pd.unique(barcodes.topic):
    data = barcodes[(barcodes.topic==topic) & (barcodes.type=='real') &
                    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)]
    x = data['average distance'].values
    y = data['lifetime'].values
    # pearsonr raises for fewer than two points. Record NaN instead, so the
    # topic stays in the table and cannot pass a `p < ...` filter by accident.
    r, p = sp.stats.pearsonr(x, y) if len(x) >= 2 else (np.nan, np.nan)
    reg_rows.append([topic, r, p, data.shape[0]])
reg = pd.DataFrame(reg_rows, columns=['topic','r','p','n'])

In [None]:
reg[reg.n>200]

In [None]:
reg[reg.p<0.01]

## Plot

In [None]:
import scipy as sp
import scipy.stats

In [None]:
path_plot = '2 cavity volume'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
# Per topic: scatter of lifetime against average distance to centroid for the
# finite, non-zero bars of the real network, with the least-squares line.
for topic in pd.unique(barcodes.topic):
    data = barcodes[(barcodes.topic==topic) & (barcodes.type=='real') &
                    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)]
    x = data['average distance'].values
    y = data['lifetime'].values
    if len(x) < 2:
        # pearsonr/linregress need at least two points; skip such topics
        # instead of aborting the whole loop.
        print(f"skipping {topic}: only {len(x)} finite bar(s)")
        continue
    r, p = sp.stats.pearsonr(x, y)
    a, b, _, _, _ = sp.stats.linregress(x, y)
    x_line = np.linspace(np.min(x),np.max(x))
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y,
                             mode='markers',
                             marker={'size': 4},
                             showlegend=False))
    fig.add_trace(go.Scatter(x=x_line,
                             y=a*x_line+b,
                             mode='lines',
                             showlegend=False))
    fig.update_layout(template='plotly_white',
                      title_text=f"{topic} (r={r:.2f}, p={p:0.1e})",
                      xaxis={'title': 'average distance to centroid'},
                      yaxis={'title': 'lifetime'})
    fig.show()
    fig.write_image(os.path.join(path_fig, path_plot, f"{topic}.pdf"))

# Lifetime vs Cavity weights

## Compute

In [None]:
import sys
# For every bar, compute the mean 'weight' of the edges in the subgraph
# induced by its homology nodes, stored in 'mean edge weights'.

# Barcode type -> how to find the network the bar was computed from.
# Types without an entry (e.g. 'null_genetic', whose networks are not loaded
# in this notebook) are skipped and keep NaN. The original sent every such row
# to `null_years`, i.e. to the wrong network.
network_for_type = {
    'real': lambda topic, k: networks[topic],
    'null_targets': lambda topic, k: null_targets[topic][k],
    'null_years': lambda topic, k: null_years[topic][k],
}
# Work on an independent copy: `barcodes` may be a filtered slice, and writing
# into a slice triggers SettingWithCopyWarning.
barcodes = barcodes.copy()
barcodes['mean edge weights'] = np.nan
n_rows = len(barcodes.index)
# `step` is a running counter; index labels are not contiguous after filtering.
for step, (i, row) in enumerate(barcodes.iterrows(), start=1):
    sys.stdout.write("\rindex: " + str(step) + '/' + str(n_rows))
    sys.stdout.flush()
    find_network = network_for_type.get(row['type'])
    if find_network is None:
        continue
    network = find_network(row['topic'], int(row['null']))
    subgraph = network.graph.subgraph(row['homology nodes'])
    weights = [subgraph.edges[u,v]['weight'] for u,v in subgraph.edges]
    # A subgraph without edges keeps NaN; np.mean([]) would give the same
    # value but with a RuntimeWarning.
    if weights:
        barcodes.loc[i,'mean edge weights'] = np.mean(weights)

In [None]:
barcodes

## Save

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Write inside a `with` block so the file is flushed and closed even if
# pickling raises part-way through.
with open(f"{path_analysis}/barcode_volume_weights.pickle",'wb') as handle:
    pickle.dump(barcodes, handle)

## Load

In [None]:
import pickle
path_analysis = '/Users/harangju/Developer/data/wiki/analysis/'
# Read inside a `with` block so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code; only load files you produced.
with open(f"{path_analysis}/barcode_volume_weights.pickle",'rb') as handle:
    barcodes = pickle.load(handle)

## Compute regression

In [None]:
import scipy as sp
import scipy.stats

# Pearson correlation between mean edge weight and lifetime for a single
# topic. The result is left in `r` and `p`; nothing is stored or printed.
for topic in ['anatomy']:
    # Finite, non-zero bars of the real network; rows with any NaN are dropped.
    data = barcodes[(barcodes.topic==topic) & (barcodes.type=='real') & 
                    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)].dropna()
    x = data['mean edge weights'].values
    y = data['lifetime'].values
    # With two or fewer rows the correlation is not computed and (0, 0) is
    # used as a placeholder — note that p == 0 here does NOT mean significant.
    r, p = sp.stats.pearsonr(x, y) if len(data['lifetime'])>2 else (0,0)

## Plot

In [None]:
path_plot = '2 cavity weights'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail if the folder already exists (no check-then-create race).
os.makedirs(os.path.join(path_fig, path_plot), exist_ok=True)

In [None]:
import scipy as sp
import scipy.stats

# Per topic: scatter of lifetime against mean edge weight for the finite,
# non-zero bars of the real network, with the least-squares line.
for topic in pd.unique(barcodes.topic):
    data = barcodes[(barcodes.topic==topic) & (barcodes.type=='real') &
                    (barcodes.lifetime!=np.inf) & (barcodes.lifetime!=0)].dropna()
    x = data['mean edge weights'].values
    y = data['lifetime'].values
    if len(x) < 2:
        # pearsonr/linregress need at least two points; skip such topics
        # instead of aborting the whole loop.
        print(f"skipping {topic}: only {len(x)} usable bar(s)")
        continue
    r, p = sp.stats.pearsonr(x, y)
    a, b, _, _, _ = sp.stats.linregress(x, y)
    x_line = np.linspace(np.min(x),np.max(x))
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=x, y=y,
                             mode='markers',
                             marker={'size': 4},
                             showlegend=False))
    fig.add_trace(go.Scatter(x=x_line,
                             y=a*x_line+b,
                             mode='lines',
                             showlegend=False))
    fig.update_layout(template='plotly_white',
                      title_text=f"{topic} (r={r:.2f}, p={p:0.1e})",
                      xaxis={'title': 'average weights'},
                      yaxis={'title': 'lifetime'})
    fig.show()
    fig.write_image(os.path.join(path_fig, path_plot, f"{topic}.pdf"))