### Persistent homology examples

* Ripser [paper](https://www.theoj.org/joss-papers/joss.00925/10.21105.joss.00925.pdf) [code](https://github.com/scikit-tda/ripser.py) (fast)
* Dionysus 2 [code](https://mrzv.org/software/dionysus2/) (representative examples)
* Nico's [code](https://github.com/nhchristianson/Math-text-semantic-networks)
* Ann's [code](https://github.com/asizemore/PH_tutorial/blob/master/Tutorial_day1.ipynb)

In [None]:
%load_ext autoreload
%autoreload 2
import os,sys
sys.path.insert(1, os.path.join(sys.path[0], '..', 'module'))

### Persistent homology for all topics

In [None]:
# Topics whose networks are analysed below; order determines plotting order.
# 'physics' and 'mathematics' are deliberately left out for now.
topics = [
    'anatomy',
    'biochemistry',
    'cognitive science',
    'evolutionary biology',
    'genetics',
    'immunology',
    'molecular biology',
    'chemistry',
    'biophysics',
    'energy',
    'optics',
    'earth science',
    'geology',
    'meteorology',
    'philosophy of language',
    'philosophy of law',
    'philosophy of mind',
    'philosophy of science',
    'economics',
    'accounting',
    'education',
    'linguistics',
    'law',
    'psychology',
    'sociology',
    'electronics',
    'software engineering',
    'robotics',
]

In [None]:
import wiki

# Directory holding one '<topic>.pickle' graph and one '<topic>.barcode' file per topic.
path_saved = '/Users/harangju/Developer/data/wiki/graphs/dated/'

# topic name -> wiki.Net with its graph and barcodes loaded from disk
networks = {}
for topic in topics:
    stem = path_saved + topic
    net = wiki.Net()
    net.load_graph(stem + '.pickle')
    net.load_barcodes(stem + '.barcode')
    networks[topic] = net

In [None]:
# Null models stored as '<topic>-null-<i>.pickle' / '.barcode' under null-target/.
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-target/'
num_nulls = 2

# topic name -> list of `num_nulls` loaded wiki.Net null networks
null_targets = {}
for topic in topics:
    loaded = []
    for i in range(num_nulls):
        stem = path_null + topic + '-null-' + str(i)
        network = wiki.Net()
        network.load_graph(stem + '.pickle')
        network.load_barcodes(stem + '.barcode')
        loaded.append(network)
    null_targets[topic] = loaded

In [None]:
# Null models stored as '<topic>-null-<i>.pickle' / '.barcode' under null-year/.
path_null = '/Users/harangju/Developer/data/wiki/graphs/null-year/'
num_nulls = 2

# topic name -> list of `num_nulls` loaded wiki.Net null networks
null_years = {}
for topic in topics:
    loaded = []
    for i in range(num_nulls):
        stem = path_null + topic + '-null-' + str(i)
        network = wiki.Net()
        network.load_graph(stem + '.pickle')
        network.load_barcodes(stem + '.barcode')
        loaded.append(network)
    null_years[topic] = loaded

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib._color_data as mcd

# Set seaborn's global plotting defaults: 'white' style and fonts scaled 2x.
sns.set(style='white', font_scale=2)
def plot_barcodes(barcodes):
    """
    Draw a persistence barcode in a new 18x6 figure.

    One horizontal bar is drawn per row, from the row's 'birth' to its
    'death', coloured by the row's 'dim'. A bar whose death is np.inf is
    extended to x=2050 (past the dashed grey guide line at x=2040);
    every finite bar gets a red dot at its death value.

    Parameters
    ----------
    barcodes : pandas.DataFrame
        Must provide 'birth', 'death' and 'dim' columns. Each row's index
        label is used as the y position of its bar.
    """
    colors = [mcd.XKCD_COLORS['xkcd:'+c]
          for c in ['emerald green', 'tealish', 'peacock blue', 
                    'grey', 'brown', 'red', 'yellow']]
    plt.figure(figsize=(18,6))
    for i, row in barcodes.iterrows():
        birth = row['birth']
        death = row['death']
        is_infinite = death == np.inf
        x = [birth, 2050] if is_infinite else [birth, death]
        # iterrows() upcasts 'dim' to float when every column is numeric, and
        # a float cannot index a list; cast explicitly. The modulo keeps
        # dimensions beyond the palette from raising IndexError.
        color = colors[int(row['dim']) % len(colors)]
        # Pass the colour by keyword so it is never parsed as a format string.
        plt.plot(x, i*np.ones(len(x)), color=color)
        if not is_infinite:
            plt.plot(death, i, 'r.')
    plt.axvline(x=2040, linestyle='--', color=mcd.XKCD_COLORS['xkcd:grey'])
    # y positions are just row labels, so hide the tick labels.
    plt.gca().axes.yaxis.set_ticklabels([])

In [None]:
def plot_persistence_diagram(barcodes):
    """
    Draw a persistence diagram (birth on x, death on y) in a new 10x10 figure.

    Points are plotted per homology dimension, each dimension in its own
    colour. Bars whose death is np.inf are plotted at death=2030, and a
    dashed horizontal line at y=2030 marks that "infinite" level. The line
    spans from the earliest birth to the latest finite death.

    Parameters
    ----------
    barcodes : pandas.DataFrame
        Must provide 'birth', 'death' and 'dim' columns. It is not modified.
    """
    colors = [mcd.XKCD_COLORS['xkcd:'+c]
      for c in ['emerald green', 'tealish', 'peacock blue', 
                'grey', 'brown', 'red', 'yellow']]
    plt.figure(figsize=(10,10))
    for dim in sorted(set(barcodes['dim'])):
        # Work on an explicit copy: assigning into a .loc slice of `barcodes`
        # triggers SettingWithCopyWarning and must never touch the caller's frame.
        data = barcodes.loc[barcodes['dim']==dim].copy()
        data.loc[data['death']==np.inf,'death'] = 2030
        plt.plot(data['birth'], data['death'], '.',
                 color=colors[int(dim) % len(colors)])
    x = [barcodes['birth'].min(),
         barcodes.loc[barcodes['death']!=np.inf,'death'].max()]
    plt.plot(x, [2030, 2030], '--')

In [None]:
# Cap DataFrame previews at 12 rows, then peek at one topic's barcode table.
pd.set_option('display.max_rows', 12)
networks['biochemistry'].barcodes

In [None]:
# For every topic: show the real barcode on its own, then the first
# null-target and first null-year barcodes together in one output.
for topic in topics:
    print('Topic:', topic)

    plot_barcodes(networks[topic].barcodes)
    plt.title(topic)
    plt.show()

    for label, null_set in (('null-target', null_targets),
                            ('null-year', null_years)):
        plot_barcodes(null_set[topic][0].barcodes)
        plt.title(label)
    plt.show()
#     plt.savefig(path_saved + topic + '.png')

### Lifetimes vs nulls

In [None]:
# Show one example barcode table. Index by the last topic explicitly instead of
# relying on the loop variable `topic` leaking from an earlier cell.
networks[topics[-1]].barcodes

In [None]:
def _tagged_barcodes(network, topic, network_type, null_index):
    """Copy a network's barcode table and tag each row with where it came from."""
    data = network.barcodes.copy()
    data['topic'] = topic
    data['type'] = network_type
    data['null'] = null_index
    return data

# Highest homology dimension seen in any real network.
max_dim = max(max(networks[topic].barcodes.dim.values) for topic in topics)

# Collect every table first and concatenate once: growing a DataFrame with
# pd.concat inside a loop is quadratic, and concatenating onto an empty
# DataFrame is deprecated in recent pandas.
frames = [_tagged_barcodes(network, topic, 'real', -1)
          for topic, network in networks.items()]
frames += [_tagged_barcodes(network, topic, 'null_targets', i)
           for topic, nulls in null_targets.items()
           for i, network in enumerate(nulls)]
barcodes = pd.concat(frames, ignore_index=True)

# count_dim = number of bars sharing this row's (type, topic, dim); note that
# for null_targets this pools all null replicates of a topic together.
barcodes['count_dim'] = barcodes.groupby(['type','topic','dim'])['dim']\
                                .transform('size')

In [None]:
# Lifetime of each bar; stays np.inf for bars whose death is np.inf.
barcodes['lifetime'] = barcodes['death'].sub(barcodes['birth'])

In [None]:
# Inspect the combined table (real and null_targets rows) with its 'lifetime' column.
barcodes

In [None]:
from scipy import stats

In [None]:
# Per topic, compare finite lifetimes of the real network against the
# null_targets networks with an independent two-sample t-test.
for topic in topics:
    keep = (barcodes.topic == topic) & (barcodes.lifetime != np.inf)
    data = barcodes[keep]
    real_lifetimes = data.loc[data.type == 'real', 'lifetime'].values
    null_lifetimes = data.loc[data.type == 'null_targets', 'lifetime'].values
    t, p = stats.ttest_ind(real_lifetimes, null_lifetimes)
    print(topic, '\n\t', 't =', t, '\tp =', p)

In [None]:
# Finite lifetimes only, real vs. null_targets, one split violin per topic.
plt.figure(figsize=(20,6))
data = barcodes.copy()
data = data[data.lifetime!=np.inf]
# Pin the category order to `topics`. Otherwise seaborn orders by first
# appearance in `data`, and a topic with no finite bars would silently shift
# every tick label set below onto the wrong violin.
ax = sns.violinplot(x='topic', y='lifetime', hue='type', data=data, split=True,
                    order=topics)
# ax.set(yscale='log')
plt.xticks(np.arange(len(topics)), topics, rotation='vertical');

In [None]:
# Same comparison, but infinite lifetimes are kept and capped at the largest
# finite lifetime observed anywhere in the table.
plt.figure(figsize=(20,6))
data = barcodes.copy()
data.loc[data.lifetime==np.inf,'lifetime'] = max(data[data.lifetime!=np.inf].lifetime.values)
# Pin the category order to `topics` so the tick labels set below are
# guaranteed to line up with the violins.
sns.violinplot(x='topic', y='lifetime', hue='type', data=data, split=True,
               order=topics)
plt.xticks(np.arange(len(topics)), topics, rotation='vertical');

In [None]:
def _capped_lifetimes(barcode_frame):
    """Return death - birth per bar, with infinite values replaced by the largest finite one."""
    lifetimes = barcode_frame.death.values - barcode_frame.birth.values
    finite = lifetimes != np.inf
    if finite.all():
        return lifetimes
    # Compute the cap once; it was previously re-evaluated for every element.
    cap = max(lifetimes[finite])
    return np.where(finite, lifetimes, cap)

# One figure per topic: lifetime distribution of the real network overlaid
# with the first null-target network. (null-year could be overlaid the same
# way from null_years[topic][0].)
# NOTE: sns.distplot is deprecated since seaborn 0.11; kept so the figures
# stay the same as before.
for topic in topics:
    plt.figure(figsize=(20,4))
    sns.distplot(_capped_lifetimes(networks[topic].barcodes),
                 hist=True, rug=True, label='real')
    sns.distplot(_capped_lifetimes(null_targets[topic][0].barcodes),
                 hist=True, rug=True, label='null-target')
    plt.title(topic)
    plt.legend()

### Dimensions vs nulls

In [None]:
# Bars per (type, topic, dim) group, real vs. null_targets, by dimension.
plt.figure(figsize=(20, 6))
violin_kwargs = dict(x='dim', y='count_dim', hue='type', split=True)
sns.violinplot(data=barcodes, **violin_kwargs)
# sns.scatterplot(x='dim', y='count_dim', hue='type', data=barcodes)

### Simplex tightness

In [None]:
# For every bar, average the 'weight' of the edges that exist between the nodes
# of its death simplex, looked up in the graph the bar came from. Ordered pairs
# are queried in both directions; pairs with no edge are ignored.
mean_weights = []
rows = zip(barcodes['death simplex'], barcodes['topic'],
           barcodes['type'], barcodes['null'])
# Iterate the columns directly instead of three barcodes.iloc[i] lookups per row.
for death_simplex, topic, network_type, null in rows:
    if network_type=='real':
        graph = networks[topic].graph
    elif network_type=='null_targets':
        graph = null_targets[topic][int(null)].graph
    else:
        # Previously an unknown type silently reused the edges from the prior row.
        raise ValueError('unknown network type: {!r}'.format(network_type))
    pairs = [(n1,n2) for n2 in death_simplex
                     for n1 in death_simplex if n1!=n2]
    found = (graph.get_edge_data(n1,n2) for n1,n2 in pairs)
    weights = [e['weight'] for e in found if e]
    # Test the filtered list: np.mean([]) is NaN (with a RuntimeWarning), which
    # happened whenever pairs existed but none of them was an edge.
    mean_weights.append(np.mean(weights) if weights else 0)

In [None]:
# Attach the mean weights as a new column and display the updated table.
barcodes['mean weights'] = mean_weights
barcodes

In [None]:
# Lifetime vs. mean simplex edge weight over all finite bars (real and null).
plt.figure(figsize=(10,10))
data = barcodes.copy()
# data.loc[data.lifetime==np.inf,'lifetime'] = max(data[data.lifetime!=np.inf].lifetime.values)
data = data[data.lifetime!=np.inf]
# regplot silently drops NaNs but linregress does not (a single NaN makes r and
# p NaN). Drop them up front so the title matches the fit that is drawn.
data = data.dropna(subset=['mean weights', 'lifetime'])
ax = sns.regplot(x='mean weights', y='lifetime',
                 data=data, marker='.')
a, b, r, p, s = stats.linregress(data['mean weights'], data['lifetime'])
plt.title('r={:.4f}, p={:.4f}'.format(r, p));

In [None]:
# Same regression as above, one figure per topic.
for topic in topics:
    plt.figure(figsize=(6,6))
    data = barcodes[barcodes.topic==topic].copy()
#     data.loc[data.lifetime==np.inf,'lifetime'] = max(data[data.lifetime!=np.inf].lifetime.values)
    data = data[data.lifetime!=np.inf]
    # regplot drops NaNs but linregress does not; keep the two consistent.
    data = data.dropna(subset=['mean weights', 'lifetime'])
    # linregress raises when there are fewer than two distinct x values; in
    # that case plot the points without a fit instead of aborting the loop.
    can_fit = data['mean weights'].nunique() >= 2
    sns.regplot(x='mean weights', y='lifetime', data=data,
                marker='+', fit_reg=can_fit)
    if can_fit:
        a, b, r, p, s = stats.linregress(data['mean weights'], data['lifetime'])
        title = '{}\nr={:.4f}, p={:.4f}'.format(topic, r, p)
    else:
        title = '{}\n(too few distinct points to fit)'.format(topic)
    plt.title(title)

### Node participation in birth & deaths

### Identify important nodes

### Cavity volume