### Persistent homology examples

* Ripser [paper](https://www.theoj.org/joss-papers/joss.00925/10.21105.joss.00925.pdf) [code](https://github.com/scikit-tda/ripser.py) (fast)
* Dionysus 2 [code](https://mrzv.org/software/dionysus2/) (representative examples)
* Nico's [code](https://github.com/nhchristianson/Math-text-semantic-networks)
* Ann's [code](https://github.com/asizemore/PH_tutorial/blob/master/Tutorial_day1.ipynb)

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Load graphs

In [22]:
import os

# Topics whose saved graphs are loaded by the cells below.
topics = ['biochemistry']

# Directory containing one `<topic>.gexf` file per topic. Defaults to the
# original author's local folder; set the WIKI_GRAPH_DIR environment variable
# to point somewhere else without editing the notebook.
# os.path.join(..., '') guarantees a trailing separator, which the loading
# cell relies on when it builds `path_saved + topic + '.gexf'`.
path_saved = os.path.join(
    os.environ.get(
        'WIKI_GRAPH_DIR',
        '/Users/harangju/Box Sync/Research/my papers/wikipedia paper/data/graphs/full/',
    ),
    '',
)

In [23]:
import wiki

# Build one wiki.Net per topic and load its saved .gexf graph.
# This is deliberately a plain for-loop rather than a comprehension: later
# cells read the loop variable `topic` after this cell has finished.
networks = {}
for topic in topics:
    net = networks[topic] = wiki.Net()
    net.load_graph('{}{}.gexf'.format(path_saved, topic))
networks

{'biochemistry': <wiki.Net at 0x1173d3950>}

### Try persistent homology

In [24]:
# Display the attribute dict of every node in the current topic's graph.
node_view = networks[topic].graph.nodes
[node_view[name] for name in node_view]

[{'year': -360, 'label': 'Chemical element'},
 {'year': -300, 'label': 'Chemistry'},
 {'year': 1916, 'label': 'Chemical compound'},
 {'year': 1800, 'label': 'Atom'},
 {'year': 1700, 'label': 'Molecule'},
 {'year': 1500, 'label': 'Chemical reaction'},
 {'year': 1735, 'label': 'Biology'},
 {'year': 1600, 'label': 'Pharmacology'},
 {'year': 1100, 'label': 'Chemical bond'},
 {'year': 2030, 'label': 'Ionic bond'},
 {'year': 1700, 'label': 'Electron'},
 {'year': 1912, 'label': 'Hydrogen bond'},
 {'year': 2030, 'label': 'Van der waals force'},
 {'year': 1900, 'label': 'Prokaryote'},
 {'year': 1905, 'label': 'Organism'},
 {'year': 1600, 'label': 'Life'},
 {'year': 698, 'label': 'Taxonomy (biology)'},
 {'year': 2030, 'label': 'Fungi'},
 {'year': 1676, 'label': 'Bacteria'},
 {'year': 1905, 'label': 'Archaea'},
 {'year': 1800, 'label': 'Developmental biology'},
 {'year': 1300, 'label': 'Organ (anatomy)'},
 {'year': 1905, 'label': 'Eukaryote'},
 {'year': 1804, 'label': 'Cell nucleus'},
 {'year': 1

In [28]:
# Print one line per persistence pair of the current topic:
#   paired bar:   dim, (birth, death), birth simplex --- death nodes in death simplex
#   unpaired bar: dim, (birth, _), birth simplex
# `f` and `m` keep their names because later cells reuse them.
f = networks[topic].filtration
m = networks[topic].persistence
node_names = networks[topic].nodes
node_attrs = networks[topic].graph.nodes
for i in range(len(m)):
    partner = m.pair(i)
    if partner < i:
        continue      # skip negative simplices
    dim = f[i].dimension()
    birth = int(f[i].data)
    simplex = [node_names[s] for s in f[i]]
    if partner == m.unpaired:
        # No partner: the bar is still alive at the end of the filtration.
        print('dim={}, ({}, _) \t{}--'.format(dim, birth, simplex))
        continue
    death = int(f[partner].data)
    death_simplex = [node_names[s] for s in f[partner]]
    # Nodes of the death simplex whose 'year' equals the death value.
    death_nodes = [n for n in death_simplex
                   if node_attrs[n]['year'] == death]
    print('dim={}, ({}, {}) \t{}---{} in {}'.format(dim, birth, death, simplex, 
                                                    death_nodes, death_simplex))

dim=0, (-2500, _) 	['Carbon']--
dim=0, (-2500, 800) 	['Nutrition']---['Ethanol'] in ['Ethanol', 'Ammonia']
dim=0, (-2000, -2000) 	['Alcohol']---['Alcohol'] in ['Carbon', 'Alcohol']
dim=0, (-2000, 1600) 	['Medicine']---['Life'] in ['Life', 'Microbiology']
dim=0, (-2000, -2000) 	['Pharmacist']---['Medicine', 'Pharmacist'] in ['Medicine', 'Pharmacist']
dim=0, (-2000, 1669) 	['Pesticide']---['Phosphorus'] in ['Phosphorus', 'Pesticide']
dim=0, (-700, 800) 	['Sugar']---['Ethanol'] in ['Sugar', 'Ethanol']
dim=0, (-500, -360) 	['Sulfur']---['Chemical element'] in ['Chemical element', 'Sulfur']
dim=0, (-500, 1599) 	['Microbiology']---['Virology'] in ['Medicine', 'Virology']
dim=0, (-500, 1600) 	['Phylogenetics']---['Life'] in ['Life', 'Physiology']
dim=0, (-430, 1600) 	['Immune system']---['Evolution'] in ['Immune system', 'Evolution']
dim=0, (-430, -430) 	['T cell']---['Immune system', 'T cell'] in ['Immune system', 'T cell']
dim=0, (-400, -300) 	['Physiology']---['Chemistry'] in ['Chemistry',

dim=1, (1905, 1953) 	['Archaea', 'Hydrogen']---['Photosynthesis', 'Chemiosmosis'] in ['Adenosine triphosphate', 'Photosynthesis', 'Chemiosmosis']
dim=1, (1905, 1905) 	['Archaea', 'Cell membrane']---['Archaea'] in ['Prokaryote', 'Archaea', 'Cell membrane']
dim=1, (1905, 1905) 	['Archaea', 'Ammonia']---['Archaea'] in ['Archaea', 'Hydrogen', 'Ammonia']
dim=1, (1905, 1905) 	['Archaea', 'Biotechnology']---['Archaea'] in ['Bacteria', 'Archaea', 'Biotechnology']
dim=1, (1905, 1905) 	['Archaea', 'Microbiology']---['Archaea'] in ['Prokaryote', 'Archaea', 'Microbiology']
dim=1, (1905, 1905) 	['Archaea', 'Genetics']---['Organism', 'Archaea'] in ['Organism', 'Archaea', 'Genetics']
dim=1, (1905, 1905) 	['Archaea', 'Genetic recombination']---['Archaea', 'Genetic recombination'] in ['Bacteria', 'Archaea', 'Genetic recombination']
dim=1, (1905, 1905) 	['Archaea', 'Flagellum']---['Archaea', 'Flagellum'] in ['Bacteria', 'Archaea', 'Flagellum']
dim=1, (1905, 1905) 	['Developmental biology', 'Phenotype']-

dim=1, (1916, 1916) 	['Water', 'Solvent']---['Water'] in ['Hydrophobe', 'Water', 'Solvent']
dim=1, (1916, _) 	['Water', 'Lactic acid']--
dim=1, (1916, 1916) 	['Water', 'Osmosis']---['Water'] in ['Solution', 'Water', 'Osmosis']
dim=1, (1916, 1916) 	['Water', 'Colloid']---['Water', 'Colloid'] in ['Solution', 'Water', 'Colloid']
dim=1, (1916, 1916) 	['Water', 'Steroid 17alpha-monooxygenase']---['Water', 'Steroid 17alpha-monooxygenase'] in ['Oxygen', 'Water', 'Steroid 17alpha-monooxygenase']
dim=1, (1916, 1916) 	['Chirality (chemistry)', 'Inorganic chemistry']---['Inorganic chemistry'] in ['Organic chemistry', 'Chirality (chemistry)', 'Inorganic chemistry']
dim=1, (1916, _) 	['Glycoprotein', 'Cytosol']--
dim=1, (1916, _) 	['Glycoprotein', 'Erythropoietin']--
dim=1, (1916, 1916) 	['Glycoprotein', 'Endoplasmic reticulum']---['Glycoprotein'] in ['Glycoprotein', 'Cytosol', 'Endoplasmic reticulum']
dim=1, (1916, _) 	['Glycoprotein', 'Selectin']--
dim=1, (1916, _) 	['Glycoprotein', 'Vitronectin'

dim=1, (1930, 1930) 	['Genome', 'Genetics']---['Genome'] in ['Gene', 'Genome', 'Genetics']
dim=1, (1930, 1930) 	['Genome', 'Genomics']---['Genome'] in ['Genome', 'Genetics', 'Genomics']
dim=1, (1930, 1950) 	['Genome', 'Proteomics']---['Bioinformatics'] in ['Bioinformatics', 'Genomics', 'Proteomics']
dim=1, (1930, 1930) 	['Genome', 'Locus (genetics)']---['Genome', 'Locus (genetics)'] in ['Gene', 'Genome', 'Locus (genetics)']
dim=1, (1930, 2030) 	['Genome', 'Molecular evolution']---['Dna'] in ['Dna', 'Genome', 'Molecular evolution']
dim=1, (1930, 1930) 	['Genome', "Muller's ratchet"]---['Genome', "Muller's ratchet"] in ['Mutation', 'Genome', "Muller's ratchet"]
dim=1, (1930, 1930) 	['Chromosome', 'Bcl-2']---['Bcl-2'] in ['Apoptosis', 'Chromosome', 'Bcl-2']
dim=1, (1930, 1930) 	['Chromosome', 'Locus (genetics)']---['Genome', 'Locus (genetics)'] in ['Genome', 'Chromosome', 'Locus (genetics)']
dim=1, (1930, 1930) 	['Genetics', 'Locus (genetics)']---['Locus (genetics)'] in ['Gene', 'Genetics

dim=3, (1974, 1974) 	['Organelle', 'Mitochondrion', 'Chloroplast', 'Endomembrane system']---['Endomembrane system'] in ['Eukaryote', 'Organelle', 'Mitochondrion', 'Chloroplast', 'Endomembrane system']
dim=0, (1976, 1976) 	['Polymerase chain reaction']---['Polymerase chain reaction'] in ['Gene', 'Polymerase chain reaction']
dim=0, (1976, 1976) 	['Nucleic acid hybridization']---['Polymerase chain reaction', 'Nucleic acid hybridization'] in ['Polymerase chain reaction', 'Nucleic acid hybridization']
dim=1, (1976, 2030) 	['Enzyme', 'Polymerase chain reaction']---['Dna'] in ['Dna', 'Molecular biology', 'Polymerase chain reaction']
dim=1, (1976, 1976) 	['Nucleotide', 'Polymerase chain reaction']---['Polymerase chain reaction'] in ['Nucleotide', 'Polymerase chain reaction', 'Primer (molecular biology)']
dim=1, (1976, 1976) 	['Molecular biology', 'Polymerase chain reaction']---['Polymerase chain reaction'] in ['Molecular biology', 'Polymerase chain reaction', 'Primer (molecular biology)']
dim=

dim=2, (2017, 2017) 	['Peptide', 'G protein-coupled receptor', 'Hormone']---['Hormone'] in ['Protein', 'Peptide', 'G protein-coupled receptor', 'Hormone']
dim=2, (2017, 2017) 	['Peptide', 'Receptor (biochemistry)', 'Hormone']---['Hormone'] in ['Protein', 'Peptide', 'Receptor (biochemistry)', 'Hormone']
dim=2, (2017, 2017) 	['Peptide', 'Alanine', 'N-terminus']---['N-terminus'] in ['Protein', 'Peptide', 'Alanine', 'N-terminus']
dim=2, (2017, 2017) 	['Peptide', 'C-terminus', 'N-terminus']---['N-terminus'] in ['Protein', 'Peptide', 'C-terminus', 'N-terminus']
dim=2, (2017, 2017) 	['Oligopeptide', 'Alanine', 'N-terminus']---['N-terminus'] in ['Protein', 'Oligopeptide', 'Alanine', 'N-terminus']
dim=2, (2017, 2017) 	['Oligopeptide', 'Glycine', 'N-terminus']---['N-terminus'] in ['Peptide', 'Oligopeptide', 'Glycine', 'N-terminus']
dim=2, (2017, 2017) 	['Oligopeptide', 'Valine', 'N-terminus']---['N-terminus'] in ['Peptide', 'Oligopeptide', 'Valine', 'N-terminus']
dim=2, (2017, 2017) 	['Oligopept

dim=2, (2030, _) 	['Dna', 'Nucleic acid', 'Nitrogen']--
dim=2, (2030, 2030) 	['Dna', 'Genetic engineering', 'Genome']---['Dna'] in ['Gene', 'Dna', 'Genetic engineering', 'Genome']
dim=2, (2030, _) 	['Dna', 'Carbohydrate', 'Biopolymer']--
dim=2, (2030, _) 	['Dna', 'Ribose', 'Cyclic nucleotide']--
dim=2, (2030, 2030) 	['Dna', 'Biopolymer', 'Polymer']---['Dna'] in ['Dna', 'Nucleic acid', 'Biopolymer', 'Polymer']
dim=2, (2030, 2030) 	['Dna', 'Polymer', 'Structural motif']---['Dna'] in ['Dna', 'Nucleic acid', 'Polymer', 'Structural motif']
dim=2, (2030, 2030) 	['Dna', 'Base pair', 'Translation (biology)']---['Dna'] in ['Dna', 'Genetic code', 'Base pair', 'Translation (biology)']
dim=2, (2030, 2030) 	['Dna', 'Base pair', 'Genome']---['Dna'] in ['Gene', 'Dna', 'Base pair', 'Genome']
dim=2, (2030, 2030) 	['Dna', 'Base pair', 'Chromosome']---['Dna'] in ['Dna', 'Base pair', 'Genome', 'Chromosome']
dim=2, (2030, 2030) 	['Dna', 'Base pair', 'Biodiversity']---['Dna'] in ['Gene', 'Dna', 'Base pair',

#### Plotting

In [None]:
import math

def betti_curves(diagrams):
    """
    Count, for every homology dimension, how many bars are alive at each
    integer time step.

    Parameters
    ----------
    diagrams: sequence of persistence diagrams
        One diagram per homology dimension (position ``k`` holds dimension
        ``k``), e.g. ``dgms`` from ``d.init_diagrams(m, f)``. Each diagram is
        an iterable of points exposing ``.birth`` and ``.death``;
        ``math.inf`` marks a bar that never dies.

    Returns
    -------
    curves: numpy.ndarray, shape (len(diagrams), int(max_time))
        ``curves[k, t]`` is the number of dimension-``k`` bars with
        ``birth <= t < death``. ``max_time`` is the largest finite
        birth/death value, plus one when at least one value is infinite so
        that infinite bars are still counted in the last column. When there
        are no points at all the array has zero columns.

    Notes
    -----
    Birth/death values are truncated with ``int`` and used directly as
    column indices, so they are expected to be non-negative.
    """
    # Materialise once: the input is traversed several times and its length
    # fixes the number of rows.
    diagrams = list(diagrams)
    times = sorted({t for dgm in diagrams for pt in dgm
                    for t in (pt.birth, pt.death)})
    if not times:
        return np.zeros([len(diagrams), 0])
    if times[-1] == math.inf:
        # Reserve one extra column past the last finite event so that bars
        # that never die remain visible at the end of the curve.
        finite_times = times[:-1]
        max_time = finite_times[-1] + 1 if finite_times else 0
    else:
        max_time = times[-1]
    curves = np.zeros([len(diagrams), int(max_time)])
    for betti, dgm in enumerate(diagrams):
        for pt in dgm:
            # Infinite endpoints are clamped to the end of the time axis.
            start = max_time if pt.birth == math.inf else pt.birth
            stop = max_time if pt.death == math.inf else pt.death
            curves[betti, int(start):int(stop)] += 1
    return curves

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# Apply seaborn's global plot settings with color_codes enabled (per the
# seaborn docs this remaps matplotlib's shorthand colour codes such as 'b'
# and 'g' to the seaborn palette).
sns.set(color_codes=True)

def plot_barcode(barcode):
    """
    Draw a persistence barcode: one horizontal segment per bar.

    Parameters
    ----------
    barcode: [(int dim, int birth, int death)]
        Bars to draw, stacked bottom to top in the order given.
        ``death`` may be ``np.inf`` for a bar that never dies.

    Returns
    -------
    ax: matplotlib.axes.Axes
        Axes holding the barcode, so callers can restyle or save it.
    """
    fig, ax = plt.subplots()
    # Infinite bars need a finite right end; stop them one unit past the
    # largest finite birth/death value so they overhang every finite bar.
    finite = [t for _, birth, death in barcode
              for t in (birth, death) if t != np.inf]
    right_edge = max(finite) + 1 if finite else 1
    for row, (dim, birth, death) in enumerate(barcode):
        end = right_edge if death == np.inf else death
        # 'C<n>' picks the n-th colour of the active matplotlib colour
        # cycle, so every homology dimension gets its own colour.
        ax.plot([birth, end], [row, row],
                color='C{}'.format(int(dim) % 10))
    ax.set_xlabel('time')
    ax.set_ylabel('bar')
    ax.set_title('Barcode')
    return ax

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Global seaborn styling for the figures below: 'whitegrid' style with
# font sizes scaled by a factor of 2.
sns.set(style='whitegrid', font_scale=2)

In [None]:
# Collect one row per persistence bar:
#   [dim, birth, death, [birth, death], birth simplex, death nodes, death simplex]
# NOTE(review): `d`, `f`, `nodes` and `graph` are not defined in this cell —
# presumably `d` is the dionysus module and the rest come from
# `networks[topic]`; confirm they exist before running on a fresh kernel.
m = d.homology_persistence(f)
barcode = []
for i in range(len(m)):
    partner = m.pair(i)
    if partner < i:
        continue  # skip negative simplices
    dim = f[i].dimension()
    birth = int(f[i].data)
    simplex = [nodes[s] for s in f[i]]
    if partner == m.unpaired:
        # Unpaired bar: infinite death and no death simplex.
        barcode.append([dim, birth, np.inf, [birth, np.inf],
                        simplex, [], []])
        continue
    death = int(f[partner].data)
    death_simplex = [nodes[s] for s in f[partner]]
    # Nodes of the death simplex whose 'year' equals the death value.
    death_nodes = [n for n in death_simplex
                   if graph.nodes[n]['year'] == death]
    barcode.append([dim, birth, death, [birth, death],
                    simplex, death_nodes, death_simplex])

In [None]:
# Tabulate the bars and order them by dimension, then birth.
# Column labels are kept exactly as before (note 'death_simplex' uses an
# underscore while the other multi-word labels use a space).
bar_data = (
    pd.DataFrame(
        barcode,
        columns=[
            'dim',
            'birth',
            'death',
            'lifetime',       # holds the [birth, death] pair
            'birth simplex',
            'death nodes',
            'death_simplex',
        ],
    )
    .sort_values(by=['dim', 'birth'])
)
bar_data

In [None]:
import matplotlib._color_data as mcd
# Colour values looked up by XKCD colour name; the barcode plotting cell
# indexes this list by homology dimension.
xkcd = mcd.XKCD_COLORS
colors = [
    xkcd['xkcd:' + name]
    for name in (
        'emerald green',
        'tealish',
        'peacock blue',
        'grey',
        'brown',
        'red',
        'yellow',
    )
]
colors

In [None]:
# Draw each bar as a horizontal line coloured by dimension, with a short
# red tick over the last unit before its death.
# NOTE(review): the y position is the DataFrame index label `i`; bar_data
# was sorted without resetting its index, so bars are stacked in original
# `barcode` order rather than sorted order — confirm this is intended.
fig = plt.figure(figsize=(15,10))
death_tick_color = mcd.XKCD_COLORS['xkcd:red']
for i, row in bar_data.iterrows():
    birth, death = row['birth'], row['death']
    # Only bars that die at or before 1000 are drawn. Infinite deaths are
    # also > 1000, so the 2020 fallback below is currently never reached.
    # NOTE(review): 1000 and 2020 look like exploratory cut-offs — confirm.
    if death > 1000:
        continue
    right_end = 2020 if death == np.inf else death
    x = [birth, right_end]
    plt.plot(x, i*np.ones(len(x)), colors[row['dim']])
    if death and death != np.inf:
        plt.plot([death-1, death], [i, i], death_tick_color)

In [None]:
# Compute the Betti curves of the filtration and plot one line per dimension.
# NOTE(review): `d`, `f` and `years` are not defined in this cell; they must
# already exist in the kernel.
m = d.homology_persistence(f)
dgms = d.init_diagrams(m, f)
betti = betti_curves(dgms)

last_year = max(years)
# NOTE(review): range(last_year + 1, last_year + 1) is always empty, so the
# index is just a copy of `years`; this looks like unfinished padding of the
# index — confirm the intended upper bound.
year_index = years + list(range(last_year + 1, last_year + 1))
curve_labels = [r'$\beta_{}$'.format(k) for k, _ in enumerate(dgms)]
data = pd.DataFrame(betti.transpose(),
                    index=year_index,
                    columns=curve_labels)

sns.lineplot(data=data, linewidth=2)
# Dotted vertical marker at the last year present in `years`.
plt.axvline(x=last_year, color='xkcd:grey', alpha=0.5, linestyle=':')
plt.title('Betti curves')
plt.xlabel('year')
plt.ylabel('Number of live cycles')