In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
from os.path import expanduser
home = expanduser("~")

import matplotlib
import scipy
import time

import tn2v

In [None]:
def circles(c=1,circle_density=20,mr=None,noise=0.05):
    """Sample noisy 2D points from `c` small circles spaced evenly around a larger circle.

    Parameters
    ----------
    c : int
        Number of small circles.
    circle_density : int
        Number of points sampled on each small circle.
    mr : float or None
        Radius of the large circle on which the small-circle centres sit.
        When None, it defaults to 0.51 + 0.08*(c-4).
    noise : float
        Relative jitter. Each point's angle is perturbed by a normal draw whose
        standard deviation is `noise` times the angular spacing between points,
        and its radius by a normal draw whose standard deviation is `noise`
        times the small-circle radius (0.20).

    Returns
    -------
    pd.DataFrame
        One row per point with columns 0 (x) and 1 (y); rows are grouped
        circle by circle, c*circle_density rows in total.
    """
    ring_radius = 0.20
    centre_radius = mr if mr is not None else 0.51 + .08*(c-4)

    # the endpoint is dropped so that angle 2*pi does not repeat angle 0
    centre_angles = np.linspace(0,2*np.pi,c+1)[:c]

    points = []
    for centre_angle in centre_angles:
        # per circle, the angular jitter is drawn first and the radial jitter second
        theta = np.linspace(0,2*np.pi,circle_density+1)[:circle_density] \
            + np.random.normal(loc=0,scale=2*np.pi/circle_density*noise,size=circle_density)
        rho = np.full(circle_density,ring_radius) \
            + np.random.normal(loc=0,scale=ring_radius*noise,size=circle_density)

        centre_x = centre_radius*np.cos(centre_angle)
        centre_y = centre_radius*np.sin(centre_angle)

        points.extend([centre_x + r_j*np.cos(t_j), centre_y + r_j*np.sin(t_j)]
                      for t_j, r_j in zip(theta, rho))

    return pd.DataFrame(points)

In [None]:
def torus(outer_rad=1.0,inner_rad=0.5,spreader_coeff=0.175,noise=0.05):
    """Sample a noisy point cloud from the surface of a torus in 3D.

    The tube cross-section is cut into rings at evenly spaced inner angles.
    Each ring is sampled at evenly spaced outer angles, with the number of
    samples proportional to that ring's circumference, so that points are
    spaced roughly `spreader_coeff` apart along both directions.

    Parameters
    ----------
    outer_rad : float
        Distance from the torus centre to the centre of the tube.
    inner_rad : float
        Radius of the tube.
    spreader_coeff : float
        Target arc-length spacing between neighbouring samples.
    noise : float
        Each coordinate is perturbed independently by a uniform draw from
        [-noise, noise).

    Returns
    -------
    pd.DataFrame
        One row per point with columns 0, 1, 2 (x, y, z). An empty (0, 3)
        frame is returned when `spreader_coeff` is larger than the tube
        circumference.
    """
    tube_circ = np.pi*2*inner_rad
    inner_angle_count = int(tube_circ/spreader_coeff)

    # endpoint=False: angle 2*pi is the same place as angle 0, so including it
    # would put two samples on top of each other at the seam
    inner_angles = np.linspace(0,2*np.pi,inner_angle_count,endpoint=False)

    rings = []
    for inner_angle in inner_angles:
        # distance of this ring from the torus axis
        ring_rad = outer_rad + np.cos(inner_angle)*inner_rad
        outer_angle_count = int(ring_rad*2*np.pi/spreader_coeff)
        outer_angles = np.linspace(0,2*np.pi,outer_angle_count,endpoint=False)

        rings.append(np.column_stack((
            ring_rad*np.cos(outer_angles),
            ring_rad*np.sin(outer_angles),
            np.full(outer_angle_count,inner_rad*np.sin(inner_angle)),
        )))

    points = np.concatenate(rings) if rings else np.empty((0,3))
    jitter = np.random.uniform(-noise,noise,size=points.shape)

    return pd.DataFrame(points + jitter)

In [None]:
%%capture

# local directory for saving output
main_directory = home+'/tn2v_output/'

if not os.path.isdir(main_directory):
    os.mkdir(main_directory)



# subfolder for this project
project_name = 's1x8_example/'

# True: if there is already a project directory with this project_name, stop
# False: you're running the same experiment again and just want to overwrite
be_careful = False



# desired embedding dimension
embedding_dimension = 2

# every per-epoch schedule below has LEN+1 entries
LEN = 5000

# gradually decreasing the step size allows for more finesse in later epochs
eta_array = np.linspace(0.005,0.0005,LEN+1)

# warm-up: the first 100 entries use a large step size.
# this pairs with the `> 100` test in L1_array below, which switches the topological loss off over the same period (so, n2v only);
# we find this arranges the initially random embedding into a more accurate shape before continuing with the full learning process
for i in range(100):
    eta_array[i] = 1.0

# loss weights
lambda0 = 1 # for node2vec (should be MUCH smaller than lambda1, lambda2)
lambda1 = 192 # for dim1 homology
lambda2 = 0 # for dim2 homology

L0_array = [lambda0]*(LEN+1)
L1_array = [lambda1 if epoch > 100 else 0 for epoch in range(LEN+1)]
L2_array = [lambda2]*(LEN+1)



#### GENERATE POINTCLOUD INPUT DATA

cn = 8 # circle number
cd = 16 # circle density

# input data, should be a pd.DataFrame
data = circles(c=cn,circle_density=cd)



#### SETTING INPUT DATA

# any non-empty subset of the following three pieces of information can be provided;
# if distance_matrix or correlation_matrix are not assigned, they will be generated automatically.
# while you can start from a pointcloud, no pointcloud is required to run this code.

# if you are starting from a graph (as is the original purpose of Node2vec), set correlation_matrix equal to the graph's adjacency matrix

data_pointcloud = data
data_distance_matrix = None
data_correlation_matrix = None



#### NODE2VEC NBHD GENERATION PARAMETERS

r = 1000 # number of walks generated from each vertex in node2vec
l = 1000 # length of walks generated from each vertex in node2vec

# if r*l > size of data set, no nbhd will be generated, and instead the full correlation vector for each node will be used
# (https://arxiv.org/pdf/2309.08241.pdf, Remark 1)

# if generating random walks instead of using the above case, you can select how often to regenerate the nbhds;
# set this to 1 unless you want to experiment with static neighborhoods.
# if nbhd_regen is None, we enter the same case as above (use full correlation vectors)
nbhd_regen = None

L_array = [l]*(LEN+1)
R_array = [r]*(LEN+1)
P_array = [0]*(LEN+1)
Q_array = [1]*(LEN+1)

# one parameter dict per schedule entry
param_array = [{'l':walk_length,
                'r':walk_count,
                'p':p_value,
                'q':q_value}
               for walk_length, walk_count, p_value, q_value
               in zip(L_array, R_array, P_array, Q_array)]



#### FURTHER ADJUSTMENT

# mini-batch size — input is in number of data points, adjust the multiplier to choose by percent
mbs_array = [int(data.shape[0]*0.25)]*(LEN+1)

# if you want to lift the PDs before matching, assign here;
# value is a multiplier on the DIAMETER of the target PD (should only ever need a value in [0.0, 1.0])
lift_array = [0]*(LEN+1)

# if you want to provide an artificial target PD, assign it here (as a pd.DataFrame)
target_pd = None

# if you want to start with a non-random initial state for W1 (the embedding), assign it here
W1_data = None



#### OUTPUT SETTINGS

# at what epochs do you want to save .png files of the ongoing embedding with vector fields denoting gradient movement (in 2D)?
pointcloud_vf_save = np.arange(0,LEN+1,100)

# at what epochs do you want to save .csv files of the current embedding?
pointcloud_data_save = np.arange(0,LEN+1,100)

# it is highly recommended to print often when running a new experiment for quick feedback on hyperparameter adjustment



# Launch the run unless be_careful is set and the project directory already exists.
if os.path.isdir(main_directory+project_name) and be_careful:

    print('This project directory already exists.')

else:

    # X keeps whatever tn2v.tn2v returns so it can be inspected in later cells
    X = tn2v.tn2v(
        main_directory=main_directory,
        project_name=project_name,
        embedding_dimension=embedding_dimension,
        n2v_param_array=param_array,
        l0_array=L0_array,
        l1_array=L1_array,
        l2_array=L2_array,
        eta_array=eta_array,
        LEN=LEN,
        cpu_gpu='gpu',
        nbhd_regen=nbhd_regen,
        data_pointcloud=data_pointcloud,
        data_distance_matrix=data_distance_matrix,
        data_correlation_matrix=data_correlation_matrix,
        mbs_array=mbs_array,
        lift_array=lift_array,
        target_pd=target_pd,
        # forward the W1_data setting from the configuration above; it was
        # previously hard-coded to None here, so assigning W1_data had no effect
        initial_W1=W1_data,
        pointcloud_data_save=pointcloud_data_save,
        pointcloud_vf_save=pointcloud_vf_save,
        reciprocal_gamma=0.001,
        reciprocal_nu=1.0
    )

In [None]:
%%capture

# local directory for saving output
main_directory = home+'/tn2v_output/'

if not os.path.isdir(main_directory):
    os.mkdir(main_directory)



# subfolder for this project
project_name = 'torus_example/'

# True: if there is already a project directory with this project_name, stop
# False: you're running the same experiment again and just want to overwrite
be_careful = False



# desired embedding dimension
embedding_dimension = 3

# every per-epoch schedule below has LEN+1 entries
LEN = 40000

# gradually decreasing the step size allows for more finesse in later epochs
eta_array = np.linspace(0.0025,0.000125,LEN+1)

# warm-up: the first 100 entries use a large step size.
# this pairs with the `> 100` factor in L1_array / L2_array below, which switches the topological loss off over the same period (so, n2v only);
# we find this arranges the initially random embedding into a more accurate shape before continuing with the full learning process
for i in range(100):
    eta_array[i] = 1.0

# loss weights
lambda0 = 1 # for node2vec
lambda1 = 256 # for dim1 homology
lambda2 = 784 # for dim2 homology

# we ramp up the lambda1, lambda2 values over time (from 1/4 of the full weight to the full weight)
# just to avoid over-aggressive movements in the beginning when eta is larger
L0_array = [lambda0]*(LEN+1)
L1_array = [lambda1*(LEN/4 + 3*epoch/4)/LEN*(epoch > 100) for epoch in range(LEN+1)]
L2_array = [lambda2*(LEN/4 + 3*epoch/4)/LEN*(epoch > 100) for epoch in range(LEN+1)]



#### GENERATE POINTCLOUD INPUT DATA

# input data, should be a pd.DataFrame
data = torus()



#### SETTING INPUT DATA

# any non-empty subset of the following three pieces of information can be provided;
# if distance_matrix or correlation_matrix are not assigned, they will be generated automatically.
# while you can start from a pointcloud, no pointcloud is required to run this code.

# if you are starting from a graph (as is the original purpose of Node2vec), set correlation_matrix equal to the graph's adjacency matrix

data_pointcloud = data
data_distance_matrix = None
data_correlation_matrix = None



#### NODE2VEC NBHD GENERATION PARAMETERS

r = 1000 # number of walks generated from each vertex in node2vec
l = 1000 # length of walks generated from each vertex in node2vec

# if r*l > size of data set, no nbhd will be generated, and instead the full correlation vector for each node will be used
# (https://arxiv.org/pdf/2309.08241.pdf, Remark 1)

# if generating random walks instead of using the above case, you can select how often to regenerate the nbhds;
# set this to 1 unless you want to experiment with static neighborhoods.
# if nbhd_regen is None, we enter the same case as above (use full correlation vectors)
nbhd_regen = None

L_array = [l]*(LEN+1)
R_array = [r]*(LEN+1)
P_array = [0]*(LEN+1)
Q_array = [1]*(LEN+1)

# one parameter dict per schedule entry
param_array = [{'l':walk_length,
                'r':walk_count,
                'p':p_value,
                'q':q_value}
               for walk_length, walk_count, p_value, q_value
               in zip(L_array, R_array, P_array, Q_array)]



#### FURTHER ADJUSTMENT

# mini-batch size — input is in number of data points, adjust the multiplier to choose by percent
mbs_array = [int(data.shape[0]*0.0625)]*(LEN+1)

# if you want to lift the PDs before matching, assign here;
# value is a multiplier on the DIAMETER of the target PD (should only ever need a value in [0.0, 1.0])
lift_array = [0.75]*(LEN+1)

# if you want to provide an artificial target PD, assign it here (as a pd.DataFrame)
target_pd = None

# if you want to start with a non-random initial state for W1 (the embedding), assign it here
W1_data = None



#### OUTPUT SETTINGS

# at what epochs do you want to save .png files of the ongoing embedding with vector fields denoting gradient movement (in 2D)?
pointcloud_vf_save = np.arange(0,LEN+1,100)

# at what epochs do you want to save .csv files of the current embedding?
pointcloud_data_save = np.arange(0,LEN+1,100)

# it is highly recommended to print often when running a new experiment for quick feedback on hyperparameter adjustment



# Launch the run unless be_careful is set and the project directory already exists.
if os.path.isdir(main_directory+project_name) and be_careful:

    print('This project directory already exists.')

else:

    # X keeps whatever tn2v.tn2v returns so it can be inspected in later cells
    X = tn2v.tn2v(
        main_directory=main_directory,
        project_name=project_name,
        embedding_dimension=embedding_dimension,
        n2v_param_array=param_array,
        l0_array=L0_array,
        l1_array=L1_array,
        l2_array=L2_array,
        eta_array=eta_array,
        LEN=LEN,
        cpu_gpu='gpu',
        nbhd_regen=nbhd_regen,
        data_pointcloud=data_pointcloud,
        data_distance_matrix=data_distance_matrix,
        data_correlation_matrix=data_correlation_matrix,
        mbs_array=mbs_array,
        lift_array=lift_array,
        target_pd=target_pd,
        # forward the W1_data setting from the configuration above; it was
        # previously hard-coded to None here, so assigning W1_data had no effect
        initial_W1=W1_data,
        pointcloud_data_save=pointcloud_data_save,
        pointcloud_vf_save=pointcloud_vf_save,
        reciprocal_gamma=0.001,
        reciprocal_nu=0.8 # the torus has a tendency to show extreme saddle distortion in the n2v loss function, likely due to the hyperbolic reciprocal process for making the correlation matrix; we lower the exponent slightly to mitigate this
    )