# Struggling with the computational complexity

In [1]:
import numpy as np
import pandas as pd
import ripser
import time
import ripserplusplus as rpp_py
from persim.persistent_entropy import *
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.neighbors import NearestNeighbors
import subprocess as sp
import gc

In [2]:
def get_gpu_memory():
    """Query ``nvidia-smi`` for the free memory of every visible GPU.

    Prints a header line followed by the raw report row of the first GPU
    (e.g. ``7061 MiB``), or a short notice when no GPU row is reported.

    Returns
    -------
    list[int]
        Free memory per GPU, in the unit reported by ``nvidia-smi`` (MiB),
        in the order the rows are listed. Empty when no GPU row is reported.

    Raises
    ------
    FileNotFoundError
        If the ``nvidia-smi`` executable cannot be found.
    subprocess.CalledProcessError
        If ``nvidia-smi`` exits with a non-zero status.
    """
    command = ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
    raw_report = sp.check_output(command).decode('ascii')

    # splitlines() copes with both "\n" and "\r\n" and does not depend on a
    # trailing newline being present; blank lines are discarded.
    report_lines = [line.strip() for line in raw_report.splitlines() if line.strip()]

    # The first line is the CSV header ("memory.free [MiB]"); the rest are GPUs.
    memory_free_info = report_lines[1:]
    memory_free_values = [int(row.split()[0]) for row in memory_free_info]

    print("free GPU memory (MiB):")
    if memory_free_info:
        print(memory_free_info[0])
    else:
        # Avoid an IndexError when the report contains only the header.
        print("no GPU reported by nvidia-smi")
    return memory_free_values

In [3]:
import ssl

# Workaround for certificate errors while downloading the dataset: HTTPS
# certificate verification is switched off, but ONLY for the duration of the
# download. The original default is restored afterwards so that every other
# HTTPS request made from this kernel is still verified.
_default_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context
try:
    housing = fetch_california_housing()
finally:
    ssl._create_default_https_context = _default_https_context

# Features as a labelled DataFrame, target as the array shipped with the dataset.
X_all = pd.DataFrame(housing.data, columns = housing.feature_names)
y_all = housing['target']

In [4]:
# Work on the first 3000 rows only, applying the same slice to the features
# and to the target so the two stay aligned.
_first_rows = slice(None, 3000)
X_all = X_all.iloc[_first_rows]
y_all = y_all[_first_rows]

In [5]:
# Standardise every feature column and keep the original row index and
# column names on the result.
# NOTE(review): the scaler is fit on all rows before the train/test split
# further down, so the held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_all),
    index=X_all.index,
    columns=X_all.columns,
)

X_scaled

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.642119,0.828791,0.281990,-0.123927,-0.997996,-0.331205,0.202650,-0.604776
1,2.629035,-0.649917,0.111970,-0.179877,1.296598,-0.881856,0.189239,-0.598947
2,2.055091,1.642080,0.579190,-0.070446,-0.805952,-0.026418,0.182533,-0.610606
3,1.167623,1.642080,0.016067,-0.070863,-0.737523,-0.340607,0.182533,-0.616435
4,0.179770,1.642080,0.121933,-0.062220,-0.729797,-0.793367,0.182533,-0.616435
...,...,...,...,...,...,...,...,...
2995,-0.020066,-1.389271,-0.153785,-0.096757,2.537157,-0.348624,-1.513983,1.249001
2996,0.371909,-1.389271,-0.040401,-0.196787,0.045001,0.545841,-1.520689,1.254830
2997,-0.528699,-1.315335,-0.124025,-0.096017,0.074801,-0.431730,-1.513983,1.260660
2998,-0.091424,-1.167464,-0.120654,-0.097724,0.656451,0.521781,-1.513983,1.266489


In [6]:
# Baseline: linear regression on the scaled features only, evaluated on a
# fixed 50/50 split (random_state pins the shuffle).
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_all,
    test_size=0.5,
    random_state=42,
)

lr = LinearRegression().fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

# Held-out error metrics.
r2 = r2_score(y_test, predictions_lr)
rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.5072356564753885
R-square: 0.7203783778608764


# Add entropy feature

### reset for the index drop

In [7]:
from sklearn.decomposition import PCA

# Shared 2-component PCA. The leave-one-out loop below calls
# `pca.fit_transform` on every reduced data set, so this object is re-fit
# on each iteration rather than fit once.
pca = PCA(n_components=2)

### Full ripser++ run

In [9]:
# Leave-one-out topological feature: for every row, drop that row, project
# the remaining rows to 2-D with PCA, run ripser++ on the projection and
# store the persistent entropy of the dimension-0 diagram.
entropy_feature = []
n_rows = X_scaled.shape[0]

start = time.time()
for i in range(n_rows):
    row_number = i + 1
    if row_number % 30 == 0:
        # Every 30 rows: report progress, show free GPU memory and run the
        # garbage collector.
        print(f"Processing row {row_number} out of {n_rows}")

        get_gpu_memory()
        gc.collect()

    # drop(index=i) removes by index LABEL; it matches the row position here
    # because X_scaled carries the default 0..n-1 index.
    data_remove = X_scaled.drop(index=i)
    pca_result = pca.fit_transform(data_remove)

    # NOTE(review): only the dimension-0 entry of the result is used below,
    # so "--dim 0" would presumably be enough and skip the H1 work - confirm.
    barcodes = rpp_py.run("--format point-cloud --dim 1 --threshold 1", pca_result)
    dgm = np.array([(birth, death) for birth, death in barcodes[0]])

    entropy_feature.append(persistent_entropy(dgm))

end = time.time()
print("ripser++ total time: ", end-start)

Processing row 30 out of 3000
free GPU memory (MiB):
7061 MiB
Processing row 60 out of 3000
free GPU memory (MiB):
7051 MiB
Processing row 90 out of 3000
free GPU memory (MiB):
7054 MiB
Processing row 120 out of 3000
free GPU memory (MiB):
7026 MiB
Processing row 150 out of 3000
free GPU memory (MiB):
6978 MiB
Processing row 180 out of 3000
free GPU memory (MiB):
7029 MiB
Processing row 210 out of 3000
free GPU memory (MiB):
7025 MiB
Processing row 240 out of 3000
free GPU memory (MiB):
7046 MiB
Processing row 270 out of 3000
free GPU memory (MiB):
7042 MiB
Processing row 300 out of 3000
free GPU memory (MiB):
6983 MiB
Processing row 330 out of 3000
free GPU memory (MiB):
7054 MiB
Processing row 360 out of 3000
free GPU memory (MiB):
7067 MiB
Processing row 390 out of 3000
free GPU memory (MiB):
7086 MiB
Processing row 420 out of 3000
free GPU memory (MiB):
7034 MiB
Processing row 450 out of 3000
free GPU memory (MiB):
7032 MiB
Processing row 480 out of 3000
free GPU memory (MiB):
7058

In [10]:
# Attach the leave-one-out entropy values as a new column. Entries that are
# numpy arrays are unwrapped to their first element; anything else is stored
# as-is.
# NOTE(review): this mutates X_scaled in place, so re-running the
# leave-one-out loop after this cell would feed `topo_feature` into the PCA.
X_scaled['topo_feature'] = [x.tolist()[0] if isinstance(x, np.ndarray) else x for x in entropy_feature]

# Standardise the new column with its OWN scaler. Calling fit_transform on the
# shared `scaler` would overwrite the statistics it learned from the housing
# features, leaving it unusable for (inverse-)transforming them later.
topo_scaler = StandardScaler()
X_scaled['topo_feature'] = topo_scaler.fit_transform(X_scaled[['topo_feature']])
X_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,topo_feature
0,2.642119,0.828791,0.28199,-0.123927,-0.997996,-0.331205,0.20265,-0.604776,-0.207338
1,2.629035,-0.649917,0.11197,-0.179877,1.296598,-0.881856,0.189239,-0.598947,-0.056716
2,2.055091,1.64208,0.57919,-0.070446,-0.805952,-0.026418,0.182533,-0.610606,-0.213089
3,1.167623,1.64208,0.016067,-0.070863,-0.737523,-0.340607,0.182533,-0.616435,-0.127761
4,0.17977,1.64208,0.121933,-0.06222,-0.729797,-0.793367,0.182533,-0.616435,-0.084355


In [11]:
# Same 50/50 split as the baseline (identical random_state), now on the
# feature table that includes `topo_feature`.
_split_parts = train_test_split(X_scaled, y_all, test_size=0.5, random_state=42)

# Give every part a fresh 0..n-1 index; the targets are wrapped in a Series.
X_train, X_test = (frame.reset_index(drop=True) for frame in _split_parts[:2])
y_train, y_test = (
    pd.Series(target).reset_index(drop=True) for target in _split_parts[2:]
)

In [12]:
# Refit the linear model on the features plus `topo_feature` and report the
# same held-out metrics as the baseline for comparison.
lr = LinearRegression().fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

r2 = r2_score(y_test, predictions_lr)
rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.5065670208025156
R-square: 0.7211150838116289


## Testing

In [40]:
# NOTE(review): `from ripser import ripser` rebinds the name `ripser`, which
# the first cell bound to the module, to the function of the same name; the
# function is not called anywhere in the cells shown here.
import numpy as np
from ripser import ripser
from scipy.spatial.distance import pdist, squareform

# Simulated data (no random seed is set, so the points differ on every run)
data = np.random.rand(20000, 2)  # 20000 data points, each with 2 dimensions

# Precompute the full pairwise distance matrix.
# With 20000 points this is a 20000 x 20000 array (about 3.2 GB as float64).
distance_matrix = squareform(pdist(data))

def compute_partial_persistence_diagram(dist_matrix, remove_index, maxdim=0):
    """Return the H0 persistence diagram of the data with one point left out.

    Parameters
    ----------
    dist_matrix : array-like, shape (n, n)
        Full pairwise distance matrix of the point cloud.
    remove_index : int
        Row/column index of the point to leave out (negative indices count
        from the end).
    maxdim : int, default 0
        Highest homology dimension requested from ripser++ via ``--dim``.
        Only the dimension-0 diagram is returned regardless of this value.

    Returns
    -------
    The entry stored under key ``0`` of the ripser++ result, i.e. the H0
    persistence diagram of the reduced distance matrix.

    Raises
    ------
    IndexError
        If ``remove_index`` is out of bounds for ``dist_matrix``.
    """
    # Use the matrix that was passed in, not the notebook-level global.
    dist_matrix = np.asarray(dist_matrix)

    # Drop row and column `remove_index` with a single masked copy instead of
    # two successive np.delete copies of the full matrix.
    keep = np.ones(dist_matrix.shape[0], dtype=bool)
    keep[remove_index] = False
    dist_matrix_remove = dist_matrix[np.ix_(keep, keep)]

    # Honour `maxdim` instead of always requesting dimension 1.
    result = rpp_py.run(f"--format distance --dim {maxdim}", dist_matrix_remove)
    return result[0]  # H0 persistence diagram

def leave_one_out_persistence(data, distance_matrix, maxdim=0):
    """Compute one persistence diagram per data point, leaving that point out.

    Parameters
    ----------
    data : array with a ``shape`` attribute
        Only ``data.shape[0]`` (the number of points) is used.
    distance_matrix : pairwise distance matrix
        Forwarded unchanged to ``compute_partial_persistence_diagram``.
    maxdim : int, default 0
        Forwarded unchanged to ``compute_partial_persistence_diagram``.

    Returns
    -------
    list
        The diagrams, in point order (element ``i`` was computed without
        point ``i``).
    """
    total = data.shape[0]
    diagrams = []
    for position in range(total):
        done = position + 1
        # Progress message on every 50th point.
        if done % 50 == 0:
            print(f"Calculating persistence diagram by leaving out point {done}/{total}...")
        diagrams.append(
            compute_partial_persistence_diagram(distance_matrix, position, maxdim=maxdim)
        )
    return diagrams

In [None]:
# Time the complete leave-one-out run on the synthetic point cloud.
start = time.time()
all_pdms = leave_one_out_persistence(data, distance_matrix, maxdim=0)
end = time.time()
print("ripser++ total time: ", end-start)

# A bare `all_pdms` in the middle of the cell is never rendered (only the last
# expression of a cell is displayed), and dumping every diagram would flood
# the output anyway - show just how many diagrams were produced.
len(all_pdms)