In [1]:
import time
import gc
import gudhi.representations
import numpy as np
import pandas as pd
import gudhi as gd 
import subprocess as sp
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, mean_squared_error
from persim.persistent_entropy import *

pca = PCA(n_components=2)

In [9]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

housing = fetch_california_housing()

X_all = pd.DataFrame(housing.data, columns = housing.feature_names)
y_all = housing['target']

X_all = X_all.iloc[:3000]
y_all = y_all[:3000]

In [10]:
scaler = StandardScaler()
scaler.fit(X_all)
X_scaled = pd.DataFrame(scaler.transform(X_all), index= X_all.index, columns= X_all.columns)
X_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.642119,0.828791,0.28199,-0.123927,-0.997996,-0.331205,0.20265,-0.604776
1,2.629035,-0.649917,0.11197,-0.179877,1.296598,-0.881856,0.189239,-0.598947
2,2.055091,1.64208,0.57919,-0.070446,-0.805952,-0.026418,0.182533,-0.610606
3,1.167623,1.64208,0.016067,-0.070863,-0.737523,-0.340607,0.182533,-0.616435
4,0.17977,1.64208,0.121933,-0.06222,-0.729797,-0.793367,0.182533,-0.616435


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_all, test_size=0.5, random_state=43)

lr = LinearRegression()
lr.fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))
r2 = r2_score(y_test, predictions_lr)

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.5199604295289464
R-square: 0.7010017152481347


## Add Topo feature

In [12]:
from joblib import Parallel, delayed
import time
import os

In [14]:
remove_infinity = lambda alpha_list: [alpha for alpha in alpha_list if alpha[1] != np.inf]

def compute_persistence(data_remove):
    print(f"進程 {os.getpid()} 開始計算")
    
    # PCA
    pca_result = pca.fit_transform(data_remove)
    # Alpha complex
    alpha_complex = gd.AlphaComplex(points = pca_result)
    st_alpha = alpha_complex.create_simplex_tree()
    alpha_filtration = st_alpha.get_filtration()
    alpha_list = list(alpha_filtration)
    # Filter
    filtered_alpha_list = remove_infinity(alpha_list)
    dgm = np.array([[0.0, value] for _, value in filtered_alpha_list])
    dgm_filtered = np.array([bar for bar in dgm if bar[1] - bar[0] != 0])
    ## entropy
    PE = gd.representations.Entropy()
    pe_normal = PE.fit_transform([dgm_filtered])
    
    print(f"進程 {os.getpid()} 完成計算")
    return pe_normal

In [13]:
data_list = []
for i in range(X_scaled.shape[0]):
    if (i + 1) % 100 == 0:
        print(f"Processing row {i + 1} out of {X_scaled.shape[0]}")
        gc.collect()
        
    data_remove = X_scaled.drop(index=i)
    data_list.append(data_remove)

Processing row 100 out of 3000
Processing row 200 out of 3000
Processing row 300 out of 3000
Processing row 400 out of 3000
Processing row 500 out of 3000
Processing row 600 out of 3000
Processing row 700 out of 3000
Processing row 800 out of 3000
Processing row 900 out of 3000
Processing row 1000 out of 3000
Processing row 1100 out of 3000
Processing row 1200 out of 3000
Processing row 1300 out of 3000
Processing row 1400 out of 3000
Processing row 1500 out of 3000
Processing row 1600 out of 3000
Processing row 1700 out of 3000
Processing row 1800 out of 3000
Processing row 1900 out of 3000
Processing row 2000 out of 3000
Processing row 2100 out of 3000
Processing row 2200 out of 3000
Processing row 2300 out of 3000
Processing row 2400 out of 3000
Processing row 2500 out of 3000
Processing row 2600 out of 3000
Processing row 2700 out of 3000
Processing row 2800 out of 3000
Processing row 2900 out of 3000
Processing row 3000 out of 3000


In [15]:
start_time = time.time()

# 使用 joblib 進行平行運算
results = Parallel(n_jobs=-1)(delayed(compute_persistence)(data) for data in data_list)

end_time = time.time()
print("平行運算時間: ", end_time - start_time)

平行運算時間:  63.30636692047119


In [16]:
results[0]

array([[2.19216821]])

In [18]:
# Remove Infinity
remove_infinity = lambda alpha_list: [alpha for alpha in alpha_list if alpha[1] != np.inf]

entropy_feature = []

start = time.time()
for i in range(X_scaled.shape[0]):
    if (i + 1) % 100 == 0:
        print(f"Processing row {i + 1} out of {X_scaled.shape[0]}")
        gc.collect()
        
    data_remove = X_scaled.drop(index=i)
    
    # PCA
    pca_result = pca.fit_transform(data_remove)
    
    # Alpha complex
    alpha_complex = gd.AlphaComplex(points = pca_result)
    st_alpha = alpha_complex.create_simplex_tree()
    alpha_filtration = st_alpha.get_filtration()
    alpha_list = list(alpha_filtration)
    
    # Filter
    filtered_alpha_list = remove_infinity(alpha_list)
    dgm = np.array([[0.0, value] for _, value in filtered_alpha_list])
    dgm_filtered = np.array([bar for bar in dgm if bar[1] - bar[0] != 0])
    
    ## entropy
    PE = gd.representations.Entropy()
    pe_normal = PE.fit_transform([dgm_filtered])
    
    entropy_feature.append(pe_normal)
    
end = time.time()
print("ripser++ total time: ", end-start)

KeyboardInterrupt: 

In [11]:
entropy_feature[:5]

[array([[0.69681062]]),
 array([[0.9978935]]),
 array([[0.69627529]]),
 array([[0.71394814]]),
 array([[0.69763886]])]

In [12]:
results[:5]

[array([[0.69681062]]),
 array([[0.9978935]]),
 array([[0.69627529]]),
 array([[0.71394814]]),
 array([[0.69763886]])]

In [20]:
# X_scaled['topo_feature'] = [x.tolist()[0][0] if isinstance(x, np.ndarray) else x for x in entropy_feature]
# X_scaled['topo_feature'] = scaler.fit_transform(X_scaled[['topo_feature']])
# X_scaled.head()

In [54]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_all, test_size=0.5, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

lr = LinearRegression()
lr.fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))
r2 = r2_score(y_test, predictions_lr)

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.5545262813691066
R-square: 0.6644448697913721
