### install from https://github.com/amish-mishra/cechmate-DR/tree/master

In [1]:
import cechmate as cm
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from persim.persistent_entropy import *
import gudhi.representations
import gudhi as gd 

In [7]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

housing = fetch_california_housing()

X_all = pd.DataFrame(housing.data, columns = housing.feature_names)
y_all = housing['target']

X_all = X_all#.iloc[:1000]
y_all = y_all#[:1000]

In [8]:
scaler = StandardScaler()
scaler.fit(X_all)
X_scaled = pd.DataFrame(scaler.transform(X_all), index= X_all.index, columns= X_all.columns)
X_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_all, test_size=0.5, random_state=43)

lr = LinearRegression()
lr.fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))
r2 = r2_score(y_test, predictions_lr)

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.7195934214967924
R-square: 0.6113250004234638


In [10]:
from joblib import Parallel, delayed
import time
import os
import gc

In [11]:
remove_infinity = lambda alpha_list: [alpha for alpha in alpha_list if alpha[1] != np.inf]
pca = PCA(n_components=2)

def compute_DR(data_remove):
    # print(f"進程 {os.getpid()} 開始計算")
    
    # PCA
    pca_result = pca.fit_transform(data_remove)
    
    # DR complex
    del_rips = cm.DR(maxdim=1)
    filtration = del_rips.build(pca_result)
    dgms_del_rips = del_rips.diagrams(filtration, verbose=False)

    ## entropy
    PE = gd.representations.Entropy()
    pe_normal = PE.fit_transform([dgms_del_rips[0]])
    
    # print(f"進程 {os.getpid()} 完成計算")
    return pe_normal

def compute_Alpha(data_remove):
    # print(f"進程 {os.getpid()} 開始計算")
    
    # PCA
    pca_result = pca.fit_transform(data_remove)
    # Alpha complex
    alpha_complex = gd.AlphaComplex(points = pca_result)
    st_alpha = alpha_complex.create_simplex_tree()
    alpha_filtration = st_alpha.get_filtration()
    alpha_list = list(alpha_filtration)
    # Filter
    filtered_alpha_list = remove_infinity(alpha_list)
    dgm = np.array([[0.0, value] for _, value in filtered_alpha_list])
    dgm_filtered = np.array([bar for bar in dgm if bar[1] - bar[0] != 0])
    ## entropy
    PE = gd.representations.Entropy()
    pe_normal = PE.fit_transform([dgm_filtered])
    
    # print(f"進程 {os.getpid()} 完成計算")
    return pe_normal

In [13]:
data_list = []
for i in range(X_scaled.shape[0]):
    if (i + 1) % 500 == 0:
        print(f"Processing row {i + 1} out of {X_scaled.shape[0]}")
        gc.collect()
        
    data_remove = X_scaled.drop(index=i)
    data_list.append(data_remove)

Processing row 500 out of 1000
Processing row 1000 out of 1000


## Delaunay-Rips

In [None]:
start_time = time.time()

# 使用 joblib 進行平行運算
results = Parallel(n_jobs=-1)(delayed(compute_DR)(data) for data in data_list)

end_time = time.time()
print("平行運算時間: ", end_time - start_time)

## Alpha

In [26]:
start_time = time.time()

# 使用 joblib 進行平行運算
results = Parallel(n_jobs=-1)(delayed(compute_Alpha)(data) for data in data_list)

end_time = time.time()
print("平行運算時間: ", end_time - start_time)

平行運算時間:  3471.6376938819885


In [13]:
import pickle
# with open('Calculated/results.pkl', 'wb') as f:
#     pickle.dump(results, f)

with open('Calculated/results.pkl', 'rb') as file:
    results = pickle.load(file)

In [33]:
# results

In [17]:
X_scaled['topo_feature'] = [x.tolist()[0][0] if isinstance(x, np.ndarray) else x for x in results]
X_scaled['topo_feature'] = scaler.fit_transform(X_scaled[['topo_feature']])
X_scaled.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,topo_feature
0,2.344766,0.982143,0.628559,-0.153758,-0.974429,-0.049597,1.052548,-1.327835,1.166067
1,2.332238,-0.607019,0.327041,-0.263336,0.861439,-0.092512,1.043185,-1.322844,0.69688
2,1.782699,1.856182,1.15562,-0.049016,-0.820777,-0.025843,1.038503,-1.332827,1.591508
3,0.932968,1.856182,0.156966,-0.049833,-0.766028,-0.050329,1.038503,-1.337818,0.266836
4,-0.012881,1.856182,0.344711,-0.032906,-0.759847,-0.085616,1.038503,-1.337818,0.343667


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y_all, test_size=0.5, random_state=42)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = pd.Series(y_train).reset_index(drop=True)
y_test = pd.Series(y_test).reset_index(drop=True)

lr = LinearRegression()
lr.fit(X_train, y_train)
predictions_lr = lr.predict(X_test)

rmse = np.sqrt(mean_squared_error(y_test, predictions_lr))
r2 = r2_score(y_test, predictions_lr)

print('RMSE:', rmse)
print('R-square:', r2)

RMSE: 0.7304755773405381
R-square: 0.5974297708138535
