In [1]:
import numpy as np
import pandas as pd
import ripser
from persim.persistent_entropy import *
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

from sklearn import datasets

In [2]:
iris = datasets.load_iris()
X = pd.DataFrame(iris.data, columns = iris.feature_names)
y = iris['target']

In [3]:
entropy_feature = []
for i in range(X.shape[0]):

    data_remove = X.drop(index=i)
    dgm = ripser.ripser(data_remove)['dgms'][0]
    p_entropy = persistent_entropy(dgm)
    entropy_feature.append(p_entropy)

In [4]:
X['topo_feature'] = [x.tolist()[0] if isinstance(x, np.ndarray) else x for x in entropy_feature]
X.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),topo_feature
0,5.1,3.5,1.4,0.2,4.87214
1,4.9,3.0,1.4,0.2,4.871527
2,4.7,3.2,1.3,0.2,4.870066
3,4.6,3.1,1.5,0.2,4.870673
4,5.0,3.6,1.4,0.2,4.871399


In [5]:
X_final = X[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.5, random_state=42)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## DecisionTree & Regression

In [7]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train_scaled, y_train)
dtree.score(X_test_scaled, y_test)

0.9333333333333333

In [8]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled,y_train)

training_prediction = log_reg.predict(X_train_scaled)
test_prediction = log_reg.predict(X_test_scaled)

print("Precision, Recall, Confusion matrix, in training\n")
print(metrics.confusion_matrix(y_train, training_prediction))
print(metrics.accuracy_score(y_train, training_prediction))

Precision, Recall, Confusion matrix, in training

[[21  0  0]
 [ 0 24  3]
 [ 0  2 25]]
0.9333333333333333


### Adding Topology feature

In [9]:
X_final = X[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'topo_feature']].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.5, random_state=42)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
dtree = DecisionTreeClassifier()
dtree.fit(X_train_scaled, y_train)
dtree.score(X_test_scaled, y_test)

0.92

In [12]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled,y_train)

training_prediction = log_reg.predict(X_train_scaled)
test_prediction = log_reg.predict(X_test_scaled)

print("Precision, Recall, Confusion matrix, in training\n")
print(metrics.confusion_matrix(y_train, training_prediction))
print(metrics.accuracy_score(y_train, training_prediction))

Precision, Recall, Confusion matrix, in training

[[21  0  0]
 [ 0 25  2]
 [ 0  1 26]]
0.96
