# Heart Statlog dataset with LionForests V2

In [1]:
import sys
# IPython shell capture of the current working directory; `cpath` is not used by
# any other cell visible in this notebook.
cpath = !pwd
# NOTE(review): hard-coded, machine-specific absolute path -- presumably the folder
# holding the `lionforests` and `datasets` packages imported in the next cell.
# Adjust it to your own checkout before running.
sys.path.append('C:\\Users\\iamollas\\Downloads\\LionForests Journal\\')

In [2]:
# Project-local modules (found through the sys.path entry added in the previous cell).
from lionforests import LionForests
from datasets.dataset import Dataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
# Silence numpy's "invalid value" floating-point warnings for the whole kernel session.
np.seterr(invalid='ignore')
import warnings
# Suppress every Python warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# `time` is used by the benchmark cells further down.
import time
import urllib

Firstly, we load the dataset and we set the feature and class names

In [3]:
# Load the Heart (Statlog) data through the project's Dataset helper.
heart = Dataset()
# X: instances, y: labels, plus the feature and class names that are passed to
# LionForests in the cells below.
X, y, feature_names, class_names = heart.load_heart()

The dataset contains 270 instances:

In [4]:
# Number of instances in the dataset.
len(X)

270

Then we initialise and train LionForests

In [5]:
# Hyper-parameter grid with a single value per setting, handed to lf.fit() below.
parameters = [dict(
    max_depth=[5],
    max_features=['sqrt'],
    bootstrap=[False],
    min_samples_leaf=[5],
    n_estimators=[500],
)]

# Features are rescaled to the [-1, 1] range.
scaler = MinMaxScaler(feature_range=(-1, 1))

# NOTE(review): here the scaler and feature names go to the constructor and fit()
# takes `params=`, whereas the benchmark cells further down call
# LionForests(class_names=...) and lf.fit(X, y, scaler, feature_names, parameters).
# Confirm which call shape matches the installed LionForests version.
lf = LionForests(None, False, scaler, feature_names, class_names)
lf.fit(X, y, params=parameters)

The model achieves high performance; the value below (exposed as `lf.accuracy`) is its F1-weighted score

In [6]:
# Score stored on the fitted LionForests object (described in the text above as
# the F1-weighted score).
lf.accuracy

0.8460024945441493

## Qualitative
Then we examine the rule interpretation of an example

In [7]:
# Explain instance 40; element [0] of the returned value is the rule string shown below.
# The positional arguments are presumably (reduction flag, association-rule
# algorithm, clustering algorithm) -- TODO confirm against LionForests.explain.
lf.explain(X[40], True, 'apriori', 'kmedoids')[0]

'if 6.5<=reversable defect<=7.0 & 3.5<=chest pain<=4.0 & 0.0<=number of major vessels<=0.5 & 0.0<=oldpeak<=0.05 & 0.0<=exercise induced angina<=0.5 & 177.5<=maximum heart rate achieved<=181.5 & 1.0<=the slope of the peak exercise<=1.5 & 29.0<=age<=42.5 & 222.5<=serum cholestoral<=224.5 & 0.5<=sex<=1.0 & 151.0<=resting blood pressure<=156.5 & 0.0<=resting electrocardiographic results<=0.5 & 0.0<=fasting blood sugar<=0.5 then absence'

Moreover, we can present a UI tool to explore the interpretation visually

In [8]:
# Same explanation as above, requested with to_vis=True and passed to the
# interactive viewer (prints the rule and renders the slider/toggle widgets).
lf.visualize(lf.explain(X[40], True, 'apriori', 'kmedoids', to_vis=True))

Prediction rule: if 6.5<=reversable defect<=7.0 & 3.5<=chest pain<=4.0 & 0.0<=number of major vessels<=0.5 & 0.0<=oldpeak<=0.05 & 0.0<=exercise induced angina<=0.5 & 177.5<=maximum heart rate achieved<=181.5 & 1.0<=the slope of the peak exercise<=1.5 & 29.0<=age<=42.5 & 222.5<=serum cholestoral<=224.5 & 0.5<=sex<=1.0 & 151.0<=resting blood pressure<=156.5 & 0.0<=resting electrocardiographic results<=0.5 & 0.0<=fasting blood sugar<=0.5 then absence


HBox(children=(IntSlider(value=1, continuous_update=False, description='Feature: ', max=13, min=1), ToggleButt…

Output()

In [9]:
# Pair each feature name with the raw value of instance 40 for easy reading.
list(zip(feature_names, X[40]))

[('age', 40.0),
 ('sex', 1.0),
 ('chest pain', 4.0),
 ('resting blood pressure', 152.0),
 ('serum cholestoral', 223.0),
 ('fasting blood sugar', 0.0),
 ('resting electrocardiographic results', 0.0),
 ('maximum heart rate achieved', 181.0),
 ('exercise induced angina', 0.0),
 ('oldpeak', 0.0),
 ('the slope of the peak exercise', 1.0),
 ('number of major vessels', 0.0),
 ('reversable defect', 7.0)]

We can also change an instance's values ourselves and see if and how the outcome changes. Here we change the value of age from 40 to 35

In [10]:
# Work on a copy so the original instance in X stays untouched.
temp = X[40].copy()
# Index 0 is 'age' (see the feature listing above); the original value is 40.
temp[0] = 35
# Re-explain the modified instance and show its rule string.
lf.explain(temp, True, 'apriori', 'kmedoids')[0]

'if 6.5<=reversable defect<=7.0 & 3.5<=chest pain<=4.0 & 0.0<=number of major vessels<=0.5 & 0.0<=oldpeak<=0.05 & 0.0<=exercise induced angina<=0.5 & 177.5<=maximum heart rate achieved<=181.5 & 1.0<=the slope of the peak exercise<=1.5 & 29.0<=age<=42.5 & 222.5<=serum cholestoral<=224.5 & 0.5<=sex<=1.0 & 151.0<=resting blood pressure<=156.5 & 0.0<=resting electrocardiographic results<=0.5 & 0.0<=fasting blood sugar<=0.5 then absence'

The outcome remained the same! This is expected: the new age (35) still lies inside the rule's age range, 29.0<=age<=42.5.

# Sensitivity analysis!
We will now test how the random forest's parameters affect the reduction of paths and features

In [None]:
from tqdm import tqdm
import time


def _mean_explain_stats(model, data, n_instances, *explain_args, **explain_kwargs):
    """Average explanation time and reductions over the first rows of `data`.

    Calls ``model.explain(data[idx], *explain_args, **explain_kwargs)`` for
    ``idx`` in ``range(n_instances)`` and unpacks its six return values.

    Returns a 3-element list:
    ``[mean seconds per call,
       mean of (3rd - 5th returned value),
       mean of (2nd - 4th returned value)]``.
    The notebook reports the last two as feature reduction and path reduction;
    presumably the returned values are the original/reduced feature and path
    counts -- TODO confirm against LionForests.explain.
    """
    total_seconds = 0
    feature_reduction = 0
    path_reduction = 0
    for idx in range(n_instances):
        # perf_counter() is a monotonic, high-resolution clock; summing the deltas
        # avoids adding a tiny duration to an epoch-sized float.
        started = time.perf_counter()
        _, b, c, d, e, _ = model.explain(data[idx], *explain_args, **explain_kwargs)
        total_seconds += time.perf_counter() - started
        feature_reduction += c - e
        path_reduction += b - d
    return [total_seconds / n_instances,
            feature_reduction / n_instances,
            path_reduction / n_instances]


results = []
iterr = 100  # number of instances (X[0] .. X[iterr-1]) explained per configuration

# Random-forest settings explored by the sensitivity analysis.
depth_values = [1, 5, 7, 10]
estimator_values = [10, 100, 500, 1000]
max_feature_values = ['sqrt', 'log2', 0.75, None]

# Every (method, association-rule algorithm, clustering algorithm) combination.
# An algorithm is varied only when the method string contains its digit
# ('1' -> association rules, '2' -> clustering); otherwise None is passed to
# explain() and '_' is written to the result row.
reduction_settings = []
for k in ['1', '2', '3', '12', '13', '23', '123']:
    ar_options = ['apriori', 'fpgrowth'] if '1' in k else [None]
    cl_options = ['kmedoids', 'OPTICS', 'SC'] if '2' in k else [None]
    for ara in ar_options:
        for cla in cl_options:
            reduction_settings.append((k, ara, cla))

# One baseline run plus every reduction setting per fitted model (1 + 23 = 24).
runs_per_model = 1 + len(reduction_settings)
total_runs = (len(depth_values) * len(estimator_values)
              * len(max_feature_values) * runs_per_model)

with tqdm(total=total_runs) as pbar:
    for i in depth_values:
        for j in estimator_values:
            for o in max_feature_values:
                parameters = [{
                    'max_depth': [i],
                    'max_features': [o],
                    'bootstrap': [False],
                    'min_samples_leaf' : [5],
                    'n_estimators': [j]
                }]
                lf = LionForests(class_names=class_names)
                scaler = MinMaxScaler(feature_range=(-1,1))
                lf.fit(X, y, scaler, feature_names, parameters)
                model_id = [str(i), str(j), str(o)]

                # Baseline: explanation without any reduction.
                stats = _mean_explain_stats(lf, X, iterr, False, None, None, False)
                results.append(model_id + ['NoRed', '_', '_'] + stats)
                pbar.update(1)

                # Each reduction method with each applicable algorithm choice.
                for k, ara, cla in reduction_settings:
                    stats = _mean_explain_stats(lf, X, iterr, True, ara, cla, method=k)
                    ar_label = '_' if ara is None else ara
                    cl_label = '_' if cla is None else cla
                    results.append(model_id + [k, ar_label, cl_label] + stats)
                    pbar.update(1)

In [None]:
# One row per configuration: [max_depth, n_estimators, max_features, method,
# association-rule algorithm, clustering algorithm, mean time, mean feature
# reduction, mean path reduction].
for i in results:
    print(i)

Another test compares the speed of LionForests V2 with that of the original LionForests

In [None]:
# Imported here as well so this cell does not depend on the (very long)
# sensitivity-analysis cell having been executed first.
from tqdm import tqdm

# NOTE: this replaces the `results` list produced by the sensitivity analysis.
results = []
iterr = 100  # number of instances (X[0] .. X[iterr-1]) explained per configuration

# (row label, reduction flag, association-rule algorithm, clustering algorithm)
old_configurations = [
    ('NoRed_Old', False, None, None),
    ('123_Old', True, 'apriori', 'kmedoids'),
]

with tqdm(total=4*4*4*len(old_configurations)) as pbar:
    for i in [1,5,7,10]:
        for j in [10, 100, 500, 1000]:
            for o in ['sqrt','log2', 0.75, None]:
                parameters = [{
                    'max_depth': [i],
                    'max_features': [o],
                    'bootstrap': [False],
                    'min_samples_leaf' : [5],
                    'n_estimators': [j]
                }]
                lf = LionForests(class_names=class_names)
                scaler = MinMaxScaler(feature_range=(-1,1))
                lf.fit(X, y, scaler, feature_names, parameters)
                for label, reduction, ara, cla in old_configurations:
                    # Reset the accumulators for EVERY configuration. Previously they
                    # were initialised only once, so the '123_Old' row also contained
                    # the totals of the 'NoRed_Old' run.
                    tlf = 0
                    fr = 0
                    pr = 0
                    for inde in range(iterr):
                        t_a = time.perf_counter()
                        # The fifth positional argument is True here and False in the
                        # sensitivity analysis -- presumably it selects the original
                        # LionForests code path (TODO confirm against explain()).
                        a, b, c, d, e, f = lf.explain(X[inde], reduction, ara, cla, True)
                        tlf += time.perf_counter() - t_a
                        fr += c - e
                        pr += b - d
                    results.append([str(i),str(j),str(o),label,'_','_', tlf/iterr, fr/iterr, pr/iterr])
                    pbar.update(1)

In [None]:
# One row per configuration of the old-version speed test.
for i in results:
    print(i)