In [1]:
from XMTR import MTR
import numpy as np
import pandas as pd
from scipy.io import arff
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor

In [4]:
def calc_al_error(instance, perc):
  # the error should be non zero
  return (abs(instance)+0.1)*perc 


# load data
slump_data = arff.loadarff('slump.arff')
slump_df = pd.DataFrame(slump_data[0])

# get column names
column_names = slump_df.columns

# get data/target values
feature_names = column_names[:7]
target_names = column_names[7:]

X = slump_df[feature_names]
y = slump_df[target_names]

# convert to numpy
X = X.to_numpy()
y = y.to_numpy()

# scale target values
scaler = MinMaxScaler(feature_range=(0,10))
y = scaler.fit_transform(y)

# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)
allowed_error = 0.1

min_len = 10 # just a ref, should be > than len(feature_names)
for max_d in [2, 5, 10]:
  for min_s_l in [5, 10]:
    for m_f in ['sqrt', 'log2', 0.75, None]:
      for est in [100, 500, 1000]:
        # create/train model
        RFmodel = RandomForestRegressor(n_estimators=est, min_samples_leaf=min_s_l, max_features=m_f, n_jobs=-1, random_state=42)
        RFmodel.fit(X_train, y_train)
        MTR_obj = MTR(model=RFmodel, X_train = X_train, X_test=X_test, y_train=y_train, y_test=y_test, feature_names=feature_names, target_names=target_names)
        avgRuleLengths = 0
        avgEstimators = []
        # get results
        for i in range(len(X_test)):
          error = calc_al_error(y_test[i], allowed_error)
          rule = MTR_obj.explain(X_test[i], error) # explain instance
          feature_limits = MTR_obj.getFeatureLimits()
          avgRuleLengths += len(feature_limits.keys())
          estimators = MTR_obj.getAmountOfReduction() # get estimators
          avgEstimators.append(estimators[0])
        finalLen = avgRuleLengths/len(X_test)
        if finalLen < min_len:
          min_len = finalLen
          best_par = {"max_d=":max_d, "min_s_l=":min_s_l, "m_f=": m_f, "est=": est, "rule length=": finalLen}
        print("max_depth=",max_d," | ", "min_sample_leaf=",min_s_l," | ", "max_f=", m_f," | ", "est=", est," | ", "rule length=", finalLen," | ", "estimators=", round(np.array(avgEstimators).mean(),3),"/",est)

max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 100  |  rule length= 7.0  |  estimators= 94.452 / 100
max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 500  |  rule length= 7.0  |  estimators= 464.097 / 500
max_depth= 2  |  min_sample_leaf= 5  |  max_f= sqrt  |  est= 1000  |  rule length= 7.0  |  estimators= 932.71 / 1000
max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 100  |  rule length= 7.0  |  estimators= 94.452 / 100
max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 500  |  rule length= 7.0  |  estimators= 464.097 / 500
max_depth= 2  |  min_sample_leaf= 5  |  max_f= log2  |  est= 1000  |  rule length= 7.0  |  estimators= 932.71 / 1000
max_depth= 2  |  min_sample_leaf= 5  |  max_f= 0.75  |  est= 100  |  rule length= 6.774193548387097  |  estimators= 97.097 / 100
max_depth= 2  |  min_sample_leaf= 5  |  max_f= 0.75  |  est= 500  |  rule length= 6.67741935483871  |  estimators= 481.645 / 500
max_depth= 2  |  min_sample_leaf= 5  |  max_f=

In [5]:
best_par

{'max_d=': 2,
 'min_s_l=': 10,
 'm_f=': None,
 'est=': 100,
 'rule length=': 4.612903225806452}