In [None]:
%load_ext autoreload
%autoreload 2

In [85]:
import pandas as pd
import cached_path
from pmlb import fetch_data
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import StratifiedKFold, cross_val_predict
import time
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from scipy.special import softmax
from sklearn.base import clone
from sklearn.ensemble import VotingClassifier



random_state = 42



path_to_data_summary = "https://raw.githubusercontent.com/EpistasisLab/pmlb/master/pmlb/all_summary_stats.tsv"
dataset_df = pd.read_csv(cached_path.cached_path(path_to_data_summary), sep="\t")

classification_datasets = dataset_df[
    # (dataset_df["n_binary_features"] == dataset_df["n_features"])
    (dataset_df["task"] == "classification")
    & (dataset_df["n_classes"] == 2)
    & (dataset_df["n_features"] <= 150)
    # & (dataset_df["n_features"] >= 10)
    & (dataset_df["n_instances"] <= 1000)
]["dataset"][:]

print(len(classification_datasets))




number_of_cv_folds = 5
num_estimators = 100
max_depth = None


models = {
    "Baseline": {},
    #"Ensemble": {},
    "Local_RFEmb_DT":{"meta":"DT"},
    "Local_RFEmb_LR":{"meta":"LR"},
    "RFEmb_LR":{"meta":"LR", "max_depth":max_depth, "n_estimators":num_estimators},
    #"RFEmb_DT":{"meta":"DT", "max_depth":max_depth, "n_estimators":num_estimators},

}

cv = StratifiedKFold(number_of_cv_folds, random_state=random_state, shuffle=True)
base_class = RandomForestClassifier(n_estimators=num_estimators, max_depth=max_depth, random_state=42)
  ##DecisionTreeClassifier(max_depth=None, random_state=42)#

res = [] 
for dataset_index, classification_dataset in enumerate(classification_datasets[::-1][:]):
    
    print(f"{classification_dataset} ({dataset_index + 1}/{len(classification_datasets) + 1})")
    if 'deprecated' in classification_dataset:
        print(f"Skipping {classification_dataset} as deprecated from PMLB...")
        continue
    try:
        X, y = fetch_data(classification_dataset, return_X_y=True)
    except ValueError as e:
        print(f'Probably not found dataset {classification_dataset} in PMLB and skipping...\n {e}')
        continue
    if y.max() != 1 or y.min() != 0:
        for wanted, actual in enumerate(np.unique(y)):
            y[y==actual] = wanted
        
    imb_ratio = np.bincount(y).max() / np.bincount(y).min()
    print(f"{X.shape} with ratio : {imb_ratio:.4f}\n")
    

    for model_name, model_kwargs in models.items():
        y_pred = np.empty_like(y)
        sample_weights = None
        time_s = time.time()
        for train_indices, test_indices in cv.split(X,y):
            X_train, y_train = X[train_indices], y[train_indices]
            X_test, y_test = X[test_indices], y[test_indices]
            
            X_train_filtered = X_train.copy()
            y_train_filtered = y_train.copy()
            if model_name.startswith("RFEmb"):
                clf = RFEmb(**model_kwargs)
            elif  model_name.startswith("Ensemble"):
                clf =  VotingClassifier(estimators=[('lr', LogisticRegression(random_state=random_state, class_weight='balanced')), 
                                                    ('rf', clone(base_class))], voting='soft')
            elif model_name.startswith("Local_"):
                clf = RFEmbLocalLr(**model_kwargs)
            else:
                clf = clone(base_class)
            #print(model_name, X_train_filtered.shape[0])
            clf.fit(X_train_filtered , y_train_filtered)
            y_pred_cur = clf.predict(X_test)

            y_pred[test_indices] = y_pred_cur
            #print(f'TRUE', y_test)
            
        
        
        acc = accuracy_score(y, y_pred)
        (prec, rec, f1, sup) = precision_recall_fscore_support(
            y, y_pred, average="binary"
        )
            
        
        print(model_name)    
        print(classification_report(y, y_pred))
        time_end = time.time() - time_s

        res.append((classification_dataset, imb_ratio, model_name, time_end, acc, prec, rec, f1))
        #break
        
res = pd.DataFrame(res, columns=['dataset', 'dataset_class_imb', 'model', 'time', 'acc', 'pr', 'rec', 'f1'])

# Step 2: Sort each group by 'f1'
sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)

# Step 3: Assign ranks within each group
sorted_df['rank'] = sorted_df.groupby('dataset').cumcount() + 1

# Step 4: Calculate mean rank for each model across all datasets
mean_ranks = sorted_df.groupby('model')['rank'].mean().reset_index().sort_values(by='rank')

print(mean_ranks)
            

73
xd6 (1/74)
(973, 9) with ratio : 2.0217

Baseline
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       651
           1       1.00      1.00      1.00       322

    accuracy                           1.00       973
   macro avg       1.00      1.00      1.00       973
weighted avg       1.00      1.00      1.00       973

Local_RFEmb_DT
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       651
           1       1.00      1.00      1.00       322

    accuracy                           1.00       973
   macro avg       1.00      1.00      1.00       973
weighted avg       1.00      1.00      1.00       973

Local_RFEmb_LR
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       651
           1       1.00      1.00      1.00       322

    accuracy                           1.00       973
   macro avg       1.00      1.00      1.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Baseline
              precision    recall  f1-score   support

           0       0.86      1.00      0.93       155
           1       0.00      0.00      0.00        25

    accuracy                           0.86       180
   macro avg       0.43      0.50      0.46       180
weighted avg       0.74      0.86      0.80       180

Local_RFEmb_DT
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       155
           1       0.00      0.00      0.00        25

    accuracy                           0.86       180
   macro avg       0.43      0.50      0.46       180
weighted avg       0.74      0.86      0.79       180

Local_RFEmb_LR
              precision    recall  f1-score   support

           0       0.86      0.99      0.92       155
           1       0.00      0.00      0.00        25

    accuracy                           0.86       180
   macro avg       0.43      0.50      0.46       180
weighted avg       0.74      0.86  

  sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)


In [90]:
(100*res.groupby('model')['f1'].mean()).astype(str) + " ± " + (100*res.groupby('model')['f1'].std()).astype(str)

model
Baseline           72.96602260241028 ± 28.99319541196636
Local_RFEmb_DT    72.86766629311722 ± 28.720729218495112
Local_RFEmb_LR    73.15306757332732 ± 28.564311133629243
RFEmb_LR          75.66004000695199 ± 26.050665314526324
Name: f1, dtype: object

In [81]:
class RFEmbLocalLr(BaseEstimator):
    
    
    def __init__(self, n_estimators=100, max_depth = 10, random_state=42, meta='DT'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.embedder = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=self.random_state)
        if meta == "DT":
            self.final_clf = DecisionTreeClassifier(random_state=self.random_state)
        elif meta == 'LR':
            self.final_clf = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
            
    def fit(self, X, y):
        self.embedder.fit(X,y)
        self.X_train = X
        self.y_train = y
        self.X_train_to_leaves = self.embedder.apply(X)
        
    
    def predict_proba(self, X):
        preds = []
        for x in X:
            query_leaves = self.embedder.apply(x.reshape(1,-1))
            mask = (query_leaves == self.X_train_to_leaves).mean(axis=1) > 0.5
            num_to_use = (query_leaves == self.X_train_to_leaves).sum(axis=1)
            X_to_use, y_to_use = [], []
            for train_index, to_use in enumerate(mask):
                if to_use:
                    num_iter = num_to_use[train_index]
                    X_to_use.extend([self.X_train[train_index] for _ in range(num_iter)])
                    y_to_use.extend([self.y_train[train_index] for _ in range(num_iter)])
            if len(X_to_use) > 0:
                X_to_use = np.vstack(X_to_use)
                y_to_use = np.array(y_to_use)
                if y_to_use.mean() == 1:
                    probas = np.array([0,1]).reshape(1,2)
                elif y_to_use.mean() == 0:
                    probas = np.array([1,0]).reshape(1,2)
                else:
                    clf = clone(self.final_clf)
                    #print(X_to_use.shape, y_to_use.shape, y_to_use)
                    clf.fit(X_to_use, y_to_use)
                    probas = clf.predict_proba(x.reshape(1,-1)).reshape(1,2)
            else:
                probas = self.embedder.predict_proba(x.reshape(1,-1)).reshape(1,2)
            
            
            # to_use_mask  = (query_leaves == self.X_train_to_leaves).sum(axis=1)>0
            # print(to_use_mask.sum(), self.X_train.shape[0],  (query_leaves == self.X_train_to_leaves).sum(axis=1).sum())
            # #print(self.X_train.shape, to_use_mask.shape)
            # if self.y_train[to_use_mask].mean() == 1:
            #     probas = np.array([0,1]).reshape(1,2)
            # elif self.y_train[to_use_mask].mean() == 0:
            #     probas = np.array([1,0]).reshape(1,2)
            # else:
            #     clf = clone(self.final_clf)
            #     clf.fit(self.X_train[to_use_mask], self.y_train[to_use_mask])
            #     probas = clf.predict_proba(x.reshape(1,-1)).reshape(1,2)
            preds.append(probas)
        return np.array(preds).reshape(-1,2)
    
    def predict(self, X):
        probas = self.predict_proba(X)
        #print(probas.shape)
        return (probas[:, 1] > 0.5).astype(int)
    
clf = RFEmbLocalLr(meta='LR')
clf.fit(X_train, y_train)
clf.predict(X_train)

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,

In [70]:
X_train.shape

(409, 9)

In [None]:
X_train

In [36]:
from sklearn.base import BaseEstimator


class RFEmb(BaseEstimator):
    
    
    def __init__(self, n_estimators=100, max_depth = 10, random_state=42, meta='LR'):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.random_state = random_state
        self.embedder = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=self.random_state)
        if meta == "DT":
            self.final_clf = DecisionTreeClassifier(random_state=self.random_state)
        elif meta == 'LR':
            self.final_clf = LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced')
            
    def fit(self, X, y):
        self.embedder.fit(X,y)
       
        X_emb, _ = self.embedder.decision_path(X)
        self.final_clf.fit(X_emb, y)
    
    def predict_proba(self, X):
        X_emb, _ = self.embedder.decision_path(X)    
        return self.final_clf.predict_proba(X_emb)
    
    def predict(self, X):
        X_emb, _ = self.embedder.decision_path(X)      
        return self.final_clf.predict(X_emb)
clf = RFEmb(meta='DT')
clf.fit(X_train, y_train)
clf.predict(X_train)

array([1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,

In [6]:
X_test.shape

(191, 44)

In [7]:
clf.n_estimators

100

In [None]:
clf.