## Name: Signed Distance of features instead of simple distance based kernels.
### Date: 07/8/2024
### Status: Somewhat works. Need to work on the idea more.
### Idea: 
The idea stemmed from thinking that instead of using a distance gramm matrix, which is agnostic of the labels, we could incorporate the labels as well.
So we transfrom D(x,y) = -D(x,y) if y==0 else D(x,y) (with y==1).

### Results:
Seems to work on linear kernel with DT on top.
linear signed is better on 29/63 datasets, it is worse in 24/63 and they tied the rest..

In [None]:
WINS
                  DT  linear_svm  poly_svm  linear_signed_  linear_orig_
DT               0.0        42.0      41.0            47.0          45.0
linear_svm      19.0         0.0      26.0            33.0          34.0
poly_svm        19.0        26.0       0.0            32.0          34.0
linear_signed_  14.0        29.0      28.0             0.0          29.0
linear_orig_    14.0        28.0      27.0            23.0           0.0

In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from pmlb import fetch_data
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.decomposition import PCA
import numpy as np
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [79]:

random_state = 42
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

X, y = load_breast_cancer(return_X_y=True)

In [30]:
from sklearn.base import clone
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import pairwise_distances, pairwise_kernels
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix


kernel = 'linear'

clf = DecisionTreeClassifier(random_state=random_state) # RandomForestClassifier(random_state=random_state)#

y_pred_all = []
y_true_all = []
for train, test in cv.split(X,y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    
    y_train[y_train == 0] = -1
    y_test[y_test == 0] = -1
    
    
    train_2_train = pairwise_kernels(X_train, X_train, metric=kernel) #* y_train 
    
    
    #train_2_train = np.einsum('ij,j->ij',train_2_train, y_train)
    test_2_train = pairwise_kernels(X_test, X_train, metric=kernel) #* y_train
    
    #test_2_train = np.einsum('ij,j->ij',test_2_train, y_train)
    
    cur_clf = clone(clf)
    cur_clf.fit(train_2_train, y_train)
    y_pred = cur_clf.predict(test_2_train)
    y_pred_all.extend(y_pred.tolist())
    y_true_all.extend(y_test.tolist())
    
print(classification_report(y_true_all, y_pred_all))
print(confusion_matrix(y_true_all, y_pred_all))

              precision    recall  f1-score   support

          -1       0.86      0.84      0.85       212
           1       0.91      0.92      0.91       357

    accuracy                           0.89       569
   macro avg       0.88      0.88      0.88       569
weighted avg       0.89      0.89      0.89       569

[[178  34]
 [ 29 328]]


In [31]:
from sklearn.metrics import pairwise_distances, pairwise_kernels


y_pred_all = []
y_true_all = []
for train, test in cv.split(X,y):
    X_train, y_train = X[train], y[train]
    X_test, y_test = X[test], y[test]
    
    y_train[y_train == 0] = -1
    y_test[y_test == 0] = -1
    
    
    
    train_2_train = pairwise_kernels(X_train, X_train, metric=kernel) #* y_train 
    
    y_train_repeated = np.repeat(y_train.reshape(1,-1), repeats=[len(train_2_train)], axis=0)
    
    train_2_train = train_2_train * y_train_repeated
    
    #train_2_train = np.einsum('ij,j->ij',train_2_train, y_train)
    test_2_train = pairwise_kernels(X_test, X_train, metric=kernel) #* y_train
    
    y_train_test_repeated = np.repeat(y_train.reshape(1,-1), repeats=[len(test_2_train)], axis=0)
    
    test_2_train = test_2_train * y_train_test_repeated
    
    #test_2_train = np.einsum('ij,j->ij',test_2_train, y_train)
    
    cur_clf = clone(clf)
    cur_clf.fit(train_2_train, y_train)
    y_pred = cur_clf.predict(test_2_train)
    y_pred_all.extend(y_pred.tolist())
    y_true_all.extend(y_test.tolist())
    
print(classification_report(y_true_all, y_pred_all))
print(confusion_matrix(y_true_all, y_pred_all))

              precision    recall  f1-score   support

          -1       0.87      0.84      0.85       212
           1       0.91      0.92      0.92       357

    accuracy                           0.89       569
   macro avg       0.89      0.88      0.88       569
weighted avg       0.89      0.89      0.89       569

[[178  34]
 [ 27 330]]


# Orig results

As proof of concept it is a bit better than default linear kernel + dt.

## Run it for multiple datasets

In [90]:
from sklearn.base import BaseEstimator, clone
from sklearn.metrics import accuracy_score, f1_score

class KernelBased(BaseEstimator):
    
    
    def __init__(self, 
                 strategy='signed', 
                 metric='linear', 
                 clf=DecisionTreeClassifier(max_depth=None, random_state=random_state)):
        
        self.available_strategies = ['signed', 'orig', 'signed_reduced']
        
        self.clf = clf
        self.strategy = strategy
        if self.strategy not in self.available_strategies:
            raise NotImplementedError(f"Available strategies are: {self.available_strategies}. Was given {self.strategy}")
        self.metric = metric
        self.X_train = []
        self.y_train = []
    
    def fit(self, X, y):
        
        y[y==0] = -1
        if set(y.tolist()) != set([-1,1]):
            raise AttributeError(f"Y is expected to be [-1,1] only but contains: {set(y)}")
        
        self.X_train = X
        self.y_train = y.reshape(1,-1)
        
        X_tr = self.transform(X)
            
        self.clf.fit(X_tr, y)
        return self
    
    def predict(self, X):
        X_tr = self.transform(X)
        return self.clf.predict(X_tr)
        
    def predict_proba(self, X):
        X_tr = self.transform(X)
        return self.clf.predict_proba(X_tr)
    
    def transform(self, X):
        X_2_train = pairwise_kernels(X, self.X_train, metric=self.metric)
        if self.strategy == 'signed':
            y_train_repeated = np.repeat(self.y_train, repeats=[X.shape[0]], axis=0)
            X_2_train = X_2_train * y_train_repeated
        # In this case we simply keep the min, max and mean distance to each label
        # In total this will have 6 features (min, max, mean to class -1 and the same to class 1)
        if self.strategy == 'signed_reduced':
            pos_tr = X_2_train[:, self.y_train.flatten() == 1]
            neg_tr = X_2_train[:, self.y_train.flatten() == -1]
            feats = []
            for tr in [pos_tr, neg_tr]:
                for aggr in [np.min, np.max, np.mean]:
                    feats.append(aggr(tr, axis=1))
            X_2_train = np.vstack(feats).T
            
        return X_2_train

y[y==0] = -1
for strategy in ['orig', 'signed', 'signed_reduced']:
    clf = KernelBased(strategy= strategy, metric='poly')
    print(strategy)
    y_pred = cross_val_predict(clf, X, y, cv=cv)
    print(classification_report(y, y_pred))
    print(confusion_matrix(y, y_pred))
    print('\n\n')

orig
              precision    recall  f1-score   support

          -1       0.86      0.84      0.85       212
           1       0.91      0.92      0.91       357

    accuracy                           0.89       569
   macro avg       0.89      0.88      0.88       569
weighted avg       0.89      0.89      0.89       569

[[178  34]
 [ 28 329]]



signed
              precision    recall  f1-score   support

          -1       0.87      0.84      0.85       212
           1       0.91      0.92      0.92       357

    accuracy                           0.89       569
   macro avg       0.89      0.88      0.88       569
weighted avg       0.89      0.89      0.89       569

[[178  34]
 [ 27 330]]



signed_reduced
              precision    recall  f1-score   support

          -1       0.81      0.82      0.81       212
           1       0.89      0.88      0.89       357

    accuracy                           0.86       569
   macro avg       0.85      0.85      0.85      

## Signed Reduced does not seem to work as is

In [53]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import numpy as np
import pandas as pd
from sympy import re
from torch import rand
import cached_path
import time
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support



random_state = 42

number_of_cv_folds = 5

cv = StratifiedKFold(number_of_cv_folds, random_state=random_state, shuffle=True)

model_names = [
    "DT",
    'linear_svm',
    'poly_svm',
    'linear_orig_',
    'linear_signed_',
    #'poly_orig_',
    #'poly_signed_',
]


def set_seeds(seed=42):
    np.random.seed(seed)
    

set_seeds(random_state)

path_to_data_summary = "https://raw.githubusercontent.com/EpistasisLab/pmlb/master/pmlb/all_summary_stats.tsv"
dataset_df = pd.read_csv(cached_path.cached_path(path_to_data_summary), sep="\t")

classification_datasets = dataset_df[
    # (dataset_df["n_binary_features"] == dataset_df["n_features"])
    (dataset_df["task"] == "classification")
    & (dataset_df["n_classes"] == 2)
    & (dataset_df["n_features"] <= 100)
    & (dataset_df["n_instances"] <= 1000)
]["dataset"]

print(len(classification_datasets))

res = []
for dataset_index, classification_dataset in enumerate(classification_datasets[::-1][1:]):
    
    print(f"{classification_dataset} ({dataset_index + 1}/{len(classification_datasets) + 1})")
    X, y = fetch_data(classification_dataset, return_X_y=True)
    if y.max() != 1 or y.min() != 0:
        for wanted, actual in enumerate(np.unique(y)):
            y[y==actual] = wanted
    y[y==0] = -1

    
        # train_X, test_X, train_y, test_y = train_test_split(
        #     X, y, stratify=y, test_size=0.2, random_state=random_state
        # )
    for model_name in model_names:
        #print(model_name)
        if "DT" in model_name:
            clf = DecisionTreeClassifier(
                random_state=random_state
            )
        elif 'svm' in model_name:
            if 'linear' in model_name:
                clf = SVC()
            else:
                clf = SVC(kernel='poly')
        else: 
            details = model_name.split('_')
            metric, strategy = details[0], details[1]
            clf = KernelBased(strategy=strategy, metric=metric)
        model = clf
        time_s = time.time()

        y_pred = cross_val_predict(model, X, y, cv=cv).astype(int)
        acc = accuracy_score(y, y_pred)
        (prec, rec, f1, sup) = precision_recall_fscore_support(
            y, y_pred, average="binary"
        )
        time_end = time.time() - time_s
        res.append((classification_dataset, model_name, time_end, acc, prec, rec, f1, sup))
        #print(res[-1])

res = pd.DataFrame(res, columns=['dataset', 'model', 'time', 'acc', 'pr', 'rec', 'f1', 'sup'])
# res.sort_values('f1', ascending=False)

# Step 2: Sort each group by 'f1'
sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)

# Step 3: Assign ranks within each group
sorted_df['rank'] = sorted_df.groupby('dataset').cumcount() + 1

# Step 4: Calculate mean rank for each model across all datasets
mean_ranks = sorted_df.groupby('model')['rank'].mean().reset_index().sort_values(by='rank')

print(mean_ranks)

63
wdbc (1/64)
vote (2/64)
tokyo1 (3/64)
tic_tac_toe (4/64)
threeOf9 (5/64)
spectf (6/64)
spect (7/64)
sonar (8/64)
saheart (9/64)
profb (10/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


prnn_synth (11/64)
prnn_crabs (12/64)
postoperative_patient_data (13/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


pima (14/64)
parity5 (15/64)
mux6 (16/64)
monk3 (17/64)
monk2 (18/64)
monk1 (19/64)
molecular_biology_promoters (20/64)
lupus (21/64)
labor (22/64)
irish (23/64)
ionosphere (24/64)
hungarian (25/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


house_votes_84 (26/64)
horse_colic (27/64)
hepatitis (28/64)
heart_statlog (29/64)
heart_h (30/64)
heart_c (31/64)
haberman (32/64)
glass2 (33/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


german (34/64)
diabetes (35/64)
crx (36/64)
credit_g (37/64)
credit_a (38/64)
corral (39/64)
colic (40/64)
cleve (41/64)
bupa (42/64)
buggyCrx (43/64)
breast_w (44/64)
breast_cancer_wisconsin (45/64)
breast_cancer (46/64)
breast (47/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


biomed (48/64)
backache (49/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


australian (50/64)
appendicitis (51/64)
analcatdata_lawsuit (52/64)
analcatdata_japansolvent (53/64)
analcatdata_fraud (54/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


analcatdata_cyyoung9302 (55/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


analcatdata_cyyoung8092 (56/64)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


analcatdata_creditscore (57/64)
analcatdata_boxing2 (58/64)
analcatdata_boxing1 (59/64)
analcatdata_bankruptcy (60/64)
analcatdata_asbestos (61/64)
analcatdata_aids (62/64)
            model      rank
0              DT  2.064516
3      linear_svm  3.032258
4        poly_svm  3.161290
1    linear_orig_  3.354839
2  linear_signed_  3.387097


  sorted_df = res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False)).reset_index(drop=True)


In [59]:
#  res.groupby('dataset').apply(lambda x: x.sort_values(by='f1', ascending=False))
wins_score = np.zeros((len(model_names), len(model_names)))

score_to_use = 'f1'

for classification_dataset in res['dataset'].unique():
    cur_df = res[res['dataset'] == classification_dataset]
    # print(classification_dataset)
    # print(cur_df.sort_values('f1', ascending=False)[['model', 'time', 'acc', 'f1']])
    # print()
    cur_df = cur_df.set_index('model')
    score_metric = cur_df[score_to_use]
    for i, m1 in enumerate(model_names):
        for j, m2 in enumerate(model_names[i:]):
            if cur_df.loc[m1][score_to_use] > cur_df.loc[m2][score_to_use]:
                wins_score[i, j+i] += 1
            elif cur_df.loc[m1][score_to_use] < cur_df.loc[m2][score_to_use]:
                wins_score[j+i, i] += 1
            else:
                pass
order_of_models = wins_score.mean(axis=1).argsort()[::-1]
wins_score = wins_score[order_of_models, :][:, order_of_models]
print('WINS')
print(pd.DataFrame(wins_score, columns = np.array(model_names)[order_of_models], index=np.array(model_names)[order_of_models]))

WINS
                  DT  linear_svm  poly_svm  linear_signed_  linear_orig_
DT               0.0        42.0      41.0            47.0          45.0
linear_svm      19.0         0.0      26.0            33.0          34.0
poly_svm        19.0        26.0       0.0            32.0          34.0
linear_signed_  14.0        29.0      28.0             0.0          29.0
linear_orig_    14.0        28.0      27.0            23.0           0.0
