## Imports and Setup

In [1]:
from itertools import islice
from parametric_tsne import ParametricTSNE
from river import drift
from river import synth
from river import ensemble, linear_model
from river import metrics, evaluate, datasets, tree, preprocessing, base
import numpy as np
import pandas as pd
import tensorflow as tf
tf.config.run_functions_eagerly(True)
physical_devices = tf.config.list_physical_devices('GPU') 
tf.config.experimental.set_memory_growth(physical_devices[0], True)

## Auxiliar functions

In [2]:
def matrix_to_river_iterator(X, y=None, classes=None):
    if classes == None:
        classes = ["x","y"] + [chr(97 + i) for i in range(len(X[0])-2)]
    if y is None:
        y = [False for i in range(len(X))]
    dict_list = []
    for instance, y_pred in zip(X,y):
        _instance = (dict(zip(classes,instance)), y_pred)
        dict_list.append(_instance)
    
    return islice(dict_list,0)


def matrix_to_dict(X):
    classes = ["x","y"]
    
    dict_list = []
    for instance in X:
        dict_list.append(dict(zip(classes,instance)))
    
    return dict_list


def dict_to_highest_class(y):
    highest_classes = []
    for instance in y:
        highest_classes.append(list({k: v for k, v in sorted(instance.items(), key=lambda item: item[1])}.keys())[-1])
    
    return highest_classes


def round_probs(d):
    for key in d.keys():
        d[key] = round(d[key])
    
    return d

## TSNE Classifier

In [3]:
class TSNEClassifier(base.Classifier):
    def __init__(self, classifier, n_components=2, perplexity=30., verbose=0):
        self.classifier = classifier
        self.tsne = ParametricTSNE(n_components, perplexity, verbose)
        self._x_instances = []
        self._y_instances = []
        
    def fit(self, X, y=None, batch_size=100, n_iter_tsne=100):
        if batch_size > len(X):
            batch_size = len(X)
            
        self.tsne.fit(X,y,batch_size=batch_size, n_iter=n_iter_tsne)
        X_new = self.tsne.transform(X)
        for instance_x, instance_y in zip(X_new, y):
            self.classifier.learn_one({'x':instance_x[0], 'y':instance_x[1]}, instance_y)
        
        return self
    
    def predict(self, X, y=None):
        X_new = self.tsne.transform(X)
        X_new = matrix_to_dict(X_new)
        if not self.classifier.predict_one:
            return [round_probs(self.classifier.predict_proba_one(instance)) for instance in X_new]
        
        return [self.classifier.predict_one(instance) for instance in X_new]
    
    def predict_proba(self, X):
        X_new = self.tsne.transform(X)
        X_new = matrix_to_dict(X_new)
        return [self.classifier.predict_proba_one(instance) for instance in X_new]
    
    def learn_one(self, x, y=None):
        if len(self._x_instances) < self._batch_size:
            self._x_instances.append(x)
            self._y_instances.append(y)
            return self
        
        result = self.fit(np.array(self._x_instances), np.array(self._y_instances))
        self._x_instances = []
        self._y_instances = []
        return result
        
        '''
        x_list = list(x.values())
        x = np.asarray(x_list).reshape(1,len(x_list))
        self.tsne.fit(x,y)
        X_new = self.tsne.transform(x)
        return self.classifier.learn_one({'x': X_new[0][0], 'y': X_new[0][1]}, y)
        '''
    
    def predict_one(self, x, y=None):
        x_list = list(x.values())
        X_new = self.tsne.transform(np.asarray(x_list).reshape(1,len(x_list)))
        X_new = matrix_to_dict(X_new)[0]
        return self.classifier.predict_one(X_new)
    
    def predict_proba_one(self, x, y=None):
        X_new = self.tsne.transform(x.values())
        X_new = matrix_to_dict(X_new)
        return self.classifier.predict_proba_one(X_new)
    

## Experiments

### Classifiers and Datasets

In [4]:
classifiers = [ensemble.ADWINBaggingClassifier(model=linear_model.LogisticRegression(), n_models=10, seed=None),
              ensemble.AdaBoostClassifier(model=(tree.HoeffdingTreeClassifier(split_criterion='gini', split_confidence=1e-5, grace_period=2000)), n_models=10, seed=None),
              ensemble.AdaptiveRandomForestClassifier(n_models=10, max_features="sqrt", lambda_value=6, metric=metrics.Accuracy(), disable_weighted_vote=False, drift_detector=drift.ADWIN(), warning_detector=drift.ADWIN(), grace_period=50, max_depth=None, split_criterion="info_gain", split_confidence=0.01, tie_threshold=0.05, leaf_prediction="nba", nb_threshold=0, nominal_attributes=None, splitter=None, max_size=32, memory_estimate_period=2000000, seed=None),
              ensemble.BaggingClassifier(model=(preprocessing.StandardScaler() | linear_model.LogisticRegression()), n_models=10, seed=None),
              ensemble.LeveragingBaggingClassifier(model=(preprocessing.StandardScaler() | linear_model.LogisticRegression()), n_models=10, w=6, adwin_delta=0.002, bagging_method="bag", seed=None)]

In [5]:
def generate_datasets(n_features, n_cat_features, seed=None): # 50/50 pra num/cat
    return {
        "Hyperplane": synth.Hyperplane(seed=seed, n_features=n_features),
        "RandomRBF": synth.RandomRBF(seed_sample=seed, n_features=n_features),
        "RandomRBFDrift": synth.RandomRBFDrift(seed_sample=seed, n_features=n_features),
        "RandomTree": synth.RandomTree(seed_sample=seed, n_num_features=n_features, n_cat_features=n_cat_features)
    }

In [6]:
datasets = [synth.Hyperplane(seed=None, n_features=10, n_drift_features=2, mag_change=0.0, noise_percentage=0.05, sigma=0.1), 
           synth.RandomRBF(seed_model=None, seed_sample=None, n_classes=2, n_features=10, n_centroids=50),
           synth.RandomRBFDrift(seed_model=None, seed_sample=None, n_classes=2, n_features=10, n_centroids=50, change_speed=0.0, n_drift_centroids=50),
           synth.RandomTree(seed_tree=None, seed_sample=None, n_classes=2, n_num_features=5, n_cat_features=5, n_categories_per_feature=5, max_tree_depth=5, first_leaf_level=3, fraction_leaves_per_level=0.15)]

#número de features varia entre 50,100,200,500,1000,2000

### Classifiers vs. Datasets

In [6]:
import matplotlib.pyplot as plt
from IPython.display import clear_output

def evaluate_tsne_classifiers_datasets(classifiers, datasets, n, m):
    results = {}
    for classifier in classifiers:
        for dataset_name, dataset in datasets.items():
            tsne_classifier = TSNEClassifier(classifier)
            data= np.asarray(list(dataset.take(n)))
            dataset_x = np.asarray([np.asarray(list(a[0].values())) for a in data])
            dataset_y = np.asarray([a[1] for a in data])
            
            for x, y in zip(dataset_x,dataset_y):
                tsne_classifier.learn_one(x,y)
            
            pred_data = list(dataset.take(m))
            pred_x_data = np.asarray([np.asarray(list(a[0].values())) for a in pred_data])
            true_pred_y = np.asarray([a[1] for a in pred_data])
            
            metric = metrics.Accuracy()
            y_pred = dict_to_highest_class(tsne_classifier.predict_proba(pred_x_data))
            for yt, yp in zip(true_pred_y, y_pred):
                metric = metric.update(yt, yp)
            results[classifier.__class__.__name__ + "_" + dataset_name] = metric
            
            clear_output(wait=True)
            print(classifier.__class__.__name__ + "_" + dataset_name, metric)
    
    return results
            

def evaluate_classifiers_datasets(classifiers, datasets, n, m):
    results = {}
    for classifier in classifiers:
        for dataset_name, dataset in datasets.items():
            data= np.asarray(list(dataset.take(n)))
            
            for x, y in data:
                classifier.learn_one(x,y)
            
            metric = metrics.Accuracy()
            pred_data = np.asarray(list(dataset.take(m)))
            for x,y in pred_data:
                y_pred = dict_to_highest_class([classifier.predict_proba_one(x)])[0]
                metric = metric.update(y, y_pred)
            
            results[classifier.__class__.__name__ + "_" + dataset_name] = metric
            
            clear_output(wait=True)
            print(classifier.__class__.__name__ + "_" + dataset_name, metric)
            
    return results

## Demos

### Synthetic datasets experiments

In [None]:
#10 numeric features, 5 categoric features, 1000 train, 500 test
datasets = generate_datasets(10,5)

#Parametric t-SNE results
results_tsne_10 = evaluate_tsne_classifiers_datasets(classifiers, datasets, 1000, 500)
results_tsne_10

In [9]:
#Standard results
results_standard_10 = evaluate_classifiers_datasets(classifiers, datasets, 1000, 500)
results_standard_10

LeveragingBaggingClassifier_RandomTree Accuracy: 41.20%


{'ADWINBaggingClassifier_Hyperplane': Accuracy: 49.80%,
 'ADWINBaggingClassifier_RandomRBF': Accuracy: 50.00%,
 'ADWINBaggingClassifier_RandomRBFDrift': Accuracy: 51.00%,
 'ADWINBaggingClassifier_RandomTree': Accuracy: 52.80%,
 'AdaBoostClassifier_Hyperplane': Accuracy: 77.80%,
 'AdaBoostClassifier_RandomRBF': Accuracy: 54.80%,
 'AdaBoostClassifier_RandomRBFDrift': Accuracy: 56.40%,
 'AdaBoostClassifier_RandomTree': Accuracy: 56.80%,
 'AdaptiveRandomForestClassifier_Hyperplane': Accuracy: 70.20%,
 'AdaptiveRandomForestClassifier_RandomRBF': Accuracy: 58.60%,
 'AdaptiveRandomForestClassifier_RandomRBFDrift': Accuracy: 48.00%,
 'AdaptiveRandomForestClassifier_RandomTree': Accuracy: 61.40%,
 'BaggingClassifier_Hyperplane': Accuracy: 52.20%,
 'BaggingClassifier_RandomRBF': Accuracy: 50.40%,
 'BaggingClassifier_RandomRBFDrift': Accuracy: 54.20%,
 'BaggingClassifier_RandomTree': Accuracy: 60.20%,
 'LeveragingBaggingClassifier_Hyperplane': Accuracy: 68.20%,
 'LeveragingBaggingClassifier_Rando

In [10]:
#100 numeric features, 10 categoric features, 500 train, 300 test
datasets = generate_datasets(100,10)

#Parametric t-SNE results
results_tsne_100 = evaluate_tsne_classifiers_datasets(classifiers, datasets, 500, 300)
results_tsne_100

LeveragingBaggingClassifier_RandomTree Accuracy: 64.67%


{'ADWINBaggingClassifier_Hyperplane': Accuracy: 55.67%,
 'ADWINBaggingClassifier_RandomRBF': Accuracy: 50.33%,
 'ADWINBaggingClassifier_RandomRBFDrift': Accuracy: 40.00%,
 'ADWINBaggingClassifier_RandomTree': Accuracy: 45.67%,
 'AdaBoostClassifier_Hyperplane': Accuracy: 52.33%,
 'AdaBoostClassifier_RandomRBF': Accuracy: 41.33%,
 'AdaBoostClassifier_RandomRBFDrift': Accuracy: 42.33%,
 'AdaBoostClassifier_RandomTree': Accuracy: 65.00%,
 'AdaptiveRandomForestClassifier_Hyperplane': Accuracy: 51.67%,
 'AdaptiveRandomForestClassifier_RandomRBF': Accuracy: 43.67%,
 'AdaptiveRandomForestClassifier_RandomRBFDrift': Accuracy: 43.00%,
 'AdaptiveRandomForestClassifier_RandomTree': Accuracy: 54.67%,
 'BaggingClassifier_Hyperplane': Accuracy: 53.33%,
 'BaggingClassifier_RandomRBF': Accuracy: 62.00%,
 'BaggingClassifier_RandomRBFDrift': Accuracy: 45.67%,
 'BaggingClassifier_RandomTree': Accuracy: 78.33%,
 'LeveragingBaggingClassifier_Hyperplane': Accuracy: 46.33%,
 'LeveragingBaggingClassifier_Rando

In [11]:
#Standard results
results_standard_100 = evaluate_classifiers_datasets(classifiers, datasets, 500, 300)
results_standard_100

LeveragingBaggingClassifier_RandomTree Accuracy: 49.33%


{'ADWINBaggingClassifier_Hyperplane': Accuracy: 48.00%,
 'ADWINBaggingClassifier_RandomRBF': Accuracy: 50.67%,
 'ADWINBaggingClassifier_RandomRBFDrift': Accuracy: 58.67%,
 'ADWINBaggingClassifier_RandomTree': Accuracy: 44.67%,
 'AdaBoostClassifier_Hyperplane': Accuracy: 51.00%,
 'AdaBoostClassifier_RandomRBF': Accuracy: 46.67%,
 'AdaBoostClassifier_RandomRBFDrift': Accuracy: 52.67%,
 'AdaBoostClassifier_RandomTree': Accuracy: 32.67%,
 'AdaptiveRandomForestClassifier_Hyperplane': Accuracy: 50.67%,
 'AdaptiveRandomForestClassifier_RandomRBF': Accuracy: 43.00%,
 'AdaptiveRandomForestClassifier_RandomRBFDrift': Accuracy: 36.67%,
 'AdaptiveRandomForestClassifier_RandomTree': Accuracy: 60.00%,
 'BaggingClassifier_Hyperplane': Accuracy: 58.67%,
 'BaggingClassifier_RandomRBF': Accuracy: 63.00%,
 'BaggingClassifier_RandomRBFDrift': Accuracy: 47.33%,
 'BaggingClassifier_RandomTree': Accuracy: 50.33%,
 'LeveragingBaggingClassifier_Hyperplane': Accuracy: 69.00%,
 'LeveragingBaggingClassifier_Rando

### Single batch

In [4]:
tsne = TSNEClassifier(AdaptiveRandomForestClassifier())

dataset= np.asarray(list(datasets.Phishing().take(100)))
dataset_x = np.asarray([np.asarray(list(a[0].values())) for a in dataset])
dataset_y = np.asarray([a[1] for a in dataset])

tsne.fit(dataset_x, dataset_y,batch_size=100)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:22<00:00,  4.36it/s]
  "Even though the tf.config.experimental_run_functions_eagerly "


<__main__.TSNEClassifier at 0x2a13cbefa58>

In [35]:
tsne.predict(dataset_x)

### Multiple batches
First batch: 200; Second batch: 20; Test: 5

In [5]:
tsne = TSNEClassifier(ensemble.AdaptiveRandomForestClassifier())

dataset= np.asarray(list(datasets.Phishing().take(200)))
dataset_x = np.asarray([np.asarray(list(a[0].values())) for a in dataset])
dataset_y = np.asarray([a[1] for a in dataset])

tsne.fit(dataset_x, dataset_y,batch_size=20, n_iter_tsne=50)

dataset= np.asarray(list(datasets.Phishing().take(20)))
dataset_x = np.asarray([np.asarray(list(a[0].values())) for a in dataset])
dataset_y = np.asarray([a[1] for a in dataset])

tsne.fit(dataset_x, dataset_y,batch_size=20, n_iter_tsne=75)


dataset= np.asarray(list(datasets.Phishing().take(5)))
dataset_x = np.asarray([np.asarray(list(a[0].values())) for a in dataset])
dataset_y = np.asarray([a[1] for a in dataset])

tsne.predict(dataset_x)

100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:15<00:00,  3.16it/s]
  "Even though the tf.config.experimental_run_functions_eagerly "
100%|██████████████████████████████████████████████████████████████████████████████████| 75/75 [00:03<00:00, 22.75it/s]


[True, True, True, False, True]

### Dataset features

In [17]:
for a in datasets.Phishing().take(10):
    print(a)

({'empty_server_form_handler': 0.0, 'popup_window': 0.0, 'https': 0.0, 'request_from_other_domain': 0.0, 'anchor_from_other_domain': 0.0, 'is_popular': 0.5, 'long_url': 1.0, 'age_of_domain': 1, 'ip_in_url': 1}, True)
({'empty_server_form_handler': 1.0, 'popup_window': 0.0, 'https': 0.5, 'request_from_other_domain': 0.5, 'anchor_from_other_domain': 0.0, 'is_popular': 0.5, 'long_url': 0.0, 'age_of_domain': 1, 'ip_in_url': 0}, True)
({'empty_server_form_handler': 0.0, 'popup_window': 0.0, 'https': 1.0, 'request_from_other_domain': 0.0, 'anchor_from_other_domain': 0.5, 'is_popular': 0.5, 'long_url': 0.0, 'age_of_domain': 1, 'ip_in_url': 0}, True)
({'empty_server_form_handler': 0.0, 'popup_window': 0.0, 'https': 1.0, 'request_from_other_domain': 0.0, 'anchor_from_other_domain': 0.0, 'is_popular': 1.0, 'long_url': 0.5, 'age_of_domain': 0, 'ip_in_url': 0}, True)
({'empty_server_form_handler': 1.0, 'popup_window': 0.0, 'https': 0.5, 'request_from_other_domain': 1.0, 'anchor_from_other_domain':