# Initialization

## Import Packages 

In [1]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from scipy.io import arff
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import time
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, roc_auc_score, roc_curve
import os
from skmultiflow.drift_detection.adwin import ADWIN
from sklearn.ensemble import GradientBoostingRegressor
import arff
from tqdm import tqdm
import pixiedust

from copy import deepcopy

Pixiedust database opened successfully


## GBDT main class

In [2]:
class GBDT(object):
    """Gradient-boosted regression trees that can be pruned and grown online.

    The prediction is ``original_f`` (the mean of the targets seen by ``fit``)
    plus the sum of the outputs of all trees in ``dtrees``. Every tree is
    trained on ``learn_rate * residual``, so the shrinkage is already part of
    each stored tree and no extra scaling is applied at prediction time.
    """

    def __init__(self,
                 max_iter=50,
                 sample_rate=0.8,
                 learn_rate=0.01,
                 max_depth=4,
                 new_tree_max_iter=10):
        """Store the hyper-parameters; no training happens here.

        Args:
            max_iter: number of trees built by ``fit``. The attribute is later
                kept in sync with ``len(dtrees)`` by pruning and incremental
                fitting.
            sample_rate: fraction of rows drawn (without replacement) for each
                tree, 0 < sample_rate <= 1.
            learn_rate: factor applied to the residual before a tree is fitted.
            max_depth: ``max_depth`` of every ``DecisionTreeRegressor``.
            new_tree_max_iter: default number of trees added by
                ``incremental_fit`` when the caller does not pass a count.
        """
        self.max_iter = max_iter
        self.sample_rate = sample_rate  # 0 < sample_rate <= 1
        self.learn_rate = learn_rate
        self.max_depth = max_depth
        self.dtrees = []
        self.original_f = None
        self.new_tree_max_iter = new_tree_max_iter

    def fit(self, x_train, y_train):
        """Train ``max_iter`` trees from scratch on ``(x_train, y_train)``.

        Any previously stored trees are discarded so that calling ``fit`` twice
        does not stack a second ensemble on top of the first one.

        Args:
            x_train: 2-D array of features, one row per sample.
            y_train: 1-D array of targets aligned with ``x_train``.
        """
        n = x_train.shape[0]

        self.dtrees = []
        self.original_f = np.mean(y_train)
        # Running prediction score for every training row.
        f = np.ones(n) * self.original_f
        # NOTE(review): not read anywhere in this class; kept in case external
        # code inspects it.
        self.residual_mean = np.zeros(self.max_iter)
        n_sample = int(n * self.sample_rate)

        for _ in range(self.max_iter):
            sample_idx = np.random.permutation(n)[:n_sample]
            residual = y_train[sample_idx] - f[sample_idx]

            dtree = DecisionTreeRegressor(max_depth=self.max_depth)
            # Fit to the (shrunken) negative gradient of the squared loss.
            dtree.fit(x_train[sample_idx, :], residual * self.learn_rate)
            self.dtrees.append(dtree)

            # Update the score of every row with one batched prediction
            # instead of one sklearn call per sample.
            f += dtree.predict(x_train)

    def predict(self, x):
        """Return the ensemble score for every row of ``x``.

        As a side effect the per-tree cumulative scores (without the initial
        constant) are stored in ``self.cumulated_pred_score`` with shape
        ``(n_samples, n_trees)``; ``best_tree_purning`` relies on them.

        Returns:
            Array of shape ``(1, n_samples)``. Callers squeeze or transpose it.
        """
        n = x.shape[0]
        y = np.zeros([n, len(self.dtrees)])

        for tree_idx, dtree in enumerate(self.dtrees):
            y[:, tree_idx] = dtree.predict(x)

        self.cumulated_pred_score = np.cumsum(y, axis=1)
        init_score = np.ones(n) * self.original_f
        return np.sum(y, axis=1) + init_score.reshape(1, -1)

    def best_tree_purning(self, y_test):
        """Prune the ensemble to the prefix with the lowest mean absolute error.

        Uses ``self.cumulated_pred_score`` from the most recent ``predict``
        call, so ``predict`` must have been called on the batch that
        ``y_test`` belongs to. Trees after the best prefix are dropped and
        ``max_iter`` is set to the remaining tree count.

        Args:
            y_test: true targets of the batch last passed to ``predict``.
        """
        if not self.dtrees:
            # Nothing to prune; argmin over an empty axis would raise.
            return

        # Target of the tree part of the model: y minus the initial constant.
        residual = np.reshape(y_test, (-1, 1)) - self.original_f
        # Column k holds the absolute error when only trees 0..k are used.
        prefix_error = np.abs(residual - self.cumulated_pred_score)
        mean_prefix_error = np.mean(prefix_error, axis=0)
        best_last_tree = np.argmin(mean_prefix_error)
        self.dtrees = self.dtrees[:best_last_tree + 1]
        self.max_iter = len(self.dtrees)

    def incremental_fit(self, x_test, y_test, pred_score, new_tree_max_iter=None):
        """Append new trees fitted to the residual of the current ensemble.

        Args:
            x_test: 2-D array of features of the new batch.
            y_test: 1-D array of targets of the new batch.
            pred_score: current ensemble score for every row of ``x_test``.
                It is copied (and flattened), so the caller's array is left
                untouched and the raw ``(1, n)`` output of ``predict`` is
                accepted as well.
            new_tree_max_iter: number of trees to add; defaults to the value
                given to the constructor.
        """
        if new_tree_max_iter is None:
            new_tree_max_iter = self.new_tree_max_iter

        n = x_test.shape[0]
        f = np.array(pred_score, dtype=float).ravel()
        n_sample = int(n * self.sample_rate)

        for _ in range(new_tree_max_iter):
            sample_idx = np.random.permutation(n)[:n_sample]
            y_residual = y_test - f

            new_tree = DecisionTreeRegressor(max_depth=self.max_depth)
            new_tree.fit(x_test[sample_idx, :],
                         y_residual[sample_idx] * self.learn_rate)
            self.dtrees.append(new_tree)
            self.max_iter += 1

            # One batched prediction replaces the per-sample loop.
            f += new_tree.predict(x_test)

## Loading .arff Dataset

In [3]:
def load_arff(path, dataset_name, num_copy):
    """Load an ARFF dataset and return its data section as a NumPy array.

    The file is read from ``<path><dataset_name>/<dataset_name><suffix>.arff``
    where ``suffix`` is empty for ``num_copy == -1`` and ``str(num_copy)``
    otherwise. ``path`` is concatenated as-is, so it must end with a path
    separator.

    Args:
        path: root directory of the datasets (with trailing separator).
        dataset_name: name of both the sub-directory and the file stem.
        num_copy: ``-1`` for the un-numbered file, otherwise the copy index.

    Returns:
        ``np.ndarray`` built from the ``"data"`` entry of the parsed file;
        nominal attributes are encoded by the parser (``encode_nominal=True``).
    """
    suffix = '' if num_copy == -1 else str(num_copy)
    file_path = path + dataset_name + '/' + dataset_name + suffix + '.arff'
    # Close the handle deterministically instead of leaking it.
    with open(file_path) as arff_file:
        dataset = arff.load(arff_file, encode_nominal=True)
    return np.array(dataset["data"])

# Experiment

## Algorithms

### GBDT Baseline

In [4]:
def evaluation_baseline_GBDT(data, ini_train_size, win_size, **GBDT_parm):
    """Evaluate a GBDT that is trained once and never updated.

    Args:
        data: 2-D array; the last column is the label, the rest are features.
        ini_train_size: number of leading rows used to train the model.
        win_size: nominal batch size; the remaining rows are cut into
            ``int((len(data) - ini_train_size) / win_size)`` batches.
        **GBDT_parm: forwarded to the ``GBDT`` constructor.

    Returns:
        ``(batch_acc, batch_f1, pred)``: per-batch accuracy, per-batch
        macro-F1 and the predicted label of every streamed row.
    """
    warmup = data[0:ini_train_size]
    model = GBDT(**GBDT_parm)
    model.fit(warmup[:, :-1], warmup[:, -1])

    stream = data[ini_train_size:, :]
    # KFold is only used to cut the stream into test batches; the train
    # indices it yields are ignored.
    kf = KFold(int((data.shape[0] - ini_train_size) / win_size))
    pred = np.zeros(stream.shape[0])
    batch_acc, batch_f1 = [], []

    batches = tqdm(kf.split(stream), total=kf.get_n_splits(), desc="#batch")
    for _, test_index in batches:
        batch = stream[test_index]
        x_test, y_test = batch[:, :-1], batch[:, -1]

        # Scores come back with shape (1, n); threshold at 0.5 for the label.
        y_pred = model.predict(x_test) >= 0.5
        pred[test_index] = y_pred

        labels = y_pred.T
        batch_acc.append(metrics.accuracy_score(y_test, labels))
        batch_f1.append(metrics.f1_score(y_test, labels, average='macro'))

    return batch_acc, batch_f1, pred

### Sliding Window and Retrain GBDT

In [5]:
def evaluation_sliding_GBDT(data, ini_train_size, win_size, **GBDT_parm):
    """Evaluate a GBDT that is rebuilt from scratch on every new batch.

    Each batch is first predicted with the model trained on the previous
    batch (or on the initial rows for the first batch); afterwards a brand-new
    ``GBDT`` is fitted on that batch alone.

    Args:
        data: 2-D array; the last column is the label, the rest are features.
        ini_train_size: number of leading rows used for the first model.
        win_size: nominal batch size used to derive the number of batches.
        **GBDT_parm: forwarded to every ``GBDT`` constructor call.

    Returns:
        ``(batch_acc, batch_f1, pred)``: per-batch accuracy, per-batch
        macro-F1 and the predicted label of every streamed row.
    """
    warmup = data[0:ini_train_size]
    model = GBDT(**GBDT_parm)
    model.fit(warmup[:, :-1], warmup[:, -1])

    stream = data[ini_train_size:, :]
    # KFold only provides the test batches; its train indices are unused.
    kf = KFold(int((data.shape[0] - ini_train_size) / win_size))
    pred = np.zeros(stream.shape[0])
    batch_acc, batch_f1 = [], []

    batches = tqdm(kf.split(stream), total=kf.get_n_splits(), desc="#batch")
    for _, test_index in batches:
        batch = stream[test_index]
        x_test, y_test = batch[:, :-1], batch[:, -1]

        # Test first ...
        y_pred = model.predict(x_test) >= 0.5
        pred[test_index] = np.squeeze(y_pred)

        labels = y_pred.T
        batch_acc.append(metrics.accuracy_score(y_test, labels))
        batch_f1.append(metrics.f1_score(y_test, labels, average='macro'))

        # ... then replace the model by one trained on this batch only.
        model = GBDT(**GBDT_parm)
        model.fit(x_test, y_test)

    return batch_acc, batch_f1, pred

### Naive Incremental GBDT (iGBDT)

In [6]:
def evaluation_naive_iGBDT(data, ini_train_size, win_size, num_inc_tree, **GBDT_parm):
    """Evaluate a GBDT that grows by a fixed number of trees after each batch.

    Each batch is predicted first; then ``num_inc_tree`` trees fitted to the
    batch residual are appended to the same model. No tree is ever removed.

    Args:
        data: 2-D array; the last column is the label, the rest are features.
        ini_train_size: number of leading rows used for the initial model.
        win_size: nominal batch size used to derive the number of batches.
        num_inc_tree: number of trees appended after every batch.
        **GBDT_parm: forwarded to the ``GBDT`` constructor.

    Returns:
        ``(batch_acc, batch_f1, pred)``: per-batch accuracy, per-batch
        macro-F1 and the predicted label of every streamed row.
    """
    x_train = data[0:ini_train_size, :-1]
    y_train = data[0:ini_train_size, -1]

    model = GBDT(**GBDT_parm)
    model.fit(x_train, y_train)

    # KFold only provides the test batches; its train indices are unused.
    kf = KFold(int((data.shape[0] - ini_train_size) / win_size))
    stream = data[ini_train_size:, :]
    pred = np.zeros(stream.shape[0])
    batch_acc = []
    batch_f1 = []

    for _, test_index in tqdm(kf.split(stream), total=kf.get_n_splits(), desc="#batch"):

        x_test = stream[test_index, :-1]
        y_test = stream[test_index, -1]

        # Scores have shape (1, n); threshold at 0.5 to get the label.
        y_pred_score = model.predict(x_test)
        y_pred_label = (y_pred_score >= 0.5)

        batch_acc.append(metrics.accuracy_score(y_test, y_pred_label.T))
        batch_f1.append(metrics.f1_score(y_test, y_pred_label.T, average='macro'))

        pred[test_index] = y_pred_label

        # The model has not changed since the prediction above, so the same
        # scores seed the incremental update (no second predict call needed).
        model.incremental_fit(x_test, y_test, np.squeeze(y_pred_score), num_inc_tree)

    return batch_acc, batch_f1, pred

### eGBDT

In [7]:
def evaluation_eGBDT(data, ini_train_size, win_size, max_tree, num_ince_tree, **GBDT_pram):
    """Evaluate eGBDT: predict a batch, prune the model, then rebuild or grow it.

    Per batch:
      1. predict and score the batch;
      2. prune the ensemble to its best prefix on that batch;
      3. if fewer trees than a freshly built ensemble remain, replace the
         model by a new one trained on the batch; otherwise append
         ``num_ince_tree`` trees, as long as the ensemble holds at most
         ``max_tree`` trees.

    Args:
        data: 2-D array; the last column is the label, the rest are features.
        ini_train_size: number of leading rows used for the initial model.
        win_size: nominal batch size used to derive the number of batches.
        max_tree: no trees are appended once the ensemble exceeds this size.
        num_ince_tree: number of trees appended per incremental update.
        **GBDT_pram: forwarded to every ``GBDT`` constructor call.

    Returns:
        ``(accuracy, f1, pred)``: per-batch accuracy, per-batch macro-F1 and
        the predicted label of every streamed row.
    """
    x_train = data[0:ini_train_size, :-1]
    y_train = data[0:ini_train_size, -1]
    model = GBDT(**GBDT_pram)
    # Size of a freshly built ensemble. Reading it from the model (instead of
    # GBDT_pram['max_iter']) also works when the caller relies on the default.
    num_base_tree = model.max_iter
    model.fit(x_train, y_train)

    # KFold only provides the test batches; its train indices are unused.
    kf = KFold(int((data.shape[0] - ini_train_size) / win_size))
    stream = data[ini_train_size:, :]
    pred = np.zeros(stream.shape[0])
    accuracy = []
    f1 = []
    # Pruning diagnostics; collected but only returned if the commented-out
    # part of the return statement is re-enabled.
    prune_tree = []
    tree_before_purning = []
    tree_after_purning = []

    for _, test_index in tqdm(kf.split(stream), total=kf.get_n_splits(), desc="#batch"):

        x_test = stream[test_index, :-1]
        y_test = stream[test_index, -1]

        # Step 1. Make prediction
        y_pred_score = np.squeeze(model.predict(x_test))
        y_pred_label = (y_pred_score >= 0.5)

        accuracy.append(metrics.accuracy_score(y_test, y_pred_label.T))
        f1.append(metrics.f1_score(y_test, y_pred_label.T, average='macro'))

        pred[test_index] = y_pred_label

        # Step 2. Prune the GBDT (uses the scores cached by predict above)
        num_tree_before_purning = len(model.dtrees)
        model.best_tree_purning(y_test)
        num_tree_after_purning = len(model.dtrees)
        prune_tree.append(num_tree_before_purning - num_tree_after_purning)
        tree_before_purning.append(num_tree_before_purning)
        tree_after_purning.append(num_tree_after_purning)

        # Step 3. Update the GBDT
        if num_tree_after_purning < num_base_tree:
            # Step 3.1 Pruning cut into the base ensemble: treat it as drift
            # and retrain from scratch on the current batch.
            model = GBDT(**GBDT_pram)
            model.fit(x_test, y_test)
        elif len(model.dtrees) <= max_tree:
            # Step 3.2 Incremental update with a fixed number of trees. The
            # pruned model must be re-scored because trees may have been cut.
            y_pred_score = np.squeeze(model.predict(x_test))
            model.incremental_fit(x_test, y_test, y_pred_score, num_ince_tree)
    tqdm.write('Num tree at the end,' + str(len(model.dtrees)))

    return accuracy, f1, pred  # , prune_tree, tree_before_purning, tree_after_purning

### eGBDT Ensemble

In [8]:
def evaluation_eGBDT_ensemble(data, ini_train_size, win_size, max_tree,
                              max_num_inc_tree, gap_num_inc_tree, **GBDT_pram):
    """Evaluate an ensemble of eGBDT models with different growth rates.

    One model is created for every value in
    ``range(15, max_num_inc_tree, gap_num_inc_tree)``; that value is the number
    of trees the member appends per incremental update. The batch prediction
    is the average member score thresholded at 0.5. After each batch every
    member is pruned and then either rebuilt on the batch (if fewer trees than
    a fresh ensemble remain) or grown (while it holds at most ``max_tree``
    trees).

    Args:
        data: 2-D array; the last column is the label, the rest are features.
        ini_train_size: number of leading rows used for the initial models.
        win_size: nominal batch size used to derive the number of batches.
        max_tree: a member is no longer grown once it exceeds this size.
        max_num_inc_tree: exclusive upper bound of the growth-rate range.
        gap_num_inc_tree: step of the growth-rate range.
        **GBDT_pram: forwarded to every ``GBDT`` constructor call.

    Returns:
        ``(accuracy, f1, pred)``: per-batch accuracy, per-batch macro-F1 and
        the predicted label of every streamed row.

    Raises:
        ValueError: if the growth-rate range is empty, which would otherwise
            average over zero members and silently predict only ``False``.
    """
    inc_tree_options = range(15, max_num_inc_tree, gap_num_inc_tree)
    if len(inc_tree_options) == 0:
        raise ValueError(
            'range(15, max_num_inc_tree=%r, gap_num_inc_tree=%r) is empty; '
            'the ensemble would have no members'
            % (max_num_inc_tree, gap_num_inc_tree))

    x_train = data[0:ini_train_size, :-1]
    y_train = data[0:ini_train_size, -1]

    eGBDT_dict = {}
    for i in inc_tree_options:
        eGBDT_dict[i] = GBDT(**GBDT_pram)
        # Size of a freshly built member; read from the model so the GBDT
        # default applies when 'max_iter' is not passed explicitly.
        num_base_tree = eGBDT_dict[i].max_iter
        eGBDT_dict[i].fit(x_train, y_train)

    # KFold only provides the test batches; its train indices are unused.
    kf = KFold(int((data.shape[0] - ini_train_size) / win_size))
    stream = data[ini_train_size:, :]
    pred = np.zeros(stream.shape[0])
    accuracy = []
    f1 = []

    for _, test_index in tqdm(kf.split(stream),
                              total=kf.get_n_splits(),
                              desc="#batch"):
        x_test = stream[test_index, :-1]
        y_test = stream[test_index, -1]

        # Average the member scores, then threshold once.
        y_pred_score_aver = np.zeros(x_test.shape[0])
        for ie in eGBDT_dict.keys():
            y_pred_score = np.squeeze(eGBDT_dict[ie].predict(x_test))
            y_pred_score_aver = y_pred_score_aver + y_pred_score

        y_pred_score_aver = y_pred_score_aver / len(eGBDT_dict)
        y_pred_label = (y_pred_score_aver >= 0.5)

        accuracy.append(metrics.accuracy_score(y_test, y_pred_label.T))
        f1.append(metrics.f1_score(y_test, y_pred_label.T, average='macro'))

        pred[test_index] = y_pred_label

        for ie in eGBDT_dict.keys():
            # Pruning uses the scores each member cached in predict above.
            eGBDT_dict[ie].best_tree_purning(y_test)

            if len(eGBDT_dict[ie].dtrees) < num_base_tree:
                # Pruned below a fresh ensemble: rebuild on the current batch.
                eGBDT_dict[ie] = GBDT(**GBDT_pram)
                eGBDT_dict[ie].fit(x_test, y_test)
            elif len(eGBDT_dict[ie].dtrees) <= max_tree:
                # Re-score the pruned member, then append `ie` new trees.
                y_pred_score = np.squeeze(eGBDT_dict[ie].predict(x_test))
                eGBDT_dict[ie].incremental_fit(x_test, y_test,
                                               y_pred_score, ie)
    return accuracy, f1, pred

## Run Real-world Experiment

In [9]:
def exp_realworld(path, dataset_name, num_run, exp_function, **exp_parm):
    """Run one evaluation function ``num_run`` times on a real-world dataset.

    Run ``r`` seeds NumPy's global RNG with ``r``, calls
    ``exp_function(data, **exp_parm)`` and reports overall accuracy and
    macro-F1 over all streamed rows. Mean accuracy, mean F1 and the standard
    deviation of the accuracy across runs are written at the end.

    Args:
        path: dataset root directory, forwarded to ``load_arff``.
        dataset_name: dataset to load (also used in the progress banner).
        num_run: number of repetitions / seeds.
        exp_function: callable returning ``(batch_acc, batch_f1, pred)``.
        **exp_parm: forwarded to ``exp_function``; must contain
            ``'ini_train_size'`` and ``'win_size'``.

    Returns:
        None. Results are only written to the output.
    """
    # The dataset does not depend on the seed, so load it once.
    data = load_arff(path, dataset_name, -1)
    y_true = data[exp_parm['ini_train_size']:, -1]

    num_eval = int(
        (data.shape[0] - exp_parm['ini_train_size']) / exp_parm['win_size'])
    aver_total_acc = np.zeros(num_run)
    aver_total_f1 = np.zeros(num_run)
    batch_acc = np.zeros([num_run, num_eval])
    batch_f1 = np.zeros([num_run, num_eval])

    for r_seed in range(num_run):
        np.random.seed(r_seed)
        tqdm.write('='*20)
        tqdm.write((dataset_name + str(r_seed)).center(20))

        # Every evaluation function in this notebook returns three values.
        batch_acc[r_seed], batch_f1[r_seed], pred = exp_function(data, **exp_parm)

        aver_total_acc[r_seed] = metrics.accuracy_score(y_true, pred)
        aver_total_f1[r_seed] = metrics.f1_score(y_true, pred, average='macro')
        tqdm.write('Current r_seed acc,' + str(aver_total_acc[r_seed]))
        tqdm.write('Current r_seed f1,' + str(aver_total_f1[r_seed]))

        if r_seed == 0:
            # Sanity check: predictions and labels must have the same length.
            print(pred.shape)
            print(y_true.shape)

    tqdm.write('Average acc,' + str(np.mean(aver_total_acc)))
    tqdm.write('Average f1,' + str(np.mean(aver_total_f1)))
    tqdm.write('Std acc,' + str(np.std(aver_total_acc)))
    
    

In [10]:
# Root folder of the datasets; load_arff appends '<name>/<name>.arff' to it.
path = '/data/kunwang/TNNLS/Realworld Data/'

# Number of repetitions (seeds) per experiment.
num_run = 1

# Dataset names used by the experiment cells in this notebook.
datasets = [
    'elecNorm',
    'airline',
    'weather',
    'usenet1',
    'usenet2',
    'spam_corpus_x2_feature_selected',
]

### elecNorm Setting 

#### eGBDT 

In [36]:
# eGBDT on elecNorm.
dataset_name = 'elecNorm'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=100, win_size=100, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

     elecNorm0      


#batch: 100%|██████████| 452/452 [01:47<00:00,  4.19it/s]

Num tree at the end,558
Current r_seed acc,0.7553746792886844
Current r_seed f1,0.7519610251253868
(45212,)
(45212,)
Average acc,0.7553746792886844
Average f1,0.7519610251253868
Std acc,0.0





#### eGBDT Ensemble

In [11]:
# eGBDT ensemble on elecNorm.
dataset_name = 'elecNorm'

# Hyper-parameters of every underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_ensemble_parm = dict(ini_train_size=100, win_size=100, max_tree=1000,
                           max_num_inc_tree=75, gap_num_inc_tree=15)
eGBDT_ensemble_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

     elecNorm0      


#batch:   5%|▌         | 23/452 [00:32<10:06,  1.41s/it]


KeyboardInterrupt: 

### weather Setting 

#### eGBDT 

In [37]:
# eGBDT on weather.
dataset_name = 'weather'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=365, win_size=365, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

      weather0      


#batch: 100%|██████████| 69/69 [00:43<00:00,  1.58it/s]

Num tree at the end,486
Current r_seed acc,0.7877360357863901
Current r_seed f1,0.7433312990768729
(25261,)
(25261,)
Average acc,0.7877360357863901
Average f1,0.7433312990768729
Std acc,0.0





#### eGBDT Ensemble

In [None]:
# eGBDT ensemble on weather.
# evaluation_eGBDT_ensemble builds one member per value of
# range(15, max_num_inc_tree, gap_num_inc_tree). The previous values
# (max=15, gap=75) were swapped and produced an empty range, i.e. an ensemble
# without members; 75/15 matches the elecNorm ensemble cell.
eGBDT_ensemble_parm = {
    'ini_train_size': 365,
    'win_size': 365,
    'max_tree': 10000,
    'max_num_inc_tree': 75,
    'gap_num_inc_tree': 15
}

# Hyper-parameters of every underlying GBDT.
GBDT_pram = {
    'max_iter': 200,
    'sample_rate': 0.8,
    'learn_rate': 0.01,
    'max_depth': 4
}

eGBDT_ensemble_parm.update(GBDT_pram)
dataset_name = 'weather'
exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

### airline Setting 

#### eGBDT

In [38]:
# eGBDT on airline.
dataset_name = 'airline'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=100, win_size=100, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

      airline0      


#batch: 100%|██████████| 5392/5392 [31:00<00:00,  2.90it/s]  


Num tree at the end,200
Current r_seed acc,0.6325417267000814
Current r_seed f1,0.6215690418187166
(539283,)
(539283,)
Average acc,0.6325417267000814
Average f1,0.6215690418187166
Std acc,0.0


#### eGBDT Ensemble

In [None]:
# eGBDT ensemble on airline.
dataset_name = 'airline'

# Hyper-parameters of every underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_ensemble_parm = dict(ini_train_size=100, win_size=100, max_tree=10000,
                           max_num_inc_tree=100, gap_num_inc_tree=20)
eGBDT_ensemble_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

### usenet1 Setting

#### eGBDT

In [39]:
# eGBDT on usenet1.
dataset_name = 'usenet1'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=40, win_size=40, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

      usenet10      


#batch: 100%|██████████| 36/36 [00:06<00:00,  5.61it/s]

Num tree at the end,322
Current r_seed acc,0.6787671232876712
Current r_seed f1,0.678048032784804
(1460,)
(1460,)
Average acc,0.6787671232876712
Average f1,0.678048032784804
Std acc,0.0





#### eGBDT Ensemble

In [None]:
# eGBDT ensemble on usenet1.
# evaluation_eGBDT_ensemble builds one member per value of
# range(15, max_num_inc_tree, gap_num_inc_tree). The previous values
# (max=15, gap=75) were swapped and produced an empty range, i.e. an ensemble
# without members; 75/15 matches the elecNorm ensemble cell.
eGBDT_ensemble_parm = {
    'ini_train_size': 40,
    'win_size': 40,
    'max_tree': 10000,
    'max_num_inc_tree': 75,
    'gap_num_inc_tree': 15
}

# Hyper-parameters of every underlying GBDT.
GBDT_pram = {
    'max_iter': 200,
    'sample_rate': 0.8,
    'learn_rate': 0.01,
    'max_depth': 4
}

eGBDT_ensemble_parm.update(GBDT_pram)
dataset_name = 'usenet1'
exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

### usenet2 Setting 

#### eGBDT 

In [40]:
# eGBDT on usenet2.
dataset_name = 'usenet2'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=40, win_size=40, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

      usenet20      


#batch: 100%|██████████| 36/36 [00:04<00:00,  7.49it/s]

Num tree at the end,423
Current r_seed acc,0.7212328767123287
Current r_seed f1,0.6454481719596057
(1460,)
(1460,)
Average acc,0.7212328767123287
Average f1,0.6454481719596057
Std acc,0.0





#### eGBDT Ensemble 

In [None]:
# eGBDT ensemble on usenet2.
dataset_name = 'usenet2'

# Hyper-parameters of every underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_ensemble_parm = dict(ini_train_size=40, win_size=40, max_tree=10000,
                           max_num_inc_tree=100, gap_num_inc_tree=20)
eGBDT_ensemble_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

### spam Setting 

#### eGBDT 

In [41]:
# eGBDT on the spam corpus.
dataset_name = 'spam_corpus_x2_feature_selected'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_parm = dict(ini_train_size=100, win_size=100, max_tree=10000,
                  num_ince_tree=20)
eGBDT_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT, **eGBDT_parm)

spam_corpus_x2_feature_selected0


#batch: 100%|██████████| 92/92 [00:40<00:00,  2.29it/s]

Num tree at the end,1127
Current r_seed acc,0.9246530789245446
Current r_seed f1,0.9032680274672158
(9224,)
(9224,)
Average acc,0.9246530789245446
Average f1,0.9032680274672158
Std acc,0.0





#### eGBDT Ensemble 

In [None]:
# eGBDT ensemble on the spam corpus.
dataset_name = 'spam_corpus_x2_feature_selected'

# Hyper-parameters of every underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
eGBDT_ensemble_parm = dict(ini_train_size=100, win_size=100, max_tree=10000,
                           max_num_inc_tree=100, gap_num_inc_tree=20)
eGBDT_ensemble_parm.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_eGBDT_ensemble,
              **eGBDT_ensemble_parm)

## GBDT Baseline

### elecNorm

In [11]:
# Static GBDT baseline on elecNorm.
dataset_name = 'elecNorm'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=100, win_size=100)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

     elecNorm0      


#batch: 100%|██████████| 452/452 [00:05<00:00, 80.34it/s]


Current r_seed acc,0.6512651508449084
Current r_seed f1,0.6512621330136612
(45212,)
(45212,)
Average acc,0.6512651508449084
Average f1,0.6512621330136612
Std acc,0.0


### weather

In [12]:
# Static GBDT baseline on weather.
dataset_name = 'weather'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=365, win_size=365)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

      weather0      


#batch: 100%|██████████| 69/69 [00:01<00:00, 66.97it/s]


Current r_seed acc,0.7427655278888405
Current r_seed f1,0.6829392195410682
(25261,)
(25261,)
Average acc,0.7427655278888405
Average f1,0.6829392195410682
Std acc,0.0


### airline

In [13]:
# Static GBDT baseline on airline.
dataset_name = 'airline'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=100, win_size=100)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

      airline0      


#batch: 100%|██████████| 5392/5392 [01:14<00:00, 72.02it/s]


Current r_seed acc,0.5476456702695988
Current r_seed f1,0.4602815321940583
(539283,)
(539283,)
Average acc,0.5476456702695988
Average f1,0.4602815321940583
Std acc,0.0


### usenet1

In [14]:
# Static GBDT baseline on usenet1.
dataset_name = 'usenet1'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=40, win_size=40)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

      usenet10      


#batch: 100%|██████████| 36/36 [00:00<00:00, 76.37it/s]

Current r_seed acc,0.576027397260274
Current r_seed f1,0.5248872465445843
(1460,)
(1460,)
Average acc,0.576027397260274
Average f1,0.5248872465445843
Std acc,0.0





### usenet2

In [15]:
# Static GBDT baseline on usenet2.
dataset_name = 'usenet2'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=40, win_size=40)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

      usenet20      


#batch: 100%|██████████| 36/36 [00:00<00:00, 76.20it/s]

Current r_seed acc,0.5506849315068493
Current r_seed f1,0.5454959606390087
(1460,)
(1460,)
Average acc,0.5506849315068493
Average f1,0.5454959606390087
Std acc,0.0





### spam

In [16]:
# Static GBDT baseline on the spam corpus.
dataset_name = 'spam_corpus_x2_feature_selected'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_base = dict(ini_train_size=100, win_size=100)
GBDT_base.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_baseline_GBDT,
              **GBDT_base)

spam_corpus_x2_feature_selected0


#batch: 100%|██████████| 92/92 [00:02<00:00, 35.45it/s]

Current r_seed acc,0.24794015611448394
Current r_seed f1,0.19867952393362873
(9224,)
(9224,)
Average acc,0.24794015611448394
Average f1,0.19867952393362873
Std acc,0.0





## Sliding GBDT

### elecNorm

In [20]:
# Sliding-window (retrain per batch) GBDT on elecNorm.
dataset_name = 'elecNorm'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=100, win_size=100)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

     elecNorm0      


#batch: 100%|██████████| 452/452 [09:37<00:00,  1.28s/it]

Current r_seed acc,0.7771609307263558
Current r_seed f1,0.7705452788928069
(45212,)
(45212,)
Average acc,0.7771609307263558
Average f1,0.7705452788928069
Std acc,0.0





### weather

In [21]:
# Sliding-window (retrain per batch) GBDT on weather.
dataset_name = 'weather'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=365, win_size=365)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

      weather0      


#batch: 100%|██████████| 69/69 [05:15<00:00,  4.57s/it]

Current r_seed acc,0.7714263093305886
Current r_seed f1,0.7280053004882026
(25261,)
(25261,)
Average acc,0.7714263093305886
Average f1,0.7280053004882026
Std acc,0.0





### airline

In [25]:
# Sliding-window (retrain per batch) GBDT on airline.
dataset_name = 'airline'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=100, win_size=100)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

      airline0      


#batch: 100%|██████████| 5392/5392 [1:55:16<00:00,  1.28s/it]


Current r_seed acc,0.608394850199246
Current r_seed f1,0.6003027347044299
(539283,)
(539283,)
Average acc,0.608394850199246
Average f1,0.6003027347044299
Std acc,0.0


### usenet1

In [22]:
# Sliding-window (retrain per batch) GBDT on usenet1.
dataset_name = 'usenet1'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=40, win_size=40)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

      usenet10      


#batch: 100%|██████████| 36/36 [00:20<00:00,  1.79it/s]

Current r_seed acc,0.7143835616438357
Current r_seed f1,0.7137441997255082
(1460,)
(1460,)
Average acc,0.7143835616438357
Average f1,0.7137441997255082
Std acc,0.0





### usenet2

In [23]:
# Sliding-window (retrain per batch) GBDT on usenet2.
dataset_name = 'usenet2'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=40, win_size=40)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

      usenet20      


#batch: 100%|██████████| 36/36 [00:20<00:00,  1.78it/s]

Current r_seed acc,0.7534246575342466
Current r_seed f1,0.6881408065618593
(1460,)
(1460,)
Average acc,0.7534246575342466
Average f1,0.6881408065618593
Std acc,0.0





### spam

In [24]:
# Sliding-window (retrain per batch) GBDT on the spam corpus.
dataset_name = 'spam_corpus_x2_feature_selected'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
GBDT_sliding = dict(ini_train_size=100, win_size=100)
GBDT_sliding.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_sliding_GBDT,
              **GBDT_sliding)

spam_corpus_x2_feature_selected0


#batch: 100%|██████████| 92/92 [02:14<00:00,  1.46s/it]

Current r_seed acc,0.8799869904596704
Current r_seed f1,0.8352914293479075
(9224,)
(9224,)
Average acc,0.8799869904596704
Average f1,0.8352914293479075
Std acc,0.0





## iGBDT

### elecNorm

In [27]:
# Naive incremental GBDT on elecNorm.
dataset_name = 'elecNorm'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
iGBDT = dict(ini_train_size=100, win_size=100, num_inc_tree=25)
iGBDT.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT, **iGBDT)

     elecNorm0      


#batch: 100%|██████████| 452/452 [06:19<00:00,  1.19it/s]


Current r_seed acc,0.7805228700345042
Current r_seed f1,0.7748526057951237
(45212,)
(45212,)
Average acc,0.7805228700345042
Average f1,0.7748526057951237
Std acc,0.0


### weather

In [28]:
# Naive incremental GBDT on weather.
dataset_name = 'weather'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
iGBDT = dict(ini_train_size=365, win_size=365, num_inc_tree=25)
iGBDT.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT, **iGBDT)

      weather0      


#batch: 100%|██████████| 69/69 [00:49<00:00,  1.39it/s]

Current r_seed acc,0.7911404932504651
Current r_seed f1,0.7528131703478292
(25261,)
(25261,)
Average acc,0.7911404932504651
Average f1,0.7528131703478292
Std acc,0.0





### airline

In [51]:
# Naive incremental GBDT on airline.
dataset_name = 'airline'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
iGBDT = dict(ini_train_size=100, win_size=100, num_inc_tree=25)
iGBDT.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT, **iGBDT)

      airline0      


#batch: 100%|██████████| 5392/5392 [12:23:47<00:00,  8.28s/it]  


Current r_seed acc,0.5938811347659763
Current r_seed f1,0.5891225945556153
(539283,)
(539283,)
Average acc,0.5938811347659763
Average f1,0.5891225945556153
Std acc,0.0


### usenet1

In [29]:
# Naive incremental GBDT on usenet1.
dataset_name = 'usenet1'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
iGBDT = dict(ini_train_size=40, win_size=40, num_inc_tree=25)
iGBDT.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT, **iGBDT)

      usenet10      


#batch: 100%|██████████| 36/36 [00:05<00:00,  6.77it/s]

Current r_seed acc,0.626027397260274
Current r_seed f1,0.6250112897398845
(1460,)
(1460,)
Average acc,0.626027397260274
Average f1,0.6250112897398845
Std acc,0.0





### usenet2

In [30]:
# Naive incremental GBDT on usenet2.
dataset_name = 'usenet2'

# Hyper-parameters of the underlying GBDT.
GBDT_pram = dict(max_iter=200, sample_rate=0.8, learn_rate=0.01, max_depth=4)

# Evaluation settings first, GBDT hyper-parameters merged in afterwards.
iGBDT = dict(ini_train_size=40, win_size=40, num_inc_tree=25)
iGBDT.update(GBDT_pram)

exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT, **iGBDT)

      usenet20      


#batch: 100%|██████████| 36/36 [00:05<00:00,  6.74it/s]

Current r_seed acc,0.697945205479452
Current r_seed f1,0.6212479065573067
(1460,)
(1460,)
Average acc,0.697945205479452
Average f1,0.6212479065573067
Std acc,0.0





### spam

In [31]:
# Naive incremental GBDT on the spam corpus.
# The settings are stored in `iGBDT`, like the other iGBDT cells, instead of
# overwriting the `GBDT_sliding` name used by the sliding-window experiments.
iGBDT = {
    'ini_train_size': 100,
    'win_size': 100,
    'num_inc_tree': 25
}

# Hyper-parameters of the underlying GBDT.
GBDT_pram = {
    'max_iter': 200,
    'sample_rate': 0.8,
    'learn_rate': 0.01,
    'max_depth': 4
}

iGBDT.update(GBDT_pram)
dataset_name = 'spam_corpus_x2_feature_selected'
exp_realworld(path, dataset_name, num_run, evaluation_naive_iGBDT,
              **iGBDT)

spam_corpus_x2_feature_selected0


#batch: 100%|██████████| 92/92 [00:51<00:00,  1.79it/s]

Current r_seed acc,0.9297484822202949
Current r_seed f1,0.9089566468578916
(9224,)
(9224,)
Average acc,0.9297484822202949
Average f1,0.9089566468578916
Std acc,0.0



