# KDD Experiments

## Import Modules and datasets

In [1]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

from streamfs import streamfs

In [2]:
# Load both datasets. For each one we keep the feature names (all columns except
# the dropped 'Risk' / 'Activity' column) and a NumPy copy of the full table.
credit_df = pd.read_csv('./datasets/cleaned_german_credit_score.csv')
# `columns=` replaces the positional axis argument (`drop('Risk', 1)`), which
# pandas deprecated in 1.3 and rejects with a TypeError from 2.0 onwards.
credit_feature_names = np.array(credit_df.drop(columns='Risk').columns)
credit_data = np.array(credit_df)

har_df = pd.read_csv('./datasets/human_activity_recognition.csv')
har_feature_names = np.array(har_df.drop(columns='Activity').columns)
har_data = np.array(har_df)

In [3]:
# Report the shape of each loaded array as a quick sanity check.
for template, array in (("Shape Credit Data: {}", credit_data),
                        ("Shape HAR: {}", har_data)):
    print(template.format(array.shape))

Shape Credit Data: (965, 24)
Shape HAR: (10299, 563)


In [4]:
# Split each array into features (X) and target (Y).
# NOTE(review): the integer is presumably the target's column index and the
# trailing `False` a shuffle flag -- confirm against streamfs.prepare_data.
credit_X, credit_Y = streamfs.prepare_data(credit_data, 23, False)

har_X, har_labels = streamfs.prepare_data(har_data, 562, False)
har_X = np.array(har_X, dtype='float')
# Encode the HAR labels as integer codes; the table of unique labels is not needed.
har_Y = np.array(pd.factorize(har_labels)[0])

In [5]:
# Bundle the prepared arrays per dataset so the experiment cells can look them up by name.
har = {'X': har_X, 'Y': har_Y}
credit = {'X': credit_X, 'Y': credit_Y}

# Insertion order (har first, then credit) is the order in which the datasets are iterated.
datasets = {'har': har, 'credit': credit}

## Run FS algorithms

### OFS
OFS supports only binary classification, so it is run on the credit score dataset only.

In [6]:
# Settings shared by every OFS run in this cell.
param = {
    'algorithm': 'knn',  # apply KNN classifier to calculate accuracy per time t
    'neighbors': 5,      # set n_neighbors for KNN
}

ofs_results = {}

# Disabled multi-dataset loop, kept as an inert string literal.
'''
# for different data sets
for name, data in datasets.items():
    X = data['X'].copy()
    Y = data['Y'].copy()
'''
# for one dataset
name = 'credit'
X = datasets[name]['X'].copy()
Y = datasets[name]['Y'].copy()
Y[Y == 0] = -1  # change 0 to -1, required by ofs

print(name)

# Every combination of number of selected features and batch size.
grid = [(n, b) for n in [5, 10, 20] for b in [50, 100, 200]]

for n_features, batch_size in grid:
    # `param` is deliberately the same dict object for every run.
    param['num_features'] = n_features
    param['batch_size'] = batch_size

    print(param)

    _, stats = streamfs.simulate_stream(X, Y, 'ofs', param)

    # NOTE(review): this key hard-codes "Credit" while the other experiment
    # cells use the lower-case `name`; kept as is so existing consumers of
    # ofs_results.json keep working.
    ofs_results["Credit;{}F;{}B".format(n_features, batch_size)] = stats

credit
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 200}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 200}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 200}


### FSDS
FSDS is designed for unsupervised learning and can also handle multi-class problems, so it is run on both the credit score and the HAR dataset.

In [7]:
# Settings shared by every FSDS run in this cell.
param = {
    'b': [],             # initial sketch matrix
    'ell': 0,            # initial sketch size
    'algorithm': 'knn',  # apply KNN classifier to calculate accuracy per time t
    'neighbors': 5,      # set n_neighbors for KNN
}

fsds_results = {}

# Run the full grid on every registered dataset.
for name, dataset in datasets.items():
    X, Y = dataset['X'].copy(), dataset['Y'].copy()

    # no. of singular values (can be equal to no. of classes)
    param['k'] = len(np.unique(Y))

    print(name)

    # number of features to select
    for n_features in [5, 10, 20]:
        param['num_features'] = n_features

        # batch size for one iteration; must be at least as large as k
        for batch_size in [50, 100, 200]:
            # Set the batch size and reset the sketch matrix and sketch size
            # to their initial values before every run.
            param.update(batch_size=batch_size, b=[], ell=0)

            print(param)

            w, stats = streamfs.simulate_stream(X, Y, 'fsds', param)

            result_key = "{};{}F;{}B".format(name, n_features, batch_size)
            fsds_results[result_key] = stats

har
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 5, 'batch_size': 50}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 5, 'batch_size': 100}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 5, 'batch_size': 200}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 10, 'batch_size': 50}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 10, 'batch_size': 100}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 10, 'batch_size': 200}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 20, 'batch_size': 50}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 20, 'batch_size': 100}
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 6, 'num_features': 20, 'batch_size': 200}
credit
{'b': [], 'ell': 0, 'algorithm': 'knn', 'neighbors': 5, 'k': 2, 'num

### MCNN
MCNN is extremely slow on the HAR dataset, so it is run on the credit score dataset only.

In [8]:
# Settings shared by every MCNN run in this cell.
param = {
    'algorithm': 'knn',  # apply KNN classifier to calculate accuracy per time t
    'neighbors': 5,      # set n_neighbors for KNN

    # Original parameters from paper
    'max_n': 100,        # maximum number of saved instances per cluster
    'e_threshold': 3,    # error threshold for splitting of a cluster

    # Additional parameters
    'boundary_var_multiplier': 2,  # multiplier for the var. boundary of the closest centroid (run_mcnn())
    'p_diff_threshold': 50,        # threshold of perc. diff. for split/death rate when drift is assumed (_detect_drift())
}

mcnn_results = {}

# Disabled multi-dataset loop, kept as an inert string literal.
'''
# for different data sets
for name, data in datasets.items():
    X = data['X'].copy()
    Y = data['Y'].copy()
'''
# for one dataset
name = 'credit'
X = datasets[name]['X'].copy()
Y = datasets[name]['Y'].copy()

print(name)

# Every combination of number of selected features and batch size
# (batch size = number of instances for one iteration).
grid = [(n, b) for n in [5, 10, 20] for b in [50, 100, 200]]

for n_features, batch_size in grid:
    # `param` is deliberately the same dict object for every run.
    param['num_features'] = n_features
    param['batch_size'] = batch_size

    print(param)

    w, stats = streamfs.simulate_stream(X, Y, 'mcnn', param)

    result_key = "{};{}F;{}B".format(name, n_features, batch_size)
    mcnn_results[result_key] = stats

credit
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 200}
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'max_n': 100, 'e_threshold': 3, 'boundary_var_multiplier': 2, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 200}
{'algorithm': 'knn', 'neig

### CancelOut

binary classification -> credit score data

In [11]:
# Settings shared by every CancelOut run in this cell.
param = {
    'algorithm': 'knn',  # apply KNN classifier to calculate accuracy per time t
    'neighbors': 5,      # set n_neighbors for KNN
}

canc_results = {}

# Disabled multi-dataset loop, kept as an inert string literal.
'''
# for different data sets
for name, data in datasets.items():
    X = data['X'].copy()
    Y = data['Y'].copy()
'''
# for one dataset
name = 'credit'
X = datasets[name]['X'].copy()
Y = datasets[name]['Y'].copy()

# Standardize the features; the scaler is fitted on the complete feature
# matrix before the stream is simulated.
scaler = StandardScaler()
X = scaler.fit_transform(X)

print(name)

# Every combination of number of selected features and batch size
# (batch size = number of instances for one iteration).
grid = [(n, b) for n in [5, 10, 20] for b in [50, 100, 200]]

for n_features, batch_size in grid:
    # `param` is deliberately the same dict object for every run.
    param['num_features'] = n_features
    param['batch_size'] = batch_size

    print(param)

    # CancelOut is selected through the 'nnfs' key.
    w, stats = streamfs.simulate_stream(X, Y, 'nnfs', param)

    result_key = "{};{}F;{}B".format(name, n_features, batch_size)
    canc_results[result_key] = stats

credit
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 5, 'batch_size': 200}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 10, 'batch_size': 200}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 50}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 100}
{'algorithm': 'knn', 'neighbors': 5, 'num_features': 20, 'batch_size': 200}


### Save results to JSON

In [12]:
# Write each algorithm's results dict to its own JSON file.
results_by_file = {
    './experiment_results/ofs_results.json': ofs_results,
    './experiment_results/fsds_results.json': fsds_results,
    './experiment_results/mcnn_results.json': mcnn_results,
    './experiment_results/canc_results.json': canc_results,
}

# Create the output directory first: open(..., 'w') raises FileNotFoundError
# when it is missing, which would lose the results of the experiments above.
os.makedirs('./experiment_results', exist_ok=True)

for path, results in results_by_file.items():
    with open(path, 'w') as fp:
        json.dump(results, fp)

In [16]:
# Read the saved CancelOut results back from disk for inspection.
canc_results_path = './experiment_results/canc_results.json'
with open(canc_results_path, 'r') as fp:
    data = json.loads(fp.read())

In [17]:
data  # bare last expression: renders the loaded results dict as the cell output

{'credit;5F;50B': {'time_measures': [1.9260037199999829,
   2.007582324000026,
   1.9317811939999956,
   2.0343368730000293,
   2.0952082800000085,
   2.095908424000015,
   2.1262803850000296,
   2.0975945270000125,
   1.9737985749999893,
   1.9815500219999649,
   2.0034159010000394,
   2.011317967000025,
   2.023897940999973,
   2.0302074630000106,
   2.0180526120000195,
   2.134567478000008,
   2.0823645950000014,
   2.0530181759999664,
   2.053342545000021,
   2.0746779180000203],
  'memory_measures': [8192,
   16384,
   24576,
   36864,
   45056,
   53248,
   61440,
   73728,
   81920,
   90112,
   98304,
   110592,
   118784,
   126976,
   135168,
   147456,
   155648,
   163840,
   172032,
   176128],
  'acc_measures': [1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0,
   0.98,
   1.0,
   1.0,
   1.0,
   1.0,
   0.98,
   1.0,
   1.0,
   1.0,
   1.0,
   1.0],
  'features': [[22, 21, 15, 16, 19],
   [21, 22, 16, 20, 14],
   [22, 21, 16, 15, 18],
   [22, 21, 16, 1