# KDD Experiments

## Import Modules and datasets

In [1]:
import numpy as np
import pandas as pd
import json
from streamfs import streamfs

In [2]:
# Load every dataset from CSV, remember the names of its non-target columns,
# and then convert the complete table (target column included) to a NumPy array.
#
# `drop(columns=...)` is used instead of the positional form `drop(label, 1)`:
# passing `axis` positionally was deprecated in pandas 1.3 and raises a
# TypeError from pandas 2.0 onwards.

# Credit data: target column is 'Risk'.
credit_data = pd.read_csv('./datasets/credit.csv')
credit_feature_names = np.array(credit_data.drop(columns='Risk').columns)
credit_data = np.array(credit_data)

# HAR (binary) data: target column is 'walking'.
har_data = pd.read_csv('./datasets/har_binary.csv')
har_feature_names = np.array(har_data.drop(columns='walking').columns)
har_data = np.array(har_data)

# Usenet data: target column is 'target'.
usenet_data = pd.read_csv('./datasets/usenet.csv')
usenet_feature_names = np.array(usenet_data.drop(columns='target').columns)
usenet_data = np.array(usenet_data)

# KDD cup data: target column is 'target'.
kdd_data = pd.read_csv('./datasets/kddcup.csv')
kdd_feature_names = np.array(kdd_data.drop(columns='target').columns)
kdd_data = np.array(kdd_data)

In [3]:
# Report the (rows, columns) shape of every raw data array.
shape_reports = (
    ("Shape Credit Data: {}", credit_data),
    ("Shape HAR: {}", har_data),
    ("Shape Usenet Data: {}", usenet_data),
    ("Shape KDD Data: {}", kdd_data),
)
for template, array in shape_reports:
    print(template.format(array.shape))

Shape Credit Data: (965, 24)
Shape HAR: (4722, 563)
Shape Usenet Data: (5931, 659)
Shape KDD Data: (250000, 42)


In [4]:
# Separate the target from each raw array, yielding one (X, Y) pair per dataset.
# NOTE(review): the arguments (0, False) are forwarded unchanged to
# streamfs.prepare_data; presumably 0 is the target column index and False
# switches off an optional preprocessing step -- confirm against streamfs.
(credit_X, credit_Y), (har_X, har_Y), (usenet_X, usenet_Y), (kdd_X, kdd_Y) = [
    streamfs.prepare_data(raw, 0, False)
    for raw in (credit_data, har_data, usenet_data, kdd_data)
]

In [5]:
# One {'X': ..., 'Y': ...} record per dataset.
har = {'X': har_X, 'Y': har_Y}
credit = {'X': credit_X, 'Y': credit_Y}
usenet = {'X': usenet_X, 'Y': usenet_Y}
kdd = {'X': kdd_X, 'Y': kdd_Y}

# Registry the experiment loops iterate over; insertion order (har, credit,
# usenet) is the order in which the datasets are processed.
datasets = {'har': har, 'credit': credit, 'usenet': usenet}
# TODO: kdd is not registered because it is too big -- open question: does drawing a sample make sense?
# datasets['kdd'] = kdd

## Batch size and no. of selected features

In [6]:
# Numbers of selected features to evaluate.
p = [5, 10, 20]

# Batch sizes per dataset: 10%, 15% and 20% of the dataset's row count,
# each passed through round().
b = {
    label: [round(features.shape[0] * share) for share in (0.1, 0.15, 0.2)]
    for label, features in (('har', har_X), ('credit', credit_X), ('usenet', usenet_X))
}

In [7]:
# Display the batch sizes computed for each dataset.
b

{'har': [472, 708, 944], 'credit': [96, 145, 193], 'usenet': [593, 890, 1186]}

## Run FS algorithms

### OFS
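OFS only supports binary classification, so it was intended for the Credit Score dataset. Note that the cell below iterates over every entry of `datasets` (HAR, Credit, Usenet) and recodes the label 0 to -1, as OFS requires.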

In [11]:
# Parameters shared by all OFS runs; 'num_features' and 'batch_size' are set per run below.
param = {'algorithm': 'svm'}  # an SVM classifier is applied to calculate the accuracy per time t

# Statistics kept from every run (in storage order) and the template of the result key.
kept_stats = ['acc_avg', 'time_avg', 'fscr_avg', 'acc_measures', 'features', 'fscr_measures']
result_key = "{};{}F;{}B"  # <dataset>;<no. of features>F;<batch size>B

ofs_results = dict()

# Grid: every dataset x every number of features x every batch size of that dataset.
for name, data in datasets.items():
    # Copies keep the label recoding below away from the arrays stored in `datasets`.
    X, Y = data['X'].copy(), data['Y'].copy()
    Y[Y == 0] = -1  # OFS requires the label 0 to be recoded as -1

    print(name)

    for n in p:
        param['num_features'] = n

        for batch in b[name]:
            param['batch_size'] = batch

            print(param)

            _, stats = streamfs.simulate_stream(X, Y, 'ofs', param)

            ofs_results[result_key.format(name, n, batch)] = {key: stats[key] for key in kept_stats}

har
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 708}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 944}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 708}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 944}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 708}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 944}
credit
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 96}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 145}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 193}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 96}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 145}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 193}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 96}
{'algorithm': 'svm', 'num_features': 2

In [12]:
# Write the collected OFS results to disk as JSON.
ofs_results_file = './experiment_results/ofs_results.json'
with open(ofs_results_file, 'w') as fp:
    json.dump(ofs_results, fp)

### FSDS
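FSDS is designed for unsupervised learning and can also handle multilabel problems. It was intended for the Credit Score and HAR datasets; note that the cell below iterates over every entry of `datasets` (HAR, Credit, Usenet).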

In [18]:
# Parameters shared by all FSDS runs. The key order here determines the order
# in which print(param) shows them.
param = {
    'b': [],             # initial sketch matrix
    'ell': 0,            # initial sketch size
    'algorithm': 'svm',  # an SVM classifier is applied to calculate the accuracy per time t
}

# Statistics kept from every run (in storage order) and the template of the result key.
kept_stats = ['acc_avg', 'time_avg', 'fscr_avg', 'acc_measures', 'features', 'fscr_measures']
result_key = "{};{}F;{}B"  # <dataset>;<no. of features>F;<batch size>B

fsds_results = dict()

# Grid: every dataset x every number of features x every batch size of that dataset.
for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    # no. of singular values, set here to the number of distinct labels in Y
    param['k'] = len(np.unique(Y))

    print(name)

    for n in p:
        param['num_features'] = n

        # The batch size of one iteration must be at least as large as k.
        for batch in b[name]:
            param['batch_size'] = batch

            # Every run starts again from an empty sketch matrix of size 0.
            param.update(b=[], ell=0)

            print(param)

            w, stats = streamfs.simulate_stream(X, Y, 'fsds', param)

            fsds_results[result_key.format(name, n, batch)] = {key: stats[key] for key in kept_stats}

har
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 5, 'batch_size': 472}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 5, 'batch_size': 708}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 5, 'batch_size': 944}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 10, 'batch_size': 472}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 10, 'batch_size': 708}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 10, 'batch_size': 944}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 20, 'batch_size': 472}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 20, 'batch_size': 708}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 20, 'batch_size': 944}
credit
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 5, 'batch_size': 96}
{'b': [], 'ell': 0, 'algorithm': 'svm', 'k': 2, 'num_features': 5, 'batch_size': 145}
{'b': [], 'ell': 0, 'algorithm': 'svm'

In [20]:
# Write the collected FSDS results to disk as JSON.
fsds_results_file = './experiment_results/fsds_results.json'
with open(fsds_results_file, 'w') as fp:
    json.dump(fsds_results, fp)

### MCNN
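MCNN is extremely slow on the HAR dataset, so it was intended to run on the Credit Score dataset only. Note that the cell below still iterates over every entry of `datasets`, HAR included.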

In [21]:
# Parameters shared by all MCNN runs. The key order here determines the order
# in which print(param) shows them.
param = {
    'algorithm': 'svm',           # an SVM classifier is applied to calculate the accuracy per time t
    # Original parameters from the paper
    'max_n': 100,                 # maximum number of saved instances per cluster
    'e_threshold': 3,             # error threshold for splitting a cluster
    # Additional parameters
    'max_out_of_var_bound': 0.5,  # share of variables that may lie outside the variance boundary before a new cluster is created
    'p_diff_threshold': 50,       # threshold of perc. diff. for split/death rate at which drift is assumed (_detect_drift())
}

# Statistics kept from every run (in storage order) and the template of the result key.
kept_stats = ['acc_avg', 'time_avg', 'fscr_avg', 'acc_measures', 'features', 'fscr_measures']
result_key = "{};{}F;{}B"  # <dataset>;<no. of features>F;<batch size>B

mcnn_results = dict()

# Grid: every dataset x every number of features x every batch size of that dataset.
for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    print(name)

    for n in p:
        param['num_features'] = n

        # batch size used for one iteration of the stream
        for batch in b[name]:
            param['batch_size'] = batch

            print(param)

            w, stats = streamfs.simulate_stream(X, Y, 'mcnn', param)

            mcnn_results[result_key.format(name, n, batch)] = {key: stats[key] for key in kept_stats}

har
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 472}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 708}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 5, 'batch_size': 944}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 472}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 708}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 10, 'batch_size': 944}
{'algorithm': 'svm', 'max_n': 100, 'e_threshold': 3, 'max_out_of_var_bound': 0.5, 'p_diff_threshold': 50, 'num_features': 20, 'ba

In [22]:
# Write the collected MCNN results to disk as JSON.
mcnn_results_file = './experiment_results/mcnn_results.json'
with open(mcnn_results_file, 'w') as fp:
    json.dump(mcnn_results, fp)

### CancelOut


In [14]:
# Parameters shared by all CancelOut runs; 'num_features' and 'batch_size' are set per run below.
param = {'algorithm': 'svm'}  # an SVM classifier is applied to calculate the accuracy per time t

# Statistics kept from every run (in storage order); unlike the other
# experiment cells this one also stores 'memory_avg'.
kept_stats = ['acc_avg', 'time_avg', 'memory_avg', 'fscr_avg', 'acc_measures', 'features', 'fscr_measures']
result_key = "{};{}F;{}B"  # <dataset>;<no. of features>F;<batch size>B

canc_results = dict()

# Grid: every dataset x every number of features x every batch size of that dataset.
for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    print(name)

    for n in p:
        param['num_features'] = n

        # batch size used for one iteration of the stream
        for batch in b[name]:
            param['batch_size'] = batch

            print(param)

            # CancelOut is requested from streamfs under the name 'nnfs'.
            w, stats = streamfs.simulate_stream(X, Y, 'nnfs', param)

            canc_results[result_key.format(name, n, batch)] = {key: stats[key] for key in kept_stats}

har
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 708}
EarlyStopping counter: 1 out of 3
EarlyStopping counter: 2 out of 3
EarlyStopping counter: 3 out of 3
Early stopping
EarlyStopping counter: 1 out of 3
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 944}
EarlyStopping counter: 1 out of 3
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 708}
{'algorithm': 'svm', 'num_features': 10, 'batch_size': 944}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 472}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 708}
{'algorithm': 'svm', 'num_features': 20, 'batch_size': 944}
credit
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 96}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 145}
{'algorithm': 'svm', 'num_features': 5, 'batch_size': 193}
EarlyStopping counter: 1 out of 3
EarlyStopping counter: 2 out of 3
EarlyStopping counter: 

In [15]:
# Write the collected CancelOut results to disk as JSON.
canc_results_file = './experiment_results/canc_results.json'
with open(canc_results_file, 'w') as fp:
    json.dump(canc_results, fp)

In [16]:
# Read the stored CancelOut results back from their JSON file.
with open('./experiment_results/canc_results.json', 'r') as fp:
    data = json.loads(fp.read())

In [17]:
# Display the results that were read back from canc_results.json.
data

{'har;5F;472B': {'acc_avg': 86.99922958397536,
  'time_avg': 2661.965953181819,
  'memory_avg': 8513722.181818182,
  'fscr_avg': 0.74,
  'acc_measures': [0.8728813559322034,
   0.7033898305084746,
   0.836864406779661,
   0.885593220338983,
   0.8601694915254238,
   0.9216101694915254,
   0.8834745762711864,
   0.8919491525423728,
   0.8580508474576272,
   0.8559322033898306,
   1.0],
  'features': [[198, 62, 37, 117, 297],
   [198, 48, 45, 37, 117],
   [297, 561, 198, 199, 37],
   [198, 152, 362, 448, 232],
   [117, 157, 177, 236, 476],
   [159, 297, 65, 66, 199],
   [65, 66, 43, 59, 67],
   [537, 37, 117, 512, 550],
   [37, 117, 157, 297, 159],
   [198, 162, 117, 37, 297],
   [377, 125, 134, 103, 559]],
  'fscr_measures': [0.4, 0.6, 0.8, 1.0, 1.0, 0.6, 1.0, 0.6, 0.4, 1.0]},
 'har;5F;708B': {'acc_avg': 87.23756398075214,
  'time_avg': 5097.424668571437,
  'memory_avg': 575780.5714285715,
  'fscr_avg': 0.7666666666666666,
  'acc_measures': [0.78954802259887,
   0.9067796610169492,
   0