# KDD Experiments

## Import Modules and datasets

In [None]:
import numpy as np
import pandas as pd
import json
from streamfs import streamfs

In [None]:
def _load_dataset(csv_path, target_column):
    """Read a CSV file and return its values plus the non-target column names.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file.
    target_column : str
        Name of the label column; it is left out of the returned feature
        names but stays part of the returned data array.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        All values of the file (target column included) and the names of the
        remaining columns.
    """
    frame = pd.read_csv(csv_path)
    # Use `columns=` rather than the positional axis argument
    # (`drop(label, 1)`): pandas deprecated positional `axis` in 1.3 and
    # raises a TypeError for it from 2.0 on.
    feature_names = np.array(frame.drop(columns=target_column).columns)
    return np.array(frame), feature_names


credit_data, credit_feature_names = _load_dataset('./datasets/credit.csv', 'Risk')

har_data, har_feature_names = _load_dataset('./datasets/har_binary.csv', 'walking')

usenet_data, usenet_feature_names = _load_dataset('./datasets/usenet.csv', 'target')

kdd_data, kdd_feature_names = _load_dataset('./datasets/kddcup.csv', 'target')

In [None]:
# Report the dimensions of every loaded data array
print(f"Shape Credit Data: {credit_data.shape}")
print(f"Shape HAR: {har_data.shape}")
print(f"Shape Usenet Data: {usenet_data.shape}")
print(f"Shape KDD Data: {kdd_data.shape}")

In [None]:
# Split every raw array into its (X, Y) pair via streamfs.prepare_data.
# NOTE(review): the arguments 0 and False are passed unchanged for all
# datasets; presumably 0 is the target column index -- confirm against streamfs.
(credit_X, credit_Y), (har_X, har_Y), (usenet_X, usenet_Y), (kdd_X, kdd_Y) = (
    streamfs.prepare_data(raw_array, 0, False)
    for raw_array in (credit_data, har_data, usenet_data, kdd_data)
)

In [None]:
# One {'X': ..., 'Y': ...} mapping per dataset
har = {'X': har_X, 'Y': har_Y}
credit = {'X': credit_X, 'Y': credit_Y}
usenet = {'X': usenet_X, 'Y': usenet_Y}
kdd = {'X': kdd_X, 'Y': kdd_Y}

# Datasets the experiments below iterate over, in this order.
# NOTE: `kdd` is deliberately not registered -- it is too big; open question
# whether drawing a sample from it would make sense.
datasets = {
    'har': har,
    'credit': credit,
    'usenet': usenet,
}

## Run FS algorithms

### OFS
OFS only supports binary classification -> Credit Score dataset

In [None]:
# OFS experiment grid: every dataset x number of features x batch size.
param = {'algorithm': 'svm'}  # apply SVM classifier to calculate accuracy per time t

ofs_results = {}

for name, data in datasets.items():
    # work on copies so the relabelling below does not touch `datasets`
    X, Y = data['X'].copy(), data['Y'].copy()

    Y[Y == 0] = -1  # ofs requires the label -1 instead of 0

    print(name)

    # all combinations of number of features (outer) and batch size (inner)
    for n, b in [(feats, batch) for feats in (5, 10, 20) for batch in (50, 100, 200)]:
        param['num_features'] = n
        param['batch_size'] = b

        print(param)

        _, stats = streamfs.simulate_stream(X, Y, 'ofs', param)

        # keep only the reported statistics, keyed by "<dataset>;<n>F;<b>B"
        ofs_results[f"{name};{n}F;{b}B"] = {
            metric: stats[metric]
            for metric in (
                'acc_avg', 'time_avg', 'memory_avg', 'fscr_avg',
                'acc_measures', 'features', 'fscr_measures',
            )
        }

In [None]:
# Persist the OFS results as JSON
with open('./experiment_results/ofs_results.json', mode='w') as fp:
    json.dump(obj=ofs_results, fp=fp)

### FSDS
FSDS is designed for unsupervised learning and can also handle multilabel problems -> Credit Score and HAR datasets

In [None]:
# FSDS experiment grid: every dataset x number of features x batch size.
param = {
    'b': [],  # initial sketch matrix
    'ell': 0,  # initial sketch size
    'algorithm': 'svm',  # apply SVM classifier to calculate accuracy per time t
}

fsds_results = {}

for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    # no. of singular values, set here to the number of distinct labels in Y
    param['k'] = len(np.unique(Y))

    print(name)

    # all combinations of number of features (outer) and batch size (inner)
    for n, b in [(feats, batch) for feats in (5, 10, 20) for batch in (50, 100, 200)]:
        param['num_features'] = n
        # batch size for one iteration, must be at least as large as k
        param['batch_size'] = b

        # start every run from an empty sketch matrix and sketch size
        param.update({'b': [], 'ell': 0})

        print(param)

        w, stats = streamfs.simulate_stream(X, Y, 'fsds', param)

        # keep only the reported statistics, keyed by "<dataset>;<n>F;<b>B"
        fsds_results[f"{name};{n}F;{b}B"] = {
            metric: stats[metric]
            for metric in (
                'acc_avg', 'time_avg', 'memory_avg', 'fscr_avg',
                'acc_measures', 'features', 'fscr_measures',
            )
        }

In [None]:
# Persist the FSDS results as JSON
with open('./experiment_results/fsds_results.json', mode='w') as fp:
    json.dump(obj=fsds_results, fp=fp)

### MCNN
MCNN is extremely slow on the HAR dataset -> thus only the Credit Score dataset

In [None]:
# MCNN experiment grid: every dataset x number of features x batch size.
param = {
    'algorithm': 'svm',  # apply SVM classifier to calculate accuracy per time t
    # Original parameters from paper
    'max_n': 100,  # maximum number of saved instances per cluster
    'e_threshold': 3,  # error threshold for splitting of a cluster
    # Additional parameters
    # percentage of variables that can at most be outside of variance boundary
    # before new cluster is created
    'max_out_of_var_bound': 0.5,
    # threshold of perc. diff. for split/death rate when drift is assumed
    # (_detect_drift())
    'p_diff_threshold': 50,
}

mcnn_results = {}

for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    print(name)

    # all combinations of number of features (outer) and batch size (inner)
    for n, b in [(feats, batch) for feats in (5, 10, 20) for batch in (50, 100, 200)]:
        param['num_features'] = n
        param['batch_size'] = b  # batch size for one iteration

        print(param)

        w, stats = streamfs.simulate_stream(X, Y, 'mcnn', param)

        # keep only the reported statistics, keyed by "<dataset>;<n>F;<b>B"
        mcnn_results[f"{name};{n}F;{b}B"] = {
            metric: stats[metric]
            for metric in (
                'acc_avg', 'time_avg', 'memory_avg', 'fscr_avg',
                'acc_measures', 'features', 'fscr_measures',
            )
        }

In [None]:
# Persist the MCNN results as JSON
with open('./experiment_results/mcnn_results.json', mode='w') as fp:
    json.dump(obj=mcnn_results, fp=fp)

### CancelOut


In [None]:
# CancelOut experiment grid (run through the 'nnfs' option of simulate_stream):
# every dataset x number of features x batch size.
param = {'algorithm': 'svm'}  # apply SVM classifier to calculate accuracy per time t

canc_results = {}

for name, data in datasets.items():
    X, Y = data['X'].copy(), data['Y'].copy()

    print(name)

    # all combinations of number of features (outer) and batch size (inner)
    for n, b in [(feats, batch) for feats in (5, 10, 20) for batch in (50, 100, 200)]:
        param['num_features'] = n
        param['batch_size'] = b  # batch size for one iteration

        print(param)

        w, stats = streamfs.simulate_stream(X, Y, 'nnfs', param)

        # keep only the reported statistics, keyed by "<dataset>;<n>F;<b>B"
        canc_results[f"{name};{n}F;{b}B"] = {
            metric: stats[metric]
            for metric in (
                'acc_avg', 'time_avg', 'memory_avg', 'fscr_avg',
                'acc_measures', 'features', 'fscr_measures',
            )
        }

In [None]:
# Persist the CancelOut results as JSON
with open('./experiment_results/canc_results.json', mode='w') as fp:
    json.dump(obj=canc_results, fp=fp)

In [None]:
# Read the stored CancelOut results back from the JSON file
with open('./experiment_results/canc_results.json', mode='r') as fp:
    data = json.load(fp=fp)

In [None]:
# Show the loaded results as the cell output
data