In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.tree import DecisionTreeClassifier
from collections import Counter, defaultdict
from sklearn.naive_bayes import BernoulliNB
from IPython.display import display

In [44]:
import sys
import os
tornado_path = os.path.abspath('../../tornado')
sys.path.insert(0, tornado_path)
from drift_detection.__init__ import *

In [4]:
random.choice([1,2,3])

2

In [7]:
df = pd.DataFrame({'x': [1,2,3], 'y': [2,4,8]})
df2 = pd.DataFrame({'x':[11,12,13], 'y':[12,14,18]})
df.sample(1)

Unnamed: 0,x,y
1,2,4


In [8]:
df2.loc[1, :] = [0, 100]
df2

Unnamed: 0,x,y
0,11,12
1,0,100
2,13,18


In [41]:
len(np.where(df2['x']%2==1)[0])

2

In [11]:
df2.loc[2, :] = df.sample(1).values
df2

Unnamed: 0,x,y
0,11,12
1,0,100
2,1,2


In [38]:
list(df2['x'].index).filter(lambda x: df2.loc[x, 'x']%1==1)

AttributeError: 'RangeIndex' object has no attribute 'filter'

In [22]:
df3 = pd.DataFrame({'x':[1], 'y':[2], 'z':[3]})
cols = set(df3)
cols -= set(['x', 'y'])
cols

{'z'}

In [24]:
df3[['y']]

Unnamed: 0,y
0,2


In [25]:
df3.sample(frac=1, replace=True)

Unnamed: 0,x,y,z
0,1,2,3


In [34]:
df3.drop(columns=['x'])

Unnamed: 0,y,z
0,2,3


In [48]:
random.choices(list(df3.x), k=3)

[1, 1, 1]

In [28]:
help(random.choice)

Help on method choice in module random:

choice(seq) method of random.Random instance
    Choose a random element from a non-empty sequence.



In [31]:
random.choices([1,2,3,4], k=5)

[4, 3, 1, 3, 2]

In [35]:
help(random.sample)

Help on method sample in module random:

sample(population, k) method of random.Random instance
    Chooses k unique random elements from a population sequence or set.
    
    Returns a new list containing elements from the population while
    leaving the original population unchanged.  The resulting list is
    in selection order so that all sub-slices will also be valid random
    samples.  This allows raffle winners (the sample) to be partitioned
    into grand prize and second place winners (the subslices).
    
    Members of the population need not be hashable or unique.  If the
    population contains repeats, then each occurrence is a possible
    selection in the sample.
    
    To choose a sample in a range of integers, use range as an argument.
    This is especially fast and space efficient for sampling from a
    large population:   sample(range(10000000), 60)



In [130]:
# I am here assuming that there is no 'date' column

# All of these scenario functions return:
#   1. a copy of the data with the scenario applied to it
#   2. the streams which should experience change

# All scenarios should experience a change in the error rate

def shuffle_features(data, classes, shuffle_n=5):
    # pick shuffle_n features columns and shuffle them
    ret = data.copy()
    features = set(data.columns) - set(['label'])
    to_shuffle = random.sample(features, shuffle_n)
    for i in range(shuffle_n):
        ret[to_shuffle[i]] = ret[to_shuffle[(i+1)%shuffle_n]]
    return ret, to_shuffle + [f'label={i}' for i in classes]
    
def not_feature(data, classes):
    # randomly choose a feature x and replace values with not x
    ret = data.copy()
    features = set(data.columns) - set(['label'])
    invert_col = random.choice(list(features))
    ret[invert_col] = [ not i for i in ret[invert_col] ]
    return ret, [invert_col] + [f'label={i}' for i in classes]

def swap_classes(data, classes):
    # pick two classes and replace half of instances of class 1 with instances of class 2
    ret = data.copy()
    active_classes = list(set(data['label']))
    incr_class, decr_class = random.sample(active_classes, 2)
    replace_with = data[ data['label'] == incr_class ]
    to_replace = np.where(data['label'] == decr_class )[0] # output is wrapped in a tuple for some reason
    to_replace = list(to_replace)
    replace_n = len(to_replace) // 2
    to_replace = random.sample(to_replace, replace_n)
    for replace_i in to_replace:
        replacer = replace_with.sample(1).values
        for j in range(len(ret.columns)):
            ret.iloc[replace_i, j] = replacer[0][j]
#         ret.iloc[replace_i, :] = 
    features = set(data.columns) - set(classes)
    return ret, [f'label={i}' for i in [decr_class, incr_class] ] + list(features) # class rates should change plus all feature rates

def new_concept(data, classes):
    # drop classes columns
    no_classes = data.drop(columns=['label'])
    # train new concept
    dtree = DecisionTreeClassifier()
    n_subsamples = 10
    dtree.fit(no_classes.sample(n_subsamples), random.choices(list(data['label']), k=n_subsamples))
    # predict labels
    labels = dtree.predict(no_classes)
    # reset classes
    ret = data.copy()
    ret['label'] = labels
    return ret, [f'label={i}' for i in classes] + ['err'] # dtree

scenarios = 'shuffle_features not_feature swap_classes new_concept'.split()

def add_made_up_labels(no_classes, classes):
    # NB: no_classes is NOT number of classes. It's a df without any classes/labels
    # train new concept
    dtree = DecisionTreeClassifier()
    n_subsamples = 10
    dtree.fit(no_classes.sample(n_subsamples), random.choices(classes, k=n_subsamples))
    # predict labels
    labels = dtree.predict(no_classes)
    # reset classes
    ret = no_classes.copy()
    ret['label'] = labels
    return ret

def detect_stream_drift(stream, detector):
    # NB stream should be a list
    for i in range(len(stream)):
        _, drift_status = detector.run(stream[i])
        if drift_status == True:
            return i
    return False

def detect_drifting_streams(data, detector_algorithm, delta):
    # returns dictionary mapping column name -> point of drift detection
    # for all the columns in which drift was detected
    ret = {}
    for col in data.columns:
        detector = detector_algorithm(delta=delta)
        stream = list(data[col])
        detection = detect_stream_drift(stream, detector)
        ret[col] = detection
#         if detection:
#             ret[col] = detection
    return ret

DRIFT_POINT = 500

def evaluate_detector(features, labels, err, classes, detector_algorithm, delta, drifting_streams):
    # drifting streams is the correct labels of which are drifting
    
    detections = {}
    
    # err
    err_df = pd.DataFrame({'err': err})
    detections['err'] = detect_drifting_streams(err_df, detector_algorithm, delta)
    
    # labels
    labels = list(labels)
    label_df = pd.DataFrame({f'label={class_}': [False for i in range(len(labels))] for class_ in classes})
    for i in range(len(labels)):
        label_df.loc[i, f'label={labels[i]}'] = True
    detections['label'] = detect_drifting_streams(label_df, detector_algorithm, delta)
    
    # features
    detections['feature'] = detect_drifting_streams(features, detector_algorithm, delta)
    
    metrics = defaultdict(Counter)
    for thing in ['err', 'label', 'feature']:
        det = detections[thing]
        metrics[thing]['delay'] = []
        for col in det.keys():
            col_det = det[col]
            if col_det == False:
                if col in drifting_streams:
                    metrics[thing]['fn'] += 1
                else:
                    metrics[thing]['tn'] += 1
            elif col_det and col_det < DRIFT_POINT:
                metrics[thing]['fp'] += 1
            else:
                if col in drifting_streams:
                    metrics[thing]['tp'] += 1
                    metrics[thing]['delay'].append(col_det)
                else:
                    metrics[thing]['fp'] += 1
    return metrics, detections
    

def run_experiment(train, test, classes, learner, scenario, detector_algorithm, delta):
    # train and test should initially have a single 'label' column
    
    # train model
    model = learner()
    model.fit(train.drop(columns=['label']), train['label'])
    
    # generate test dataset
    split_at = len(test) // 2
    test1 = test.loc[:split_at, :]
    test2 = test.loc[split_at:, :]
    scenario_f = eval(scenario)
    test2, changing_streams = scenario_f(test2, classes)
    changing_streams += ['err']
    test = pd.concat([test1, test2])
    y_hat = model.predict(test.drop(columns=['label']))
    y = test['label']
    errs = [ i!=j for i,j in zip(y, y_hat) ]
    
    # run drift detector
    no_comparisons = len(classes) + len(train.columns) # minus one for 'label' and plus one for 'err'
    # delta = delta / no_comparisons # bonferonni correction
    metrics, detections = evaluate_detector(test.drop(columns=['label']), y, errs, classes, detector_algorithm, delta, changing_streams)
    
    return metrics, detections

In [131]:
data = pd.read_csv('synthetic.csv').drop(columns='date')
classes = [1,2,3,4,5]
data = add_made_up_labels(data, classes)
test = data.loc[500:1500, :]
train = data.loc[:500, :]
detector_algorithm = ADWINChangeDetector
delta = 0.5
learner = BernoulliNB

for scenario in scenarios:
    print(scenario)
    metrics, detections = run_experiment(train, test, classes, learner, scenario, detector_algorithm, delta)
    display(metrics)

shuffle_features


defaultdict(collections.Counter,
            {'err': Counter({'delay': [], 'fn': 1}),
             'label': Counter({'delay': [543], 'fn': 4, 'tp': 1}),
             'feature': Counter({'delay': [], 'tn': 98, 'fp': 10, 'fn': 5})})

not_feature


defaultdict(collections.Counter,
            {'err': Counter({'delay': [], 'fn': 1}),
             'label': Counter({'delay': [543], 'fn': 4, 'tp': 1}),
             'feature': Counter({'delay': [863],
                      'tn': 102,
                      'fp': 10,
                      'tp': 1})})

swap_classes


defaultdict(collections.Counter,
            {'err': Counter({'delay': [], 'fn': 1}),
             'label': Counter({'delay': [], 'fp': 2, 'tn': 2, 'fn': 1}),
             'feature': Counter({'delay': [703, 639, 639, 543, 895],
                      'fn': 99,
                      'tp': 5,
                      'fp': 9})})

new_concept


defaultdict(collections.Counter,
            {'err': Counter({'delay': [703], 'tp': 1}),
             'label': Counter({'delay': [], 'fn': 5}),
             'feature': Counter({'delay': [], 'tn': 103, 'fp': 10})})

In [None]:
# I am here assuming that there is no 'date' column

def shuffle_features(data, classes, shuffle_n=5):
    # pick shuffle_n features columns and shuffle them
    ret = data.copy()
    features = set(data.columns) - set(classes)
    to_shuffle = random.sample(features, shuffle_n)
    for i in range(shuffle_n):
        ret[to_shuffle[i]] = re[to_shuffle[(i+1)%shuffle_n]]
    return ret, to_shuffle
    
def not_feature(data, classes):
    # randomly choose a feature x and replace values with not x
    ret = data.copy()
    features = set(data.columns) - set(classes)
    invert_col = random.choice(features)
    ret[invert_col] = [ not i for i in ret[invert_col] ]
    return ret, invert_col

def incr_class_rate(data, classes):
    # pick a class and double its rate
    # TODO: currently actually converts half of non-x's to x's
    ret = data.copy()
    incr_class = random.choice(classes)
    replacements = data[ data[incr_class] == True ]
    for i in range(len(data)):
        if not ret.loc[i, incr_class]:
            if random.random() < 0.5:
                # with 50% probability replace this class with incr_class
                ret.loc[i, :] = replacements.sample(1).values
    return ret, incr_class
                
def decr_class_rate(data, classes):
    # pick a class and half its rate
    # TODO
    ret = data.copy()
    decr_class = random.choice(classes)
    replacements = data[ data[decr_class] == False ]
    for i in range(len(data)):
        if ret.loc[i, decr_class]:
            if random.random() < 0.5:
                # with 50% probability replace this class with incr_class
                ret.loc[i, :] = replacements.sample(1).values
    return ret, decr_class

def new_concept(data, classes):
    # drop classes columns
    no_classes = data[ [ col for col in data.columns if col not in classes ] ]
    # train new concept
    dtree = DecisionTreeClassifier()
    n_subsamples = 10
    dtree.fit(no_classes.sample(n_subsamples), random.choices(range(len(classes)), k=n_subsamples))
    # predict labels
    labels = dtree.predict(no_classes)
    # reset classes
    ret = data.copy()
    for class_ in classes:
        ret[class_] = [ False for i in range(len(data[class_])) ]
    # re-assign classes
    for i in range(len(ret)):
        ret.loc[i, classes[labels[i]]] = True
    return ret, dtree

scenarios = 'shuffle_features not_feature incr_class_rate decr_class_rate new_concept'.split()

def run_experiment(train, test, classes, learner, scenario, detector):
    # train and test should initially have a single 'label' column
    model = learner()
    model.fit(train[[ col for col in train.columns if col != 'label' ]], train['label'])
    
    # split 
                
        
                

In [112]:
def print_dtree(dtree):
    dot_data = StringIO()
    export_graphviz(dtree, out_file=dot_data,  
                    filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
    
    display(Markdown(f'### Concept {i}'))
    display((Image(graph.create_png())))

    split_features = [ i for i in dtree.tree_.feature if i >= 0 ]
    print(f'Split features: {list(dataset.columns[split_features])}')