In [None]:
# Required to correctly display interactive (dynamic) plots in Jupyter notebooks.
# This code cell must be run before any other code cell.
%matplotlib notebook

# Generic imports
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

# `scikit-multiflow` tutorial

This tutorial is divided into the following parts:

**PART I. Design and run experiments**
1. Classification task examples
  1. Prequential evaluation implementation
  2. The `EvaluatePrequential` class

2. Concept drift detection
  1. Drift detection test
  2. Impact on predictive performance

3. Regression task examples

**PART II. Extend functionality**
4. Implement a simple estimator
  
---

# PART I. Design and run experiments

## 1. Classification task examples

In this example we will use the `SEA` stream generator. A data generator does not store any data, but generates it on demand.

Next, we will set up a learning method (model, estimator, algorithm), in this case the Naive Bayes classifier:

In [None]:
from skmultiflow.data import SEAGenerator
from skmultiflow.bayes import NaiveBayes

# SEA synthetic stream generator, created with a fixed seed (random_state=1)
stream = SEAGenerator(random_state=1)
# Naive Bayes classifier with default settings; evaluated in the next code cell
classifier = NaiveBayes()

### Prequential evaluation

The prequential evaluation is easily implemented as a loop:

In [None]:
# Variables to control evaluation loop and track performance
n_samples = 0
correct_cnt = 0
max_samples = 2000

# Prequential (test-then-train) evaluation loop: every sample is first used to
# test the model and only afterwards to train it, so accuracy is always
# measured on data the model has not seen yet.
while n_samples < max_samples and stream.has_more_samples():
    X, y = stream.next_sample()
    y_pred = classifier.predict(X)    # test ...
    if y[0] == y_pred[0]:
        correct_cnt += 1
    classifier.partial_fit(X, y)      # ... then train
    n_samples += 1

print('{} samples analyzed.'.format(n_samples))
# Report NaN instead of raising ZeroDivisionError if no sample was processed
accuracy = correct_cnt / n_samples if n_samples else float('nan')
print('NaiveBayes classifier accuracy: {}'.format(accuracy))

### The `EvaluatePrequential` class

Implements the prequential evaluation method and provides extra functionalities.

Let's run the same experiment on the SEA data but this time we will compare two classifiers:

1. `NaiveBayes`
2. `SGDClassifier`: Linear SVM with SGD training.

We choose the `SGDClassifier` in order to demonstrate the compatibility with incremental methods from `scikit-learn`.

**Note:** `scikit-learn` focuses on the batch learning setting and only a **limited** number of its methods are capable of learning incrementally.

In [None]:
from skmultiflow.evaluation import EvaluatePrequential
from sklearn.linear_model import SGDClassifier

# Setup stream and estimators
stream = SEAGenerator(random_state=1)
nb = NaiveBayes()
svm = SGDClassifier()

# Setup evaluator
# TODO

In [None]:
# Run the evaluation
# TODO

---
## 2. Concept Drift

#### Simulate a data stream with concept drift

For this example, we will generate a synthetic data stream by concatenating 3 distributions of 1000 samples each:
- $dist_a$: $\mu=0.8$, $\sigma=0.05$
- $dist_b$: $\mu=0.4$, $\sigma=0.02$
- $dist_c$: $\mu=0.6$, $\sigma=0.1$.

In [None]:
# Seeded generator so the synthetic stream is identical on every run
random_state = np.random.RandomState(12345)

# One (mean, std) pair per concept. The three draws happen in this order, which
# keeps the random sequence consumed by each distribution unchanged.
dist_a, dist_b, dist_c = (
    random_state.normal(loc=mu, scale=sigma, size=1000)
    for mu, sigma in ((0.8, 0.05), (0.4, 0.02), (0.6, 0.1))
)

# The stream is the three concepts back to back (3000 values in total)
stream = np.concatenate([dist_a, dist_b, dist_c])

# Plot the data: the stream over time (wide left panel) and the histogram of
# each distribution (narrow right panel)
fig = plt.figure(figsize=(7, 3), tight_layout=True)
gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[3, 1])
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

ax1.plot(stream, label='Stream')
ax1.grid()

for samples, name in ((dist_a, r'$dist_a$'), (dist_b, r'$dist_b$'), (dist_c, r'$dist_c$')):
    ax2.hist(samples, label=name)
ax2.grid(axis='y')
# The legend belongs to the histogram panel only
ax2.legend()
plt.show()

### Drift detection test

In this example we will use the ADaptive WINdowing (`ADWIN`) drift detection method.

The goal is to detect that drift has occurred, after samples **1000** and **2000** in the synthetic data stream.

In [None]:
from skmultiflow.drift_detection import ADWIN

# Instantiate the ADWIN drift detector
drift_detector = ADWIN()
drifts = []

# Drift detection test: feed the stream to the detector one value at a time and
# record the index at which a change is flagged. Detections are expected
# shortly after the concept changes at samples 1000 and 2000.
for i, value in enumerate(stream):
    drift_detector.add_element(value)
    if drift_detector.detected_change():
        drifts.append(i)

print('Drifts detected at indices: {}'.format(drifts))

In [None]:
# Plot the data and the detected drifts: same two-panel layout as the earlier
# stream plot, plus one red vertical line per entry in `drifts`
fig = plt.figure(figsize=(7, 3), tight_layout=True)
gs = gridspec.GridSpec(nrows=1, ncols=2, width_ratios=[3, 1])
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])

ax1.grid()
ax1.plot(stream, label='Stream')
for drift_index in drifts:
    ax1.axvline(drift_index, color='red')

for samples, name in ((dist_a, r'$dist_a$'), (dist_b, r'$dist_b$'), (dist_c, r'$dist_c$')):
    ax2.hist(samples, label=name)
ax2.grid(axis='y')
# The legend belongs to the histogram panel only
ax2.legend()
plt.show()

### Impact on predictive performance

In this example we will use two popular stream models:

1. The `Hoeffding Tree` is a type of decision tree designed for data streams.
2. The `Hoeffding Adaptive Tree` is an improvement over the original `Hoeffding Tree`.

The `Hoeffding Adaptive Tree` uses `ADWIN` to detect changes. If a change is detected at a given branch, an alternate branch is created; it eventually replaces the original branch if it shows better performance on new data.

For this example we will load the data from a csv file using the `FileStream` class.

The data corresponds to the output of the `AGRAWALGenerator` with 3 **gradual drifts** at the 5k, 10k and 15k marks.

#### AGRAWAL data generator

The generator produces a stream containing 9 features, 6 numeric and 3 categorical.
 
There are 10 functions for generating binary class labels from the features. These functions determine whether a loan should be approved.

| Feature    | Description            | Values                                                                |
|------------|------------------------|-----------------------------------------------------------------------|
| salary     | the salary             | uniformly   distributed from 20k to 150k                              |
| commission | the commission         | if (salary <   75k) then 0 else uniformly distributed from 10k to 75k |
| age        | the age                | uniformly   distributed from 20 to 80                                 |
| elevel     | the education   level  | uniformly   chosen from 0 to 4                                        |
| car        | car maker              | uniformly   chosen from 1 to 20                                       |
| zipcode    | zip code of the   town | uniformly   chosen from 0 to 8                                        |
| hvalue     | value of the   house   | uniformly   distributed from 50k x zipcode to 100k x zipcode          |
| hyears     | years house   owned    | uniformly   distributed from 1 to 30                                  |
| loan       | total loan   amount    | uniformly   distributed from 0 to 500k                                |

In [None]:
from skmultiflow.data import FileStream
from skmultiflow.trees import HoeffdingTreeClassifier
from skmultiflow.trees import HoeffdingAdaptiveTreeClassifier

# Load data
# TODO: bind `stream` to a FileStream over the AGRAWAL csv file described above.
# NOTE(review): the file path is not given anywhere in this notebook, so it
# cannot be filled in here. Until it is set, `stream` still refers to the
# object assigned in an earlier cell and the evaluation cell below will fail.
# stream = FileStream('data/<agrawal file>.csv')

# Setup estimators
# Order matters: it must match model_names=['HT', 'HAT'] in the evaluation cell.
cfiers = [HoeffdingTreeClassifier(), HoeffdingAdaptiveTreeClassifier()]

In [None]:
# Setup and run the evaluation
# Named `drift_evaluator` rather than `eval` so the built-in eval() is not
# shadowed for the rest of the kernel session.
drift_evaluator = EvaluatePrequential(show_plot=True,
                                      metrics=['accuracy', 'kappa', 'model_size'],
                                      n_wait=100)
# Both trees are evaluated on the same stream; `;` suppresses the output repr
drift_evaluator.evaluate(stream=stream, model=cfiers, model_names=['HT', 'HAT']);

---
## 3. Regression task examples

### Data sets

|          Dataset         | Samples | Attributes |
|:------------------------:|:-------:|:----------:|
|           bike           |  17389  |     16     |
| metro_interstate_traffic |  48204  |      9     |

#### Bike Sharing Dataset Data Set

Contains the **hourly and daily count of rental bikes** between years 2011 and 2012 in Capital bikeshare system with the corresponding weather and seasonal information. Attributes include weather, temperature, date, time, etc.

|      Attribute      | Description |
|:-------------------:|:-----------------------------------------------------------------|
| instant             | record index
| dteday              |  date
| season              |  season (1:winter, 2:spring, 3:summer, 4:fall)
| yr                  |  year (0: 2011, 1:2012)
| mnth                |  month ( 1 to 12)
| hr                  |  hour (0 to 23)
| holiday             |  whether day is holiday or not (extracted from [Web Link])
| weekday             |  day of the week
| workingday          |  if day is neither weekend nor holiday is 1, otherwise is 0.
| weathersit          | 1: Clear, Few clouds, Partly cloudy<br>2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist<br>3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds<br>4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog
| temp                | Normalized temperature in Celsius.
| atemp               | Normalized feeling temperature in Celsius.
| hum                 | Normalized humidity.
| windspeed           | Normalized wind speed.
| casual              | count of casual users
| registered          | count of registered users
| **cnt**             | count of total rental bikes including both casual and registered

In [None]:
# Stream the bike sharing data from a local csv file (path is relative to the
# notebook's working directory).
# NOTE(review): no target column is specified — presumably FileStream's default
# selects `cnt`, the last attribute in the table above; confirm.
# This stream is not referenced again in the notebook; assign it to `stream` in
# the regression evaluation cell to run the experiment on this dataset.
bike_stream = FileStream('data/bike.csv')

#### Metro Interstate Traffic Volume Data Set

Hourly Interstate 94 Westbound traffic volume for MN DoT ATR station 301, roughly midway between Minneapolis and St Paul, MN. **Hourly weather features and holidays included for impacts on traffic volume.**

|      Attribute      | Description |
|:-------------------:|:-----------------------------------------------------------------|
| holiday             | US National holidays plus regional holiday, Minnesota State Fair |
| temp                | temp in kelvin                                                   |
| rain_1h             | Amount in mm of rain that occurred in the hour                   |
| snow_1h             | Amount in mm of snow that occurred in the hour                   |
| clouds_all          | Percentage of cloud cover                                        |
| weather_main        | Short textual description of the current weather                 |
| weather_description | Longer textual description of the current weather                |
| date_time           | Hour of the data collected in local CST time                     |
| **traffic_volume**  | Hourly I-94 ATR 301 reported westbound traffic volume            |

In [None]:
# Stream the metro interstate traffic data from a local csv file (path is
# relative to the notebook's working directory).
# NOTE(review): no target column is specified — presumably FileStream's default
# selects `traffic_volume`, the last attribute in the table above; confirm.
traffic_stream = FileStream('data/metro_interstate_traffic.csv')

### Regression models

In this example we will use the regression variants of two popular algorithms:
* kNN Regressor
* Hoeffding Tree Regressor

In [None]:
from skmultiflow.lazy import KNNRegressor
from skmultiflow.trees import HoeffdingTreeRegressor

# Setup estimators: a kNN regressor and a Hoeffding Tree regressor whose leaves
# are configured with leaf_prediction='mean'
knn_r = KNNRegressor()
ht_r = HoeffdingTreeRegressor(leaf_prediction='mean')

# Setup evaluation: a single evaluator, tracking error, time and model size
regression_metrics = ['mean_absolute_error', 'mean_square_error', 'running_time', 'model_size']
evaluator = EvaluatePrequential(show_plot=True, metrics=regression_metrics)

In [None]:
# Dataset used for this run
stream = traffic_stream

# Evaluate both regressors on the selected stream
regressors = [knn_r, ht_r]
evaluator.evaluate(stream=stream, model=regressors, model_names=['kNN-R', 'HT-R'])

# Restart stream and reset models for next experiment
stream.restart()
for regressor in regressors:
    regressor.reset()

---
# PART II. Extend functionality

## 4. Implement a simple estimator


A core design element in `scikit-multiflow` is that it should be easy to create new methods or extend existing ones.

In the following example we show how to implement the `MajorityClassClassifier`.

The Majority Class is one of the simplest classifiers: it predicts the class of a new sample to be the most frequent at that point in the stream. It is used mostly as a baseline, but also as a default classifier at the leaves of decision trees.

In [None]:
from skmultiflow.core import BaseSKMObject, ClassifierMixin
from collections import defaultdict

class MajorityClassClassifier(BaseSKMObject, ClassifierMixin):
    """Baseline classifier that predicts the most frequent class seen so far.

    The model is a table of (weighted) class counts that is updated
    incrementally, so it can be trained one sample or one batch at a time.
    """

    def __init__(self):
        super().__init__()
        # class label -> accumulated (weighted) number of observations
        self.observed_classes = defaultdict(float)

    def partial_fit(self, X, y, classes=None, sample_weight=None):
        """Update the class counts with a new batch of labels.

        Parameters
        ----------
        X : array-like
            Feature values. Not used by this model.
        y : array-like of shape (n_samples,)
            Class labels of the samples in the batch.
        classes : array-like, optional
            Accepted for API compatibility. Not used by this model.
        sample_weight : array-like of shape (n_samples,), optional
            Weight of each sample. Every sample counts as 1 when omitted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If `sample_weight` and `y` have different lengths.
        """
        labels = np.asarray(y).ravel()
        if sample_weight is None:
            weights = np.ones(labels.shape[0])
        else:
            weights = np.asarray(sample_weight, dtype=float).ravel()
            if weights.shape[0] != labels.shape[0]:
                raise ValueError('sample_weight and y must have the same length.')
        for label, weight in zip(labels, weights):
            self.observed_classes[label] += weight
        return self

    def predict(self, X):
        """Predict the current majority class for every sample in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict. Only the number of rows is used.

        Returns
        -------
        numpy.ndarray of shape (n_samples,)
            The majority class repeated once per sample, or all zeros when no
            sample has been observed yet.
        """
        n_rows = np.asarray(X).shape[0]
        if not self.observed_classes:
            # In a test-then-train setting predict() can be called before any
            # training has happened; fall back to class 0.
            return np.zeros(n_rows, dtype=int)
        majority_class = max(self.observed_classes, key=self.observed_classes.get)
        return np.full(n_rows, majority_class)

    def predict_proba(self, X):
        """Return the observed relative class frequencies for every sample in X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict. Only the number of rows is used.

        Returns
        -------
        numpy.ndarray of shape (n_samples, n_classes)
            One identical row per sample. Columns follow the sorted order of
            the class labels observed so far. A single all-zero column is
            returned when nothing has been observed (or the total weight is 0).
        """
        n_rows = np.asarray(X).shape[0]
        labels = sorted(self.observed_classes)
        counts = np.array([self.observed_classes[label] for label in labels], dtype=float)
        total = counts.sum()
        if total <= 0:
            return np.zeros((n_rows, 1))
        return np.tile(counts / total, (n_rows, 1))

In [None]:
# Setup stream and estimators
stream = SEAGenerator(random_state=1)
nb = NaiveBayes()
mc = MajorityClassClassifier()

# Setup evaluator
eval = EvaluatePrequential(show_plot=True,
                           max_samples=20000,
                           metrics=['accuracy', 'kappa', 'running_time', 'model_size'])

In [None]:
# Run the evaluation
eval.evaluate(stream=stream, model=[nb, mc], model_names=['NB', 'MC']);