### Note:
This notebook uses the `clustream-benchmark` branch to compare two versions of `CluStream`: the original implementation, and a version that uses Welford's algorithm to track the variance of the data points and timestamps.

To install this version locally, switch to this branch and, from the repository root, run:

```
pip install -e ".[dev]"
```

In [1]:
import pandas as pd
import time
from river import cluster
from river import metrics
from river.stream import iter_pandas

In [2]:
# Benchmark of the CluStream variant exposed as `cluster.CluStreamWelford`.
# The dataset is (re)loaded here so that this cell can be run on its own.
data = pd.read_csv("../datasets/agr_a_20k.csv")
# NOTE(review): the last two columns are excluded from the features; the
# 'class' column is used as the label below -- confirm what the other one is.
features = data.columns[:-2]
stream = iter_pandas(X=data[features], y=data['class'])

# Number of leading samples that are only learned from and not scored.
WARMUP_SAMPLES = 200

clustream_welford = cluster.CluStreamWelford(
    time_window=10,
    max_micro_clusters=5,
    n_macro_clusters=2,
    time_gap=2,
    seed=0,
    halflife=0.5,
)

clustream_welford_rand = metrics.Rand()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump when the system clock is adjusted.
# The timed region covers learning, prediction and the metric update.
start_1 = time.perf_counter()
for sample_cnt_clustream_welford, (x, y_true) in enumerate(stream):
    # Every sample is learned from; only post-warm-up samples are scored.
    clustream_welford.learn_one(x)
    if sample_cnt_clustream_welford >= WARMUP_SAMPLES:
        y_pred = clustream_welford.predict_one(x)
        clustream_welford_rand.update(y_true, y_pred)
end_1 = time.perf_counter()
print(end_1 - start_1)  # elapsed seconds
print(clustream_welford._memory_usage)

3.8139219284057617
28.74 KB


In [3]:
# Benchmark of the CluStream variant exposed as `cluster.CluStreamOriginal`.
# The dataset is reloaded so the stream starts from the first row again and
# this cell can be run on its own.
data = pd.read_csv("../datasets/agr_a_20k.csv")
# NOTE(review): the last two columns are excluded from the features; the
# 'class' column is used as the label below -- confirm what the other one is.
features = data.columns[:-2]
stream = iter_pandas(X=data[features], y=data['class'])

# Number of leading samples that are only learned from and not scored.
WARMUP_SAMPLES = 200

clustream_original = cluster.CluStreamOriginal(
    time_window=10,
    max_micro_clusters=5,
    n_macro_clusters=2,
    time_gap=2,
    seed=0,
    halflife=0.5,
)

clustream_original_rand = metrics.Rand()

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() can jump when the system clock is adjusted.
# The timed region covers learning, prediction and the metric update.
start_2 = time.perf_counter()
for sample_cnt_clustream_original, (x, y_true) in enumerate(stream):
    # Every sample is learned from; only post-warm-up samples are scored.
    clustream_original.learn_one(x)
    if sample_cnt_clustream_original >= WARMUP_SAMPLES:
        y_pred = clustream_original.predict_one(x)
        clustream_original_rand.update(y_true, y_pred)
end_2 = time.perf_counter()
print(end_2 - start_2)  # elapsed seconds
print(clustream_original._memory_usage)

2.3557560443878174
17.38 KB


In [4]:
# Display the `cm` attribute of the Rand metric updated in the CluStreamWelford run
# (presumably the confusion matrix of true vs. predicted labels -- confirm).
clustream_welford_rand.cm

    0       1      
0   4,250   6,136  
1   4,002   5,412  

In [5]:
# Display the `cm` attribute of the Rand metric updated in the CluStreamOriginal run
# (presumably the confusion matrix of true vs. predicted labels -- confirm).
clustream_original_rand.cm

    0       1      
0   4,250   6,136  
1   4,002   5,412  

In [6]:
# Print the Rand metric accumulated for the CluStreamWelford run.
print(clustream_welford_rand)

Rand: 0.5002637313077234


In [7]:
# Print the Rand metric accumulated for the CluStreamOriginal run.
print(clustream_original_rand)

Rand: 0.5002637313077234
