In [1]:
import time

import pandas as pd
from river import (
    ensemble,
    evaluate,
    linear_model,
    metrics,
    multiclass,
    neighbors,
    preprocessing,
    stream,
    tree,
)
from tqdm.auto import tqdm

Column headers to add at the top of the data file:

In [2]:
# Header row for the covertype CSV: ten named measurement columns, the numbered
# Wilderness_Area_1..4 and Soil_Type_1..40 columns, then the Cover_Type column.
columns_name = [
    "Elevation",
    "Aspect",
    "Slope",
    "Horizontal_Distance_To_Hydrology",
    "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways",
    "Hillshade_9am",
    "Hillshade_Noon",
    "Hillshade_3pm",
    "Horizontal_Distance_To_Fire_Points",
    *(f"Wilderness_Area_{k}" for k in range(1, 5)),
    *(f"Soil_Type_{k}" for k in range(1, 41)),
    "Cover_Type",
]

# Emit the header as one comma-separated line, ready to paste into the data file.
print(",".join(columns_name))

Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area_1,Wilderness_Area_2,Wilderness_Area_3,Wilderness_Area_4,Soil_Type_1,Soil_Type_2,Soil_Type_3,Soil_Type_4,Soil_Type_5,Soil_Type_6,Soil_Type_7,Soil_Type_8,Soil_Type_9,Soil_Type_10,Soil_Type_11,Soil_Type_12,Soil_Type_13,Soil_Type_14,Soil_Type_15,Soil_Type_16,Soil_Type_17,Soil_Type_18,Soil_Type_19,Soil_Type_20,Soil_Type_21,Soil_Type_22,Soil_Type_23,Soil_Type_24,Soil_Type_25,Soil_Type_26,Soil_Type_27,Soil_Type_28,Soil_Type_29,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39,Soil_Type_40,Cover_Type


In [3]:
# Write a truncated copy of the dataset: the first N + 1 lines of the full file
# (N + 1 rather than N, presumably so the header line is kept on top of N data
# rows — confirm against the data file).
N = 50_000
with open("data/covtype.data", "r") as f1, open("data/covtype_small.data", "w") as f2:
    # zip() stops at the shorter input, so a source file with fewer than N + 1
    # lines is copied whole instead of raising StopIteration from next().
    f2.writelines(line for _, line in zip(range(N + 1), f1))

In [4]:
# Dataset selection. DATAPATH is the CSV that gets streamed; DATAKEY is the key
# passed to the stream cache for that file.
# Full dataset (currently disabled):
# DATAPATH = "data/covtype.data"
# DATAKEY = "covtype"

# Truncated copy written to data/covtype_small.data earlier in this notebook.
DATAPATH = "data/covtype_small.data"
DATAKEY = "covtype_small"

In [5]:
# Count the rows in the selected file: total number of lines minus one
# (the first line is taken to be the column header).
with open(DATAPATH) as f:
    N_observations = sum(1 for _ in f) - 1
print(f"Number of observations : {N_observations}")

Number of observations : 50000


In [6]:
# Parse every column, the target included, with int().
converters = dict.fromkeys(columns_name, int)

dataset = stream.iter_csv(DATAPATH, target="Cover_Type", converters=converters)
# Peek at the first (features, target) pair to sanity-check the parsing.
x, y = next(iter(dataset))
x, y

({'Elevation': 2596,
  'Aspect': 51,
  'Slope': 3,
  'Horizontal_Distance_To_Hydrology': 258,
  'Vertical_Distance_To_Hydrology': 0,
  'Horizontal_Distance_To_Roadways': 510,
  'Hillshade_9am': 221,
  'Hillshade_Noon': 232,
  'Hillshade_3pm': 148,
  'Horizontal_Distance_To_Fire_Points': 6279,
  'Wilderness_Area_1': 1,
  'Wilderness_Area_2': 0,
  'Wilderness_Area_3': 0,
  'Wilderness_Area_4': 0,
  'Soil_Type_1': 0,
  'Soil_Type_2': 0,
  'Soil_Type_3': 0,
  'Soil_Type_4': 0,
  'Soil_Type_5': 0,
  'Soil_Type_6': 0,
  'Soil_Type_7': 0,
  'Soil_Type_8': 0,
  'Soil_Type_9': 0,
  'Soil_Type_10': 0,
  'Soil_Type_11': 0,
  'Soil_Type_12': 0,
  'Soil_Type_13': 0,
  'Soil_Type_14': 0,
  'Soil_Type_15': 0,
  'Soil_Type_16': 0,
  'Soil_Type_17': 0,
  'Soil_Type_18': 0,
  'Soil_Type_19': 0,
  'Soil_Type_20': 0,
  'Soil_Type_21': 0,
  'Soil_Type_22': 0,
  'Soil_Type_23': 0,
  'Soil_Type_24': 0,
  'Soil_Type_25': 0,
  'Soil_Type_26': 0,
  'Soil_Type_27': 0,
  'Soil_Type_28': 0,
  'Soil_Type_29': 1,
  

In [7]:
# Build a fresh stream for the experiments; the earlier peek already called
# next() on the previous iterator.
dataset = stream.iter_csv(DATAPATH, target="Cover_Type", converters=converters)
cache = stream.Cache()
# NOTE(review): presumably discards streams cached by earlier runs so the first
# pass below re-reads the CSV — confirm against river's stream.Cache docs.
cache.clear_all()

In [8]:
# Hand-written test-then-train pass: each sample is predicted first, then used
# for learning, and the prediction is scored in a classification report.
model = tree.HoeffdingTreeClassifier()
report = metrics.ClassificationReport()

for x, y in tqdm(cache(dataset, key=DATAKEY), total=N_observations):
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    if y_pred is None:
        # Nothing to score when the model returned no prediction.
        continue
    report.update(y, y_pred)

display(report)

  0%|          | 0/50000 [00:00<?, ?it/s]

           Precision   Recall   F1       Support  
                                                  
       1      70.14%   43.78%   53.91%     10151  
       2      81.76%   90.53%   85.92%     28793  
       3      49.35%   38.89%   43.50%      2160  
       4      59.52%   44.72%   51.07%      2160  
       5      44.75%   38.10%   41.15%      2415  
       6      39.55%   42.59%   41.02%      2160  
       7      40.82%   76.99%   53.35%      2160  
                                                  
   Macro      55.13%   53.66%   52.85%            
   Micro      71.64%   71.64%   71.64%            
Weighted      71.66%   71.64%   70.57%            

                 71.64% accuracy                  

In [9]:
# Same experiment with a fresh model and report, delegated to river's
# evaluate.progressive_val_score helper; the report printed below matches the
# one produced by the hand-written loop.
model = tree.HoeffdingTreeClassifier()
report = metrics.ClassificationReport()

evaluate.progressive_val_score(cache(dataset, key=DATAKEY), model, report)

           Precision   Recall   F1       Support  
                                                  
       1      70.14%   43.78%   53.91%     10151  
       2      81.76%   90.53%   85.92%     28793  
       3      49.35%   38.89%   43.50%      2160  
       4      59.52%   44.72%   51.07%      2160  
       5      44.75%   38.10%   41.15%      2415  
       6      39.55%   42.59%   41.02%      2160  
       7      40.82%   76.99%   53.35%      2160  
                                                  
   Macro      55.13%   53.66%   52.85%            
   Micro      71.64%   71.64%   71.64%            
Weighted      71.66%   71.64%   70.57%            

                 71.64% accuracy                  

In [10]:
def test_model(model, model_name, df_merge=None):
    """Run a test-then-train pass of ``model`` over the cached stream and tabulate its scores.

    Each sample is predicted first and then learned from; predictions are
    accumulated in a ``metrics.ClassificationReport``. The stream, cache key and
    progress-bar total come from the notebook globals ``dataset``, ``cache``,
    ``DATAKEY`` and ``N_observations``.

    Parameters
    ----------
    model
        Classifier exposing ``predict_one(x)`` and ``learn_one(x, y)``.
    model_name : str
        Label used as the row index ("classifier") in the result table.
    df_merge : pandas.DataFrame, optional
        Existing results table indexed by classifier name. When given, the new
        row is appended, or overwrites the row already stored under
        ``model_name``. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Table indexed by "classifier" with columns "accuracy", "precision",
        "recall", "f1" (the weighted variants from the report) and "time"
        (elapsed seconds for the full pass).
    """
    report = metrics.ClassificationReport()

    # perf_counter() is a monotonic, high-resolution clock; unlike time.time()
    # it is not affected by system clock adjustments during the run.
    start = time.perf_counter()
    for x, y in tqdm(cache(dataset, key=DATAKEY), total=N_observations):
        # Predict before learning so the sample is scored before the model
        # has been trained on it.
        y_pred = model.predict_one(x)
        model.learn_one(x, y)
        if y_pred is not None:
            report.update(y, y_pred)
    elapsed = time.perf_counter() - start

    # NOTE(review): these read private attributes of ClassificationReport and
    # may break across river versions.
    df_new = pd.DataFrame(
        [
            {
                "classifier": model_name,
                "accuracy": report._accuracy.get(),
                "precision": report._weighted_precision.get(),
                "recall": report._weighted_recall.get(),
                "f1": report._weighted_f1.get(),
                "time": elapsed,
            }
        ]
    ).set_index("classifier")

    if df_merge is None:
        return df_new

    # Append rows for classifiers not yet in the table, then overwrite the
    # values of any classifier that was already present with the fresh run.
    merged = pd.concat([df_merge, df_new[~df_new.index.isin(df_merge.index)]])
    merged.update(df_new)
    return merged


def display_metrics(df_metrics):
    return df_metrics.style.background_gradient(cmap="RdYlGn").background_gradient(
        cmap="RdYlGn_r", subset=["time"]
    ).format(precision=3)

In [11]:
# First benchmark entry: a Hoeffding tree with default arguments. No df_merge is
# passed, so this call starts a new results table.
model = tree.HoeffdingTreeClassifier()

df_metrics = test_model(model, "HoeffdingTreeClassifier")
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815


In [12]:
# Adaptive random forest with a fixed seed (0); its row is merged into the
# running results table.
model = ensemble.AdaptiveRandomForestClassifier(seed=0)

df_metrics = test_model(model, "AdaptiveRandomForestClassifier", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609


In [13]:
# AdaBoost ensemble of 5 Hoeffding trees (gini split criterion, delta=1e-5,
# grace_period=2000) with a fixed seed (0); its row is merged into the running
# results table.
model = ensemble.AdaBoostClassifier(
    model=(
        tree.HoeffdingTreeClassifier(
            split_criterion="gini", delta=1e-5, grace_period=2000
        )
    ),
    n_models=5,
    seed=0,
)

df_metrics = test_model(model, "AdaBoostClassifier", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609
AdaBoostClassifier,0.763,0.761,0.763,0.757,68.802


In [14]:
# k-nearest-neighbours classifier with window_size=50; its row is merged into
# the running results table under "KNNClassifier_50".
model = neighbors.KNNClassifier(window_size=50)

df_metrics = test_model(model, "KNNClassifier_50", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609
AdaBoostClassifier,0.763,0.761,0.763,0.757,68.802
KNNClassifier_50,0.917,0.918,0.917,0.917,34.49


In [15]:
# Same KNN classifier with window_size doubled to 100, recorded as a separate
# row ("KNNClassifier_100") for comparison with the 50-sample window.
model = neighbors.KNNClassifier(window_size=100)

df_metrics = test_model(model, "KNNClassifier_100", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609
AdaBoostClassifier,0.763,0.761,0.763,0.757,68.802
KNNClassifier_50,0.917,0.918,0.917,0.917,34.49
KNNClassifier_100,0.923,0.924,0.923,0.923,68.696


In [16]:
# Standard scaler composed (via |) with a one-vs-one wrapper around logistic
# regression; its row is merged into the running results table.
model = preprocessing.StandardScaler() | multiclass.OneVsOneClassifier(
    linear_model.LogisticRegression()
)

df_metrics = test_model(model, "LogisticRegression_OVO", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609
AdaBoostClassifier,0.763,0.761,0.763,0.757,68.802
KNNClassifier_50,0.917,0.918,0.917,0.917,34.49
KNNClassifier_100,0.923,0.924,0.923,0.923,68.696
LogisticRegression_OVO,0.852,0.85,0.852,0.849,9.902


In [17]:
# Standard scaler composed (via |) with a one-vs-rest wrapper around logistic
# regression; its row is merged into the running results table.
model = preprocessing.StandardScaler() | multiclass.OneVsRestClassifier(
    linear_model.LogisticRegression()
)

df_metrics = test_model(model, "LogisticRegression_OVR", df_merge=df_metrics)
display_metrics(df_metrics)

  0%|          | 0/50000 [00:00<?, ?it/s]

Unnamed: 0_level_0,accuracy,precision,recall,f1,time
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
HoeffdingTreeClassifier,0.716,0.717,0.716,0.706,14.815
AdaptiveRandomForestClassifier,0.862,0.861,0.862,0.859,59.609
AdaBoostClassifier,0.763,0.761,0.763,0.757,68.802
KNNClassifier_50,0.917,0.918,0.917,0.917,34.49
KNNClassifier_100,0.923,0.924,0.923,0.923,68.696
LogisticRegression_OVO,0.852,0.85,0.852,0.849,9.902
LogisticRegression_OVR,0.851,0.849,0.851,0.849,8.914


In [18]:
# Final cleanup of the stream cache.
# NOTE(review): presumably removes the cached copies created during the
# benchmark — confirm against river's stream.Cache docs.
cache.clear_all()