In [15]:
from river.datasets import Phishing
import pandas as pd
from utils import plot_drift
from novelty import ephemeral, persistant, periodic

from river import datasets
from river import ensemble
from river import evaluate
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing


from drift import detect_drift

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
phishing = Phishing()
list(phishing.take(2))

[({'empty_server_form_handler': 0.0,
   'popup_window': 0.0,
   'https': 0.0,
   'request_from_other_domain': 0.0,
   'anchor_from_other_domain': 0.0,
   'is_popular': 0.5,
   'long_url': 1.0,
   'age_of_domain': 1,
   'ip_in_url': 1},
  True),
 ({'empty_server_form_handler': 1.0,
   'popup_window': 0.0,
   'https': 0.5,
   'request_from_other_domain': 0.5,
   'anchor_from_other_domain': 0.0,
   'is_popular': 0.5,
   'long_url': 0.0,
   'age_of_domain': 1,
   'ip_in_url': 0},
  True)]

In [17]:
def dataset_to_df(dataset, count=None):
    if not count:
        count = len(list(dataset))
    data_list = [{**x, 'value': y} for x, y in dataset.take(count)]
    return pd.DataFrame(data_list)

def df_to_dataset(df):
    df_nonvalue = df.loc[:, df.columns != 'value']
    df_value = df['value']
    return list(zip(df_nonvalue.to_dict(orient='records'), df_value.to_list()))

In [18]:
df = dataset_to_df(phishing)
df

Unnamed: 0,empty_server_form_handler,popup_window,https,request_from_other_domain,anchor_from_other_domain,is_popular,long_url,age_of_domain,ip_in_url,value
0,0.0,0.0,0.0,0.0,0.0,0.5,1.0,1,1,True
1,1.0,0.0,0.5,0.5,0.0,0.5,0.0,1,0,True
2,0.0,0.0,1.0,0.0,0.5,0.5,0.0,1,0,True
3,0.0,0.0,1.0,0.0,0.0,1.0,0.5,0,0,True
4,1.0,0.0,0.5,1.0,0.0,0.5,0.5,1,0,False
...,...,...,...,...,...,...,...,...,...,...
1245,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1,0,True
1246,0.0,0.5,1.0,0.5,0.0,0.5,0.5,1,0,False
1247,0.0,0.5,0.0,0.0,0.0,0.5,0.0,0,0,True
1248,0.5,0.5,1.0,0.5,0.5,0.5,0.0,1,0,True


In [34]:
def eval(dataset):
    metric = metrics.F1()
    
    model_ADWIN = ensemble.ADWINBaggingClassifier(
        model=(
            preprocessing.StandardScaler() |
            linear_model.LogisticRegression()
        ),
        n_models=10,
        seed=42
    )

    model_plain = (
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    )
    
    ADWIN_result = evaluate.progressive_val_score(
        dataset, 
        model_ADWIN, 
        metric,)
    
    print(f"ADWIN: {ADWIN_result}")
    
    plain_result = evaluate.progressive_val_score(
        dataset, 
        model_plain, 
        metric,)
    
    print(f"plain: {plain_result}")

In [46]:
print("Pre-Novelty Insertion")
eval(phishing)

print()
print("Post-Novelty Insertion")
df_ephemeral = ephemeral(df, 'value', 200, 500, lambda x: not x)
eval(df_to_dataset(df_ephemeral))

Pre-Novelty Insertion
ADWIN: F1: 87.97%
plain: F1: 87.97%

Post-Novelty Insertion
ADWIN: F1: 77.58%
plain: F1: 76.72%


In [38]:
def eval_Ada(dataset):
    model_Ada = ensemble.AdaBoostClassifier(
        model=(
            preprocessing.StandardScaler() |
            linear_model.LogisticRegression()
        ),
        n_models=5,

    )

    model_plain = (
        preprocessing.StandardScaler() |
        linear_model.LogisticRegression()
    )
    
    metric = metrics.F1()
    
    Ada_result = evaluate.progressive_val_score(
        dataset, 
        model_Ada, 
        metric,)
    
    print(f"Ada: {Ada_result}")
    
    
    plain_result = evaluate.progressive_val_score(
        dataset, 
        model_plain, 
        metric,)
    
    print(f"plain: {plain_result}")

In [44]:
print("Pre-Novelty")
eval_Ada(phishing)

print()
print("Post-Novelty Insertion")
eval_Ada(df_to_dataset(df_ephemeral))

Pre-Novelty
Ada: F1: 87.75%
plain: F1: 87.86%

Post-Novelty Insertion
Ada: F1: 86.24%
plain: F1: 80.99%
