In [40]:
import pandas as pd
from utils import plot_drift
from novelty import ephemeral, persistant, periodic

from river import datasets
from river import evaluate
from river import ensemble, linear_model, metrics
from river import preprocessing, utils, optim


from models import ADWINBaggingRegressor


from drift import detect_drift

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [137]:
restaurants = datasets.Restaurants()
list(restaurants.take(2))

[({'store_id': 'air_04341b588bde96cd',
   'date': datetime.datetime(2016, 1, 1, 0, 0),
   'is_holiday': True,
   'genre_name': 'Izakaya',
   'area_name': 'Tōkyō-to Nerima-ku Toyotamakita',
   'latitude': 35.7356234,
   'longitude': 139.6516577},
  10),
 ({'store_id': 'air_05c325d315cc17f5',
   'date': datetime.datetime(2016, 1, 1, 0, 0),
   'is_holiday': True,
   'genre_name': 'Izakaya',
   'area_name': 'Fukuoka-ken Fukuoka-shi Daimyō',
   'latitude': 33.589215700000004,
   'longitude': 130.3928134},
  29)]

In [29]:
def dataset_to_df(dataset, count=None):
    if not count:
        count = len(list(dataset))
    data_list = [{**x, 'value': y} for x, y in dataset.take(count)]
    return pd.DataFrame(data_list)

def df_to_dataset(df):
    df_nonvalue = df.loc[:, df.columns != 'value']
    df_value = df['value']
    return list(zip(df_nonvalue.to_dict(orient='records'), df_value.to_list()))

In [30]:
df = dataset_to_df(restaurants)
df

Unnamed: 0,store_id,date,is_holiday,genre_name,area_name,latitude,longitude,value
0,air_04341b588bde96cd,2016-01-01,True,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,10
1,air_05c325d315cc17f5,2016-01-01,True,Izakaya,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,29
2,air_08ba8cd01b3ba010,2016-01-01,True,Izakaya,Miyagi-ken Sendai-shi Kamisugi,38.269076,140.870403,11
3,air_09a845d5b5944b01,2016-01-01,True,Izakaya,Fukuoka-ken Kurume-shi Jōnanmachi,33.319286,130.508374,56
4,air_1f7f8fa557bc0d55,2016-01-01,True,Bar/Cocktail,Ōsaka-fu Neyagawa-shi Honmachi,34.766093,135.628100,6
...,...,...,...,...,...,...,...,...
252103,air_fea5dc9594450608,2017-04-22,False,Other,Shizuoka-ken Hamamatsu-shi Motoshirochō,34.710895,137.725940,14
252104,air_fee8dcf4d619598e,2017-04-22,False,Italian/French,Hyōgo-ken Kōbe-shi Kumoidōri,34.695124,135.197852,53
252105,air_fef9ccb3ba0da2f7,2017-04-22,False,Japanese food,Hyōgo-ken Himeji-shi Yasuda,34.815149,134.685353,5
252106,air_ffcc2d5087e1b476,2017-04-22,False,Izakaya,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599,1


In [185]:
def eval(dataset):
    overall_metric = metrics.MAE() 
    rolling_metric = utils.Rolling(metrics.MAE(), window_size=1000) #metrics.F1()
    
    model_plain = (
        preprocessing.OrdinalEncoder() |
        preprocessing.StandardScaler() |
        linear_model.LinearRegression(
            optimizer=optim.Adam(lr=3e-4)
        )
    )
    
    overall_plain_result = evaluate.progressive_val_score(
        dataset, 
        model_plain.clone(), 
        overall_metric
    )
    
    rolling_overall_result = evaluate.progressive_val_score(
        dataset, 
        model_plain.clone(), 
        rolling_metric
    )
    
    print(f"plain: {overall_plain_result} | {rolling_overall_result}")


    
    model_Bagging = ensemble.BaggingRegressor(
        model=model_plain.clone(),
        n_models=10,
        seed=42
    )
    overall_Bagging_result = evaluate.progressive_val_score(
        dataset, 
        model_Bagging.clone(), 
        overall_metric
    )

    rolling_Bagging_result = evaluate.progressive_val_score(
        dataset, 
        model_Bagging.clone(), 
        rolling_metric
    )

    print(f"Bagging: {overall_Bagging_result} | {rolling_Bagging_result}")
    
    
    
    model_ADWIN = ADWINBaggingRegressor(
        model=model_plain.clone(),
        n_models=10,
        seed=42
    )
    overall_ADWIN_result = evaluate.progressive_val_score(
        dataset, 
        model_ADWIN.clone(), 
        overall_metric
    )

    rolling_ADWIN_result = evaluate.progressive_val_score(
        dataset, 
        model_ADWIN.clone(), 
        rolling_metric
    )
    
    
    print(f"ADWIN: {overall_ADWIN_result} | {rolling_ADWIN_result}")

In [None]:
restaurants_10k = restaurants.take(10000)

df_ephemeral_value = ephemeral(df[:10000], 'value', 200, 500, lambda x: x + 100)
df_ephemeral_noholiday = ephemeral(df[:10000], 'is_holiday', 200, 500, lambda x: False)
df_ephemeral_nolocation = ephemeral(ephemeral(df[:10000], 'latitude', 200, 500, lambda x: 0), 'longitude', 200, 500, lambda x: 0)

df_persistant_value = persistant(df[:10000], 'value', 200, lambda x: x + 100)
df_persistant_noholiday = persistant(df[:10000], 'is_holiday', 200, lambda x: False)
df_persistant_nolocation = persistant(persistant(df[:10000], 'latitude', 200, lambda x: 0), 'longitude', 200, lambda x: 0)

In [None]:
print("# Pre-Novelty (Overall | Rolling)")
eval(restaurants_10k)

print()
print("# Ephemeral Value (Overall | Rolling)")
eval(df_to_dataset(df_ephemeral_value))

print()
print("# Ephemeral No Holiday (Overall | Rolling)")
eval(df_to_dataset(df_ephemeral_noholiday))

print()
print("# Ephemeral No Location (Overall | Rolling)")
eval(df_to_dataset(df_ephemeral_nolocation))

print()
print("# Persistant Value (Overall | Rolling)")
eval(df_to_dataset(df_persistant_value))

print()
print("# Persistant No Holiday (Overall | Rolling)")
eval(df_to_dataset(df_persistant_noholiday))


print()
print("# Persistent No Location (Overall | Rolling)")
eval(df_to_dataset(df_persistant_nolocation))

# Pre-Novelty (Overall | Rolling)
plain: MAE: 11.904039 | MAE: 0.
Bagging: MAE: 11.904039 | MAE: 0.
ADWIN: MAE: 11.904039 | MAE: 0.

# Ephemeral Value (Overall | Rolling)
plain: MAE: 12.50553 | MAE: 11.693116
Bagging: MAE: 12.499296 | MAE: 11.648891
ADWIN: MAE: 12.475599 | MAE: 11.621305

# Ephemeral No Holiday (Overall | Rolling)
plain: MAE: 11.904039 | MAE: 11.694291
Bagging: MAE: 11.890224 | MAE: 11.649859
ADWIN: MAE: 11.854787 | MAE: 11.621305

# Ephemeral No Location (Overall | Rolling)
plain: MAE: 11.904476 | MAE: 11.696499
Bagging: MAE: 11.890607 | MAE: 11.651172
ADWIN: MAE: 11.855091 | MAE: 11.621305

# Persistant Value (Overall | Rolling)
plain: MAE: 12.168505 | MAE: 11.692848
Bagging: MAE: 12.164587 | MAE: 11.648006
ADWIN: MAE: 12.415807 | MAE: 11.711433

# Persistant No Holiday (Overall | Rolling)
plain: MAE: 11.904079 | MAE: 11.6951
Bagging: MAE: 11.890266 | MAE: 11.650647
ADWIN: MAE: 11.854837 | MAE: 11.622209

# Persistent No Location (Overall | Rolling)
plain: MAE: 11.90