In [2]:
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score

from plot import create_plots
from classify import create_df_features

In [None]:
def _read_meter_csv(csv_path):
    """Read a CSV, parse its 'timestamp' column as dates and use it as the index."""
    frame = pd.read_csv(csv_path, parse_dates = ['timestamp'])
    return frame.set_index('timestamp')


# The real data set is expected in the current working directory.
pathReal = Path.cwd() / 'smart_meters_london_2013.csv'
# Placeholder: replace the Ellipsis with the path to the synthetic CSV before running.
pathSynth = ...

df_real = _read_meter_csv(pathReal)
df_synth = _read_meter_csv(pathSynth)

In [None]:
# Compare the real and synthetic frames with the project helper from `plot`.
# It returns two objects. `rmse_dict` is consumed further down through `.values()`,
# where each value is indexed by 'statistic' and 'value' columns.
# NOTE(review): `fig_dict` presumably maps names to figures — confirm in plot.create_plots.
fig_dict, rmse_dict = create_plots(df_real, df_synth)

In [None]:
# Total of every 'value' entry across all tables in rmse_dict, leaving out
# the rows whose 'statistic' is 'median'.
score = sum(
    table.loc[table['statistic'] != 'median', 'value'].sum()
    for table in rmse_dict.values()
)

print(score)

**Classifier**

In [None]:
# Build the classifier input table from both data sets (project helper from `classify`).
df_features = create_df_features(df_real, df_synth)

# Every column except 'label' is a predictor (cast to float); 'label' is the integer target.
# NOTE(review): 'label' presumably distinguishes real from synthetic rows — confirm in classify.
y = df_features['label'].astype(int)
X = df_features.drop(columns = 'label').astype(float)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, y, test_size = 0.2, random_state = 42)
X_train, X_test = split[0], split[1]
# Labels are converted to plain NumPy arrays before fitting and scoring.
y_train, y_test = (np.array(labels) for labels in split[2:])

# Fit a gradient-boosted classifier evaluated with log loss.
model = xgb.XGBClassifier(eval_metric = 'logloss').fit(X_train, y_train)

# Score the held-out predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test accuracy: {accuracy:.4f}')

In [None]:
# Pair each training column with the importance the fitted model assigned to it,
# ordered from most to least important.
featureImportance = model.feature_importances_
df_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': featureImportance})
    .sort_values(by = 'Importance', ascending = False)
)
df_importance