# Библиотеки

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
from statsmodels.tsa.seasonal import seasonal_decompose
from prophet import Prophet

from src.metrics import evaluate
from src.models import aggregated_daily_predictions, BaselineYearAgo
from src.process_data import read_datasets, prepare_dataset
from src.plots import plot_prediction

# Загрузка данных

In [None]:
# Load the project datasets via src.process_data.read_datasets.
# The result is indexed by name further down ('source_train', 'source_test').
datasets = read_datasets()

In [None]:
class ExperimentTracker:
    """In-memory registry of model runs and their train/test metrics.

    Every stored experiment is a dict with the experiment name, a sequential
    ``run_id``, the model object, copies of the train/test frames extended
    with a ``predict`` column, and the metrics returned by ``evaluate``
    flattened under ``train_`` / ``test_`` prefixes.
    """

    def __init__(self) -> None:
        self.experiments = []
        self._current_run_id = 0

    def _resolve_name(self, name, model):
        """Return the final experiment name and make sure it is not taken yet.

        ``None`` becomes ``"experiment<run_id>"``, an empty string falls back
        to ``str(model)``, anything else is used as is. The uniqueness check
        runs on the resolved name, i.e. on what is actually stored.

        Raises:
            AssertionError: if the resolved name is already registered.
        """
        if name is None:
            name = f"experiment{self._current_run_id}"
        elif name == "":
            name = str(model)

        if any(item['name'] == name for item in self.experiments):
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type stays the same.
            raise AssertionError("Name must be unique")
        return name

    def add_experiment(self, model, train, test, name=None, predict_function=None):
        """Score a fitted model on both splits and store the run.

        Args:
            model: fitted model object; stored as is in the experiment.
            train: frame with a ``target`` column; it is not modified.
            test: frame with a ``target`` column; it is not modified.
            name: experiment name, see ``_resolve_name`` for the defaults.
            predict_function: callable taking the features frame (the split
                without ``target``) and returning predictions. When omitted,
                ``model.predict`` is used, or the ``yhat`` column of the
                forecast for ``Prophet`` models.

        Raises:
            AssertionError: if the experiment name is already registered.
        """
        name = self._resolve_name(name, model)

        # An explicitly passed predict_function always wins over the defaults.
        if predict_function is None:
            if isinstance(model, Prophet):
                def prophet_yhat(df):
                    # Assign positionally: the forecast frame carries its own
                    # index, so label alignment could scatter values or
                    # produce NaN for frames not indexed 0..n-1.
                    # NOTE(review): assumes df is ordered by 'ds' — confirm.
                    return model.predict(df)['yhat'].to_numpy()

                predict_function = prophet_yhat
            else:
                predict_function = model.predict

        # Local copies keep the caller's frames free of the 'predict' column.
        train = train.copy()
        test = test.copy()

        train['predict'] = predict_function(train.drop('target', axis=1))
        test['predict'] = predict_function(test.drop('target', axis=1))

        experiment = {
            'name': name,
            'run_id': self._current_run_id,
            'model': model,
            'train': train,
            'test': test,
        }

        # Flatten the metrics of each split as "<split>_<metric>" keys.
        for split, frame in (('train', train), ('test', test)):
            scores = evaluate(frame['target'], frame['predict'])
            experiment.update({f"{split}_{k}": v for k, v in scores.items()})

        self.experiments.append(experiment)
        self._current_run_id += 1

    def get_experiment(self, name):
        """Return the stored experiment dict called ``name``, or ``None`` if absent."""
        return next((item for item in self.experiments if item['name'] == name), None)

    def metrics_df(self):
        """Return one row per experiment without the bulky train/test frames."""
        return pd.json_normalize(self.experiments).drop(['train', 'test'], axis=1)

In [None]:
# Single tracker instance that collects the experiment runs of this notebook.
tracker = ExperimentTracker()

In [None]:
# Work on copies so the frames held in `datasets` stay untouched.
train = datasets['source_train'].copy()
test = datasets['source_test'].copy()

In [None]:
# Display the raw train frame.
train

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
train

# EDA

In [None]:
# Order both splits by 'date', then by 'time'.
train = train.sort_values(['date', 'time'])
test = test.sort_values(['date', 'time'])

In [None]:
# Turn the previous index into an explicit 'id' column (train only).
# NOTE(review): test keeps its old index and gets no 'id' column — confirm the asymmetry is intended.
train = train.reset_index().rename(columns={'index': 'id'})
train

In [None]:
# Summary statistics of the target.
train['target'].describe()

In [None]:
# Project preprocessing from src.process_data, applied to both splits.
train = prepare_dataset(train)
test = prepare_dataset(test)

In [None]:
train

In [None]:
# Target values for one specific day.
# NOTE(review): the 'datetime' column is presumably added by prepare_dataset — confirm.
train[train.date == '2020-09-04'][['datetime', 'target']]

In [None]:
# First 20 rows of that day as a line plot.
train[train.date == '2020-09-04'][['datetime', 'target']].head(20).plot(x = "datetime", y = "target")

In [None]:
# Same plot for the following day, for comparison.
train[train.date == '2020-09-05'][['datetime', 'target']].head(20).plot(x = "datetime", y = "target")

In [None]:
# Interactive plot of the whole train target over time.
fig = px.line(train.sort_values('datetime'), x='datetime', y="target", hover_data=["date", "time"])
fig.show()

In [None]:
# Date range covered by the train split.
train['date'].min(), train['date'].max()

In [None]:
# Date range covered by the test split.
test['date'].min(), test['date'].max()

In [None]:
# Distinct values of the 'time' column.
train['time'].unique()

In [None]:
train['weather_pred'].shape

In [None]:
# Frequency of each forecast weather description.
train['weather_pred'].value_counts()

In [None]:
# Rows with a missing weather forecast.
train[train['weather_pred'].isna()]

In [None]:
# Distinct forecast descriptions containing the substring 'ясн' (missing values treated as empty strings).
train['weather_pred'][train['weather_pred'].fillna('').str.contains('ясн')].unique()

In [None]:
# Number of rows whose forecast description contains 'ясн'.
train['weather_pred'].str.contains('ясн').sum()

In [None]:
# Frequency of each observed weather description.
train['weather_fact'].value_counts()

In [None]:
# Adapted from Kaggle: component-wise decomposition of the series; still needs work to behave properly

# Settings shared by both decompositions
_decompose_kwargs = dict(period=30)

# Multiplicative and additive decompositions of the target
multiplicative_decomposition = seasonal_decompose(train['target'], model='multiplicative', **_decompose_kwargs)
additive_decomposition = seasonal_decompose(train['target'], model='additive', **_decompose_kwargs)

# Plot each decomposition on its own figure
plt.rcParams.update({'figure.figsize': (15,15), 'figure.dpi': 70})
for _title, _decomposition in (
    ('Multiplicative Decomposition', multiplicative_decomposition),
    ('Additive Decomposition', additive_decomposition),
):
    _decomposition.plot().suptitle(_title, fontsize=16)
    # Leave room at the top for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.show()

# Baseline

## `target` ровно год назад

In [None]:
train[train.date == '2020-01-01'].head() # baseline predictions should become available starting from id 8760

In [None]:
# Number of rows in the train split.
train.shape[0]

In [None]:
# Number of rows in the test split.
test.shape[0]

In [None]:
# Share of test rows in the whole dataset.
test.shape[0] / (train.shape[0] + test.shape[0])

In [None]:
# Number of distinct dates in train.
train.date.unique().shape[0]

In [None]:
# Number of distinct dates in test.
test.date.unique().shape[0]

In [None]:
# Fit the year-ago baseline from src.models on the train features and target.
model = BaselineYearAgo()
model.fit(train.drop('target', axis=1), train['target'])

# Example of a prediction that cannot be computed
# df = train[train.date >= '2020-12-31'].drop('target', axis=1)
# df['predict'] = model.predict(df)

In [None]:
# df = train[train.date >= '2020-01-01'] # from this date on the baseline model can produce a forecast
# df['predict'] = model.predict(df.drop('target', axis=1))
# train_metrics = pd.json_normalize(evaluate(df['target'], df['predict']))
# train_metrics

In [None]:
# Register the baseline; train is restricted to dates from 2020-01-01 onwards.
tracker.add_experiment(model, train[train.date >= '2020-01-01'], test, "Baseline: значение год назад относительно текущей даты")

In [None]:
# Number of distinct train dates used for scoring the baseline.
train[train.date >= '2020-01-01'].date.unique().shape

In [None]:
df2 = aggregated_daily_predictions(tracker.experiments[0]['train']).reset_index()
pd.json_normalize(evaluate(df2['target'], df2['predict']))

In [None]:
plot_prediction(tracker.experiments[0]['train'], "Train: hourly")
plot_prediction(df2, "Train: daily")

In [None]:
pd.json_normalize(evaluate(df2['target'], df2['predict']))

In [None]:
plot_prediction(tracker.experiments[0]['test'], "Test: hourly")
plot_prediction(df2, "Test: daily")

# Prophet

In [None]:
# Prophet model with default settings, fitted on the prepared train frame.
# NOTE(review): Prophet.fit expects 'ds' and 'y' columns — presumably prepare_dataset provides them; confirm.
prophetModel = Prophet()
prophetModel.fit(train)

In [None]:
# Register the run; the tracker takes the 'yhat' column of the forecast for Prophet models.
tracker.add_experiment(prophetModel, train, test, "Prophet")

In [None]:
# Join the forecast back onto the test rows by timestamp and expose yhat as 'predict'.
forecast = test.merge(prophetModel.predict(test), on ='ds')
forecast['predict'] = forecast['yhat']
forecast

In [None]:
# Metrics of the Prophet forecast on the test rows.
pd.json_normalize(evaluate(forecast['target'], forecast['predict']))

In [None]:
# Daily-aggregated metrics of the Prophet forecast.
# NOTE(review): the baseline section calls .reset_index() on the aggregate, this cell does not — confirm both are intended.
df2 = aggregated_daily_predictions(forecast)
pd.json_normalize(evaluate(df2['target'], df2['predict']))

In [None]:
metrics = tracker.metrics_df()

In [None]:
metrics

In [None]:
metrics.style.highlight_min(subset=['test_MAE', 'test_R^2', 'test_MSE', 'test_MAPE', 'test_RMSE'], color = 'green')