# Библиотеки

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import warnings

# Suppress every FutureWarning for the rest of the session so that such
# notices do not clutter the notebook output.
warnings.simplefilter("ignore", FutureWarning)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from prophet import Prophet
from statsmodels.tsa.seasonal import seasonal_decompose

from src.experiments import ExperimentTracker
from src.metrics import evaluate
from src.models import BaselineYearAgo, aggregated_daily_predictions
from src.plots import plot_prediction, plot_target_boxplots
from src.process_data import prepare_dataset, prepare_parsed_weather, read_datasets

In [None]:
# Let pandas render up to 500 columns and 500 rows of a frame.
for option in ("display.max_columns", "display.max_rows"):
    pd.set_option(option, 500)

# Default matplotlib figure: 5x5 inches at 100 dpi.
plt.rcParams.update({"figure.figsize": (5, 5), "figure.dpi": 100})

# Загрузка данных

In [None]:
# Load every dataset used by the notebook; the result is indexed by name below.
datasets = read_datasets()

In [None]:
# List the names of the loaded datasets.
datasets.keys()

In [None]:
# Collects the models and metrics of the experiments registered further down.
tracker = ExperimentTracker()

In [None]:
# Work on copies so the frames stored in `datasets` stay untouched.
train, test = (datasets[name].copy() for name in ("source_train", "source_test"))

In [None]:
# Preview the raw training frame.
train

In [None]:
# Column dtypes and non-null counts.
train.info()

In [None]:
train

# EDA

## Смотрим на train и test

In [None]:
# Order both splits by date first, then by time.
chronological = ["date", "time"]
train = train.sort_values(by=chronological)
test = test.sort_values(by=chronological)

In [None]:
# Turn the pre-sort row index into an explicit `id` column (reset_index()
# emits it as "index" when the index is unnamed, which is then renamed).
train = train.reset_index()
train = train.rename(columns={"index": "id"})
train

In [None]:
# Summary statistics of the target.
train["target"].describe()

In [None]:
# Project-specific preprocessing from src.process_data. Later cells rely on a
# `datetime` column and the Prophet cells on `ds`/`y` — presumably added
# here (TODO confirm against prepare_dataset).
train = prepare_dataset(train)
test = prepare_dataset(test)

In [None]:
train

In [None]:
# train[train.date == "2020-09-04"][["datetime", "target"]].head(20).plot(x="datetime", y="target")

In [None]:
# train[train.date == "2020-09-05"][["datetime", "target"]].head(20).plot(x="datetime", y="target")

In [None]:
# Interactive line chart of the target over time; hovering a point also
# shows the row's `date` and `time` values.
fig = px.line(
    train.sort_values(by="datetime"),
    x="datetime",
    y="target",
    hover_data=["date", "time"],
)
fig.show()

In [None]:
# Date range covered by the training split.
train["date"].min(), train["date"].max()

In [None]:
# Date range covered by the test split.
test["date"].min(), test["date"].max()

In [None]:
# Distinct values of the `time` column.
train["time"].unique()

In [None]:
# Distinct forecast descriptions containing "ясн" (stem of the Russian word
# for "clear"); NaNs are replaced by "" so the boolean mask has no gaps.
train["weather_pred"][train["weather_pred"].fillna("").str.contains("ясн")].unique()

In [None]:
# Number of rows whose forecast description contains "ясн".
train["weather_pred"].str.contains("ясн").sum()

In [None]:
# Frequency of each `weather_fact` value.
train["weather_fact"].value_counts()

In [None]:
# Calendar years present in the training data.
train['datetime'].dt.year.unique()

In [None]:
# Target box plots (implemented in src.plots).
plot_target_boxplots(train)

In [None]:
# Component-wise decomposition (adapted from a Kaggle notebook).
#
# The recipe assumes one observation per day, whereas `train` appears to be
# hourly (the baseline cells title it "hourly" and expect row 8760 on
# 2020-01-01). With period=365 on the raw rows the decomposition would model a
# 365-hour cycle, so the series is aggregated to daily totals first; period=365
# then corresponds to yearly seasonality.
daily_series = train.groupby(pd.Grouper(key="datetime", freq="D"))["target"].sum()

# One year of daily observations; seasonal_decompose needs at least two
# complete cycles, i.e. two years of data.
period = 365

# Multiplicative Decomposition
# NOTE(review): the multiplicative model requires strictly positive values;
# a day without any rows sums to 0 — confirm the daily series has no gaps.
multiplicative_decomposition = seasonal_decompose(
    daily_series, model="multiplicative", period=period
)

# Additive Decomposition
additive_decomposition = seasonal_decompose(daily_series, model="additive", period=period)

# Plot
# plt.rcParams.update({"figure.figsize": (15, 15), "figure.dpi": 70})
multiplicative_decomposition.plot().suptitle("Multiplicative Decomposition", fontsize=16)
plt.tight_layout(rect=(0, 0.03, 1, 0.95))

additive_decomposition.plot().suptitle("Additive Decomposition", fontsize=16)
plt.tight_layout(rect=(0, 0.03, 1, 0.95))

plt.show()

Ошибка прогноза температуры

In [None]:
# Forecast error: `temp_pred` minus `temp`, computed once and reused for
# both the summary statistics and the histogram.
temp_error = train["temp_pred"] - train["temp"]

print(temp_error.describe())
temp_error.hist(bins=50)
plt.title('Ошибка прогноза температуры')
plt.show()

## Стационарность

In [None]:
# Total target per calendar day.
by_day = pd.Grouper(key="datetime", freq="D")
daily_target = train.groupby(by_day)["target"].sum()

In [None]:
# Total target per calendar month ("MS" labels each bin by its month start).
by_month = pd.Grouper(key="datetime", freq="MS")
monthly_target = train.groupby(by_month)["target"].sum()

In [None]:
from src.plots import plot_time_series

In [None]:
# Plot the daily totals; the second argument (30) is presumably a window
# length in periods — TODO confirm against src.plots.plot_time_series.
plot_time_series(daily_target, 30, "Daily")

In [None]:
# Same plot for the monthly totals, with 12 as the second argument.
plot_time_series(monthly_target, 12, "Monthly")

Тест на стационарность

In [None]:
from src.plots import test_stationarity

In [None]:
# Run the project's stationarity check on the daily totals; which statistical
# test it applies is defined in src.plots.test_stationarity (not visible here).
test_stationarity(daily_target)

## Корреляции

In [None]:
# Numeric columns whose correlation with the target is inspected.
num_features = ["temp", "temp_pred"]
# Stack both splits so the correlations cover train and test together.
sample = pd.concat((train, test))

In [None]:
# Pairwise correlations between the numeric features and the target.
sample[[*num_features, "target"]].corr()

## Погода в данных vs погода, полученная парсингом

In [None]:
# Row/column count of train before it is joined with the parsed weather.
train.shape

In [None]:
# Project-specific cleanup of the separately parsed weather table; the cells
# below rely on its `datetime` and `weather_category_parsed` columns.
weather = prepare_parsed_weather(datasets['weather_parsed'])
weather

In [None]:
# Inner join (the pandas default): only timestamps present in both frames
# end up in `comparison`.
comparison = pd.merge(train, weather, on="datetime")

In [None]:
# Spot-check the parsed weather for a single timestamp.
weather[weather.datetime == '2019-01-01 11:00:00']

In [None]:
# How often the dataset's weather and the parsed weather agree on rain
# ("дожд" is the stem of the Russian word for rain).
# `na=False` makes a missing description count as "no rain". Without it
# str.contains yields NaN for such rows and NaN == NaN is False, so rows that
# are missing on both sides were reported as disagreements.
rain_in_data = comparison["weather_fact"].str.contains("дожд", na=False)
rain_in_parsed = comparison["weather_category_parsed"].str.contains("дожд", na=False)
(rain_in_data == rain_in_parsed).value_counts()

In [None]:
# Frequency of each `weather_fact` value in train.
train['weather_fact'].value_counts()

In [None]:
# Joined frame: train rows together with the parsed-weather columns.
comparison

In [None]:
# Timestamps used as the join key.
train['datetime']

In [None]:
# Value distribution of the raw `Cl` column of the parsed weather table.
datasets['weather_parsed']['Cl'].value_counts()

# Baseline

## `target` ровно год назад

In [None]:
train[train.date == "2020-01-01"].head()  # baseline predictions should become available starting from id 8760

In [None]:
# Number of rows in train.
train.shape[0]

In [None]:
# Number of rows in test.
test.shape[0]

In [None]:
# Share of test rows in the combined data.
test.shape[0] / (train.shape[0] + test.shape[0])

In [None]:
# Number of distinct dates in train.
train.date.unique().shape[0]

In [None]:
# Number of distinct dates in test.
test.date.unique().shape[0]

In [None]:
# Fit the year-ago baseline on every column except the target, with the
# target passed separately.
model = BaselineYearAgo()
features = train.drop(columns="target")
model.fit(features, train["target"])

# Example of a prediction that cannot be computed
# df = train[train.date >= '2020-12-31'].drop('target', axis=1)
# df['predict'] = model.predict(df)

In [None]:
# df = train[train.date >= '2020-01-01'] # с этой даты baseline модель может выдать прогноз
# df['predict'] = model.predict(df.drop('target', axis=1))
# train_metrics = pd.json_normalize(evaluate(df['target'], df['predict']))
# train_metrics

In [None]:
# Register the baseline using only the train rows dated 2020-01-01 or later.
baseline_train = train[train.date >= "2020-01-01"]
tracker.add_experiment(
    model,
    baseline_train,
    test,
    "Baseline: значение год назад относительно текущей даты",
)

In [None]:
# Number of distinct dates in the train subset handed to the baseline experiment.
train[train.date >= "2020-01-01"].date.unique().shape

In [None]:
# Aggregate the baseline's train predictions per day and evaluate them.
baseline_train_predictions = tracker.experiments[0]["train"]
df2 = aggregated_daily_predictions(baseline_train_predictions).reset_index()
daily_metrics = evaluate(df2["target"], df2["predict"])
pd.json_normalize(daily_metrics)

In [None]:
# Baseline on train: first the rows stored by the tracker, then their daily
# aggregate (`df2`).
plot_prediction(tracker.experiments[0]["train"], "Train: hourly")
plot_prediction(df2, "Train: daily")

In [None]:
# Daily-aggregated baseline metrics on the test split. The cell previously
# repeated the train-based `df2` evaluation verbatim, so the test metrics were
# never shown next to the test plots.
test_daily = aggregated_daily_predictions(tracker.experiments[0]["test"]).reset_index()
pd.json_normalize(evaluate(test_daily["target"], test_daily["predict"]))

In [None]:
# Aggregate the *test* predictions before plotting them: `df2` holds the daily
# aggregate of the train predictions and must not appear under a "Test" title.
test_predictions = tracker.experiments[0]["test"]
test_daily = aggregated_daily_predictions(test_predictions).reset_index()
plot_prediction(test_predictions, "Test: hourly")
plot_prediction(test_daily, "Test: daily")

# Prophet

## Baseline

In [None]:
# Start over from the raw data: fresh copies, preprocessed from scratch, so
# the changes made to train/test in the cells above are discarded.
train, test = (
    prepare_dataset(datasets[split].copy())
    for split in ("source_train", "source_test")
)

In [None]:
# Prophet with default settings. fit() needs `ds` and `y` columns in the
# frame — presumably produced by prepare_dataset (TODO confirm).
prophetBaseline = Prophet()
prophetBaseline.fit(train)

In [None]:
# Register the default Prophet model with the experiment tracker.
tracker.add_experiment(prophetBaseline, train, test, "Prophet baseline")

In [None]:
# Attach Prophet's forecast columns to the test rows (matched on `ds`) and
# expose `yhat` under the `predict` name used by the evaluation cells.
baseline_forecast = prophetBaseline.predict(test)
forecast = test.merge(baseline_forecast, on="ds")
forecast["predict"] = forecast["yhat"]
forecast

In [None]:
# Metrics of the Prophet baseline on the individual test rows.
pd.json_normalize(evaluate(forecast["target"], forecast["predict"]))

In [None]:
# The same metrics after aggregating the forecast per day.
df2 = aggregated_daily_predictions(forecast)
pd.json_normalize(evaluate(df2["target"], df2["predict"]))

## Add regressors: Температура

In [None]:
# Number of missing temperature forecasts in train.
train.temp_pred.isna().sum()

In [None]:
# Fill each gap with the last known forecast.
# NOTE(review): ffill follows the current row order and leaves leading NaNs
# untouched — confirm the frame is in chronological order at this point.
train["temp_pred"] = train["temp_pred"].ffill()

In [None]:
# Same forward fill for the test split.
test["temp_pred"] = test["temp_pred"].ffill()

In [None]:
# Prophet with both temperature columns as additional regressors; only the
# columns the model needs are passed to fit().
temperature_regressors = ["temp", "temp_pred"]
prophet2 = Prophet()
for regressor in temperature_regressors:
    prophet2.add_regressor(regressor)
prophet2.fit(train[["ds", *temperature_regressors, "y"]])

In [None]:
# Register the temperature-aware Prophet model with the experiment tracker.
tracker.add_experiment(prophet2, train, test, "Prophet with temperature")

In [None]:
# prophet2.plot_components(forecast)

## Add regressors: Температура и население (праздники пока отключены)

In [None]:
# Attach the population table to every row by matching the calendar year of
# `datetime` against the table's `year` column.
# NOTE(review): merge defaults to an inner join, so rows whose year is absent
# from the population table are dropped silently — confirm every year in
# train/test is covered.
train = train.merge(datasets["population"], left_on=train["datetime"].dt.year, right_on="year")
test = test.merge(datasets["population"], left_on=test["datetime"].dt.year, right_on="year")

In [None]:
# The population table itself.
datasets["population"]

In [None]:
# Prophet with temperature and population as additional regressors; country
# holidays are currently disabled (see the commented-out line).
population_regressors = ["temp", "temp_pred", "population"]
prophet3 = Prophet()
for regressor in population_regressors:
    prophet3.add_regressor(regressor)
# prophet3.add_country_holidays('RU')
prophet3.fit(train[["ds", *population_regressors, "y"]])

In [None]:
# prophet3 uses temperature and population only — its country-holidays line is
# commented out — so the label must not advertise holidays in the metrics table.
tracker.add_experiment(prophet3, train, test, "Prophet with temperature and population")

In [None]:
# Show the metrics of every registered experiment.
tracker.display_metrics()

In [None]:
# Model object of the experiment the tracker considers best.
tracker.get_best_experiment()['model']

In [None]:
# Persist the tracker to disk as a pickle.
tracker.save_to_pickle()

In [None]:
# Load a saved tracker and show its metrics again; presumably this reads the
# pickle written above (TODO confirm the default path). Only unpickle files
# you created yourself.
ExperimentTracker.load_experiment().display_metrics()