# Библиотеки

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before code is executed (handy while editing the `src` package).
%load_ext autoreload
%autoreload 2

In [None]:
import warnings

# Ignore every FutureWarning raised from here on
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from prophet import Prophet
from statsmodels.tsa.seasonal import seasonal_decompose

from src.experiments import ExperimentTracker
from src.metrics import evaluate
from src.models import BaselineYearAgo, aggregated_daily_predictions
from src.plots import plot_prediction, plot_target_boxplots
from src.process_data import prepare_dataset, prepare_parsed_weather, prepare_holidays, read_datasets

In [None]:
# Let pandas display up to 500 columns and 500 rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 500)

# Default matplotlib figure: 5x5 inches at 100 dpi
plt.rcParams.update({"figure.figsize": (5, 5), "figure.dpi": 100})

# Загрузка данных

In [None]:
# Load all datasets through the project helper
datasets = read_datasets()

In [None]:
# Show the keys of `datasets`
datasets.keys()

In [None]:
# Tracker object that the experiments below are registered in
tracker = ExperimentTracker()

In [None]:
# Work on copies of the source splits so the entries stored in `datasets` are not touched
train = datasets["source_train"].copy()
test = datasets["source_test"].copy()

In [None]:
# NOTE(review): `prepare_holidays` is already imported in the main import cell at the top; this import is redundant
from src.process_data import prepare_holidays

# EDA

## Смотрим на train и test

In [None]:
# Display the full source dataset
datasets['source_full']

In [None]:
# Build the prepared dataset from the full source data and display it
prepared_dataset = prepare_dataset(datasets["source_full"])
# prepared_dataset.isna().sum()
prepared_dataset

In [None]:
# weather_parsed  = prepare_parsed_weather(datasets['weather_parsed']).set_index('datetime')
# data_leak_columns = weather_parsed.columns.to_list()
# prepared_dataset.loc[:24, [col + '_yesterday' for col in data_leak_columns]].bfill()

In [None]:
# Split the prepared data by the `is_train` flag; the flag column itself is dropped
is_train_mask = prepared_dataset["is_train"]
datasets["prepared_train"] = prepared_dataset.loc[is_train_mask].drop(columns="is_train")
datasets["prepared_test"] = prepared_dataset.loc[~is_train_mask].drop(columns="is_train")

In [None]:
# Rebind train/test to the prepared splits (no copy is made here)
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
# First 24 rows of the prepared train split
train.head(24)

In [None]:
# Display the result of preparing the parsed weather data
prepare_parsed_weather(datasets['weather_parsed'])

In [None]:
# train[train.date == "2020-09-04"][["datetime", "target"]].head(20).plot(x="datetime", y="target")

In [None]:
# train[train.date == "2020-09-05"][["datetime", "target"]].head(20).plot(x="datetime", y="target")

In [None]:
# Interactive line chart of the target over time (rows sorted by `datetime` first)
fig = px.line(train.sort_values("datetime"), x="datetime", y="target", hover_data=["date", "time"])
fig.show()

In [None]:
# Smallest and largest `date` in the train split
train["date"].min(), train["date"].max()

In [None]:
# Smallest and largest `date` in the test split
test["date"].min(), test["date"].max()

In [None]:
# Distinct values of the `time` column
train["time"].unique()

In [None]:
# Target box plots drawn by the project helper
plot_target_boxplots(train)

In [None]:
# Component-wise decomposition of the target (adapted from a Kaggle example).
#
# The data is hourly (one year is 8760 rows), so the seasonal period is
# expressed in hours: 24 observations form one daily cycle. The former value
# of 365 described a 365-hour cycle, which matches no seasonality of the data.
# A yearly cycle would be 24 * 365 and needs at least two full years of history.
period = 24

# Index by timestamp in chronological order, so the decomposition receives an
# ordered series and the plots get a datetime axis instead of row numbers.
hourly_target = train.set_index("datetime")["target"].sort_index()

# Multiplicative Decomposition
multiplicative_decomposition = seasonal_decompose(
    hourly_target, model="multiplicative", period=period
)

# Additive Decomposition
additive_decomposition = seasonal_decompose(hourly_target, model="additive", period=period)

# Plot
# plt.rcParams.update({"figure.figsize": (15, 15), "figure.dpi": 70})
multiplicative_decomposition.plot().suptitle("Multiplicative Decomposition", fontsize=16)
# Leave room at the top of the figure for the suptitle
plt.tight_layout(rect=(0, 0.03, 1, 0.95))

additive_decomposition.plot().suptitle("Additive Decomposition", fontsize=16)
plt.tight_layout(rect=(0, 0.03, 1, 0.95))

plt.show()

Ошибка прогноза температуры

In [None]:
# Difference between the `temp_pred` and `temp` columns: summary statistics and histogram
temp_error = train['temp_pred'] - train['temp']
print(temp_error.describe())
temp_error.hist(bins=50)
plt.title('Ошибка прогноза температуры')
plt.show()

## Стационарность

In [None]:
daily_target = train.groupby(pd.Grouper(key='datetime', freq='D'))['target'].sum()

In [None]:
monthly_target = train.groupby(pd.Grouper(key='datetime', freq='MS'))['target'].sum()

In [None]:
from src.plots import plot_time_series

In [None]:
# Plot the daily series.
# NOTE(review): the second argument (30 here, 12 below) is presumably a window/lag size — confirm in src.plots
plot_time_series(daily_target, 30, "Daily")

In [None]:
# Plot the monthly series
plot_time_series(monthly_target, 12, "Monthly")

Тест на стационарность

In [None]:
from src.plots import test_stationarity

In [None]:
# Stationarity check on the original target series, indexed by `datetime`
test_stationarity(train.set_index('datetime')['target'])

In [None]:
# Same check on the daily totals
test_stationarity(daily_target)

In [None]:
# Same check on the monthly totals
test_stationarity(monthly_target)

## Корреляции

In [None]:
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
sample = pd.concat([train, test]).drop(['wind_direction'], axis=1)

In [None]:
px.imshow(pd.get_dummies(sample.set_index('datetime').drop(['date', 'ds'], axis=1)).replace({False: 0, True: 1}).corr(), height=1000, width = 1500, range_color=(-1, 1))

In [None]:
px.scatter(train, x = "wind_speed_yesterday", y = "target", width=800, height=800)

# Модели

## Baseline

### `target` ровно год назад

In [None]:
train[train.date == "2020-01-01"].head()  # с id 8760 должны появиться предсказания baseline модели

In [None]:
train.shape[0]

In [None]:
test.shape[0]

In [None]:
test.shape[0] / (train.shape[0] + test.shape[0])

In [None]:
train.date.unique().shape[0]

In [None]:
test.date.unique().shape[0]

In [None]:
# Fit the year-ago baseline on the train features and target
model = BaselineYearAgo()
model.fit(train.drop("target", axis=1), train["target"])

# Example of a prediction that cannot be computed
# df = train[train.date >= '2020-12-31'].drop('target', axis=1)
# df['predict'] = model.predict(df)

In [None]:
# df = train[train.date >= '2020-01-01'] # from this date on the baseline model can produce a forecast
# df['predict'] = model.predict(df.drop('target', axis=1))
# train_metrics = pd.json_normalize(evaluate(df['target'], df['predict']))
# train_metrics

In [None]:
# Register the baseline; only train rows dated 2020-01-01 or later are passed
tracker.add_experiment(
    model,
    train[train.date >= "2020-01-01"],
    test,
    "Baseline: значение год назад относительно текущей даты",
)

In [None]:
# Shape of the array of distinct dates among those train rows
train[train.date >= "2020-01-01"].date.unique().shape

In [None]:
df2 = aggregated_daily_predictions(tracker.experiments[0]["train"]).reset_index()
pd.json_normalize(evaluate(df2["target"], df2["predict"]))

In [None]:
plot_prediction(tracker.experiments[0]["train"], "Train: hourly")
plot_prediction(df2, "Train: daily")

In [None]:
pd.json_normalize(evaluate(df2["target"], df2["predict"]))

In [None]:
plot_prediction(tracker.experiments[0]["test"], "Test: hourly")
plot_prediction(df2, "Test: daily")

## Prophet

In [None]:
# Rebind train/test to the prepared splits (no copy is made here)
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
# Prophet with default settings, fitted on the prepared train frame
prophetBaseline = Prophet()
prophetBaseline.fit(train)

In [None]:
# `yhat` column of the forecast for the test rows
prophetBaseline.predict(test.drop('target', axis=1))['yhat']

In [None]:
tracker.add_experiment(prophetBaseline, train, test, "Prophet baseline")

In [None]:
tracker.display_metrics()

In [None]:
# Join the Prophet forecast to the test rows on `ds` and expose `yhat` as `predict`
forecast = test.merge(prophetBaseline.predict(test), on="ds")
forecast["predict"] = forecast["yhat"]
forecast

### Add regressors: Температура

In [None]:
# Number of missing values in `temp_pred`
train.temp_pred.isna().sum()

In [None]:
# Forward-fill the gaps in `temp_pred`.
# NOTE(review): `train` was bound to datasets["prepared_train"] without a copy, so this
# assignment presumably also changes the stored frame that later sections reload — confirm this is intended
train["temp_pred"] = train["temp_pred"].ffill()

In [None]:
# Same forward-fill for the test split
test["temp_pred"] = test["temp_pred"].ffill()

In [None]:
# Prophet with `temp` and `temp_pred` as extra regressors.
# NOTE(review): `temp` looks like the observed temperature (it is compared with `temp_pred` in the EDA) —
# confirm it is really available at forecast time
prophet2 = Prophet()
prophet2.add_regressor("temp")
prophet2.add_regressor("temp_pred")
prophet2.fit(train[["ds", "temp", "temp_pred", "y"]])

In [None]:
tracker.add_experiment(prophet2, train, test, "Prophet with temperature")

In [None]:
# prophet2.plot_components(forecast)

### Add regressors: Температура, праздники и население

In [None]:
# Rebind train/test to the prepared splits (no copy is made here)
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
# One-hot encode the frames after dropping `date`
train = pd.get_dummies(train.drop('date', axis=1))
test = pd.get_dummies(test.drop('date', axis=1))

In [None]:
# Give test exactly the train columns, in the same order, then replace missing values by False.
# NOTE(review): `fillna(False)` applies to every column, so NaNs that already existed in test
# are replaced as well, not only the newly added columns — confirm this is intended
test = test.reindex(columns = train.columns).fillna(False)

In [None]:
# Build the holidays frame passed to Prophet (`ds` and `holiday` columns).
# `assign` works on a derived copy, so the frame stored in `datasets` is no
# longer modified in place when its `day` column is parsed into datetimes.
holidays2 = (
    datasets["holidays"]
    .assign(day=lambda frame: pd.to_datetime(frame["day"]))
    .rename(columns={"day": "ds"})
)
# Keep only rows with type == 1, back-fill missing values and drop rows that are still incomplete
holidays2 = holidays2[holidays2["type"] == 1][["ds", "holiday"]].bfill().dropna()
holidays2.head()

In [None]:
# train.columns[train.columns.str.contains('yesterday')]

In [None]:
prophet3 = Prophet(holidays = holidays2)
prophet3.add_regressor("temp")
prophet3.add_regressor("temp_pred")
prophet3.add_regressor("population")
prophet3.add_regressor("is_weekend")
prophet3.add_regressor("atm_pressure_yesterday")
prophet3.add_regressor("humidity_yesterday")
prophet3.add_regressor("wind_speed_yesterday")
for col in train.columns[train.columns.str.contains('precipitation_pred')]:
    prophet3.add_regressor(col)

prophet3.fit(train)

In [None]:
tracker.add_experiment(prophet3, train, test, "Prophet with temperature, holidays and population 2")

In [None]:
tracker.display_metrics()

## ARIMA

In [None]:
# Rebind train/test to the prepared splits (no copy is made here)
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
# Drop `date`, one-hot encode, and index both frames by `datetime`
train = pd.get_dummies(train.drop('date', axis=1)).set_index('datetime')
test = pd.get_dummies(test.drop('date', axis=1)).set_index('datetime')

In [None]:
# train[train.season == 4]['target'].hist(bins=50)

In [None]:
# train['target'].hist(bins=50)

In [None]:
# from statsmodels.graphics.tsaplots import plot_acf

# plot_acf(train.set_index('datetime')['target'])

In [None]:
# from statsmodels.graphics.tsaplots import plot_acf

# plot_acf(train.set_index('datetime')['target'])

In [None]:
from pmdarima.arima import auto_arima   

In [None]:
import pmdarima
from pmdarima import pipeline
from pmdarima import preprocessing as ppc

In [None]:
cols = [
"target",
"temp",
"temp_pred",
"population",
"is_weekend",
"atm_pressure_yesterday",
"humidity_yesterday",
"wind_speed_yesterday"] + \
train.columns[train.columns.str.contains('precipitation_pred')].to_list()

In [None]:
test = test[cols].replace({True: 1, False: 0})

In [None]:
train = train[cols].replace({True: 1, False: 0})

In [None]:
# arima_with_fourier = pipeline.Pipeline([
#     ("fourier", ppc.FourierFeaturizer(m=12, k=4)),
#     ("arima", pmdarima.arima.AutoARIMA(stepwise=True, trace=1, error_action="ignore",
#                               seasonal=False,  # because we use Fourier
#                               suppress_warnings=True))
# ])

# arima_with_fourier.fit(train['target'], train[cols].drop('target', axis=1))
# print(arima_with_fourier)

In [None]:
from statsmodels.tsa.arima.model import ARIMA

arima_model = ARIMA(train['target'], train[cols].drop('target', axis=1), order=(3, 1, 2)).fit()
print(arima_model.summary())

In [None]:
arima_model.plot_diagnostics(figsize = (20, 10))
plt.tight_layout()
plt.show()

In [None]:
arima_model.forecast(len(test), exog=test[cols].drop('target', axis=1))

In [None]:
type(arima_model)

In [None]:
# help(arima_model.predict)

In [None]:
tracker.add_experiment(arima_model, train[cols], test[cols], name = "Statsmodels ARIMA baseline")

In [None]:
tracker.display_metrics()

## LightGBM

In [None]:
train = datasets["prepared_train"]
test = datasets["prepared_test"]

In [None]:
removed_cols = ['date', 'ds', 'wind_direction', 'wind_direction_yesterday']
train = pd.get_dummies(train.drop(removed_cols, axis=1)).set_index('datetime')
test = pd.get_dummies(test.drop(removed_cols, axis=1)).set_index('datetime')

In [None]:
from sklearn.model_selection import train_test_split
train, val = train_test_split(train, test_size=0.2)

In [None]:
import lightgbm as lgb

In [None]:
lgb_params = {'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 10000,
              'early_stopping_rounds': 200,
              'nthread': -1}

In [None]:
cols = train.drop('target', axis=1).columns.to_list()

In [None]:
lgbtrain = lgb.Dataset(data=train.drop('target', axis=1), label=train['target'], feature_name=cols)
lgbval = lgb.Dataset(data=val.drop('target', axis=1), label=val['target'],  reference=lgbtrain, feature_name=cols)


In [None]:
model = lgb.train(lgb_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval])

In [None]:
# Predictions for the training rows
model.predict(train.drop('target', axis=1))

In [None]:
# Metrics of the model on the test split
evaluate(test['target'], model.predict(test.drop('target', axis=1)))

In [None]:
# Display the tracker object
tracker

In [None]:
# model = lgb.train(lgb_params, lgbtrain,
#                   valid_sets=[lgbtrain, lgbval],
#                   num_boost_round=lgb_params['num_boost_round'],
#                   early_stopping_rounds=lgb_params['early_stopping_rounds'],
#                 #   feval=lgbm_smape,
#                   verbose_eval=100)

In [None]:
# Register the LightGBM model as an experiment
tracker.add_experiment(model, train, test, "LightGBM")

# Метрики и сохранение результатов

In [None]:
# Show the metrics of every registered experiment
tracker.display_metrics()

In [None]:
# Model stored in the experiment the tracker reports as best
tracker.get_best_experiment()['model']

In [None]:
# Persist the tracker
tracker.save()

In [None]:
# Load a tracker back and display its metrics
ExperimentTracker.load_tracker().display_metrics()