In [None]:
import sys
from typing import List

import pandas as pd
import tensorflow as tf
import numpy as np
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

# Fix the random seed through Keras so repeated runs are comparable.
keras.utils.set_random_seed(1)
# Extend the import search path with the relative 'src' directory; this has to
# happen before the import below (presumably the `data` package lives there --
# verify against the repository layout).
sys.path.append('src')

from data.read_data import read_data

def add_target(df, cases_column: str = "cases", target_column: str = 'target'):
    """Add a "next row" target column to ``df`` in place.

    Row ``i`` of ``target_column`` receives the value ``cases_column`` holds in
    row ``i + 1``. The last row has no successor, so its target is NaN.
    The frame is modified in place and nothing is returned.
    """
    next_row_cases = df[cases_column].shift(periods=-1)
    df[target_column] = next_row_cases


def scale_data(df, columns: List[str]):
    """Standardise the given columns of ``df`` in place.

    A fresh ``StandardScaler`` is fitted on ``df[columns]`` and the transformed
    values are written back into the same columns. Nothing is returned.

    Alternatives that were tried, per the author's notes:
    ``Normalizer`` (does not improve at all) and ``MinMaxScaler`` (worse);
    ``StandardScaler`` gave the better result.
    """
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])



def split_data(df, targets_column: str):
    """Separate a frame into feature columns and target values.

    Returns a tuple ``(x, y)``: ``x`` is a new DataFrame with every column
    except ``targets_column``; ``y`` is the ``.values`` of that column.
    The caller's ``df`` is left unchanged.
    """
    features = df.drop(columns=[targets_column])
    targets = df[targets_column].values
    return features, targets


def add_windows(df: pd.DataFrame, n_steps=None):
    """Build lagged copies of ``df`` and return them in a 2-D and a 3-D layout.

    For every ``step`` in ``1 .. n_steps - 1`` the whole frame is shifted down
    by ``step`` rows, so row ``i`` of that copy holds the values of row
    ``i - step``. Rows that would reach before the start of the frame are NaN.

    Args:
        df: Source frame; it is not modified.
        n_steps: Window length (unshifted frame plus ``n_steps - 1`` lags).
            Defaults to the module-level ``N_STEPS``.

    Returns:
        A tuple ``(_2d, _3d)`` of NumPy arrays:

        * ``_2d`` has shape ``(rows, n_steps * columns)``: the original columns
          followed by the lag-1 columns, the lag-2 columns, and so on.
        * ``_3d`` has shape ``(n_steps, rows, columns)``; index 0 on the first
          axis is the unshifted frame, index ``k`` is the lag-``k`` frame.
    """
    if n_steps is None:
        n_steps = N_STEPS

    # Shift the whole frame at once; the suffix reproduces the
    # '<column>-<step>' labels of the lagged columns.
    frames = [df]
    frames.extend(
        df.shift(step).add_suffix(f'-{step}') for step in range(1, n_steps)
    )

    _2d = pd.concat(frames, axis=1).values
    _3d = np.stack([frame.values for frame in frames])

    return _2d, _3d


def remove_rows_with_nan(matrix, tensor, targets, n_steps=None):
    """Trim the leading and trailing rows from all three arrays consistently.

    The first ``n_steps - 1`` rows and the last row are dropped, i.e. the
    slice ``[n_steps - 1:-1]`` is kept. With the windowing and target shift
    used in this file, those are the rows where a lag or the next-row target
    can be NaN.

    Args:
        matrix: 2-D array indexed by row on axis 0.
        tensor: 3-D array indexed by row on axis 1.
        targets: Sequence with one entry per row.
        n_steps: Window length used when the lags were built. Defaults to the
            module-level ``N_STEPS``.

    Returns:
        The trimmed ``(matrix, tensor, targets)``.

    Raises:
        ValueError: If the three inputs do not describe the same number of rows.
    """
    if n_steps is None:
        n_steps = N_STEPS

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if not (matrix.shape[0] == tensor.shape[1] == len(targets)):
        raise ValueError(
            "row counts differ: "
            f"matrix={matrix.shape[0]}, tensor={tensor.shape[1]}, targets={len(targets)}"
        )

    kept_rows = slice(n_steps - 1, -1)
    return matrix[kept_rows], tensor[:, kept_rows, :], targets[kept_rows]


def add_aggregates_2d(matrix, cases_column_idx=None, n_columns=None, n_steps=None):
    """Append per-row window statistics of the cases feature to ``matrix``.

    The matrix is expected to be laid out as ``n_steps`` consecutive groups of
    ``n_columns`` columns (unshifted features, then lag 1, lag 2, ...). The
    cases value of each group sits at ``cases_column_idx + n_columns * step``.

    Args:
        matrix: 2-D array of windowed features.
        cases_column_idx: Position of the cases feature inside one group.
            Defaults to the module-level ``CASES_COLUMN_IDX``.
        n_columns: Number of feature columns per group. Defaults to the
            module-level ``columns_number``.
        n_steps: Number of groups. Defaults to the module-level ``N_STEPS``.

    Returns:
        A new array with five extra trailing columns, in this order:
        min, max, mean, median and (max - min) of the cases values in each row.
    """
    if cases_column_idx is None:
        cases_column_idx = CASES_COLUMN_IDX
    if n_columns is None:
        n_columns = columns_number
    if n_steps is None:
        n_steps = N_STEPS

    cases_columns = [cases_column_idx + (n_columns * step) for step in range(n_steps)]
    # Fancy indexing copies, so extract the cases window once and reuse it.
    cases = matrix[:, cases_columns]

    mean_values = np.mean(cases, axis=1)
    min_values = np.min(cases, axis=1)
    max_values = np.max(cases, axis=1)
    median_values = np.median(cases, axis=1)
    delta = max_values - min_values

    return np.c_[matrix, min_values, max_values, mean_values, median_values, delta]


def append_vector_to_tensor(tensor, vector):
    """Append ``vector`` as one extra column on the last axis of ``tensor``.

    ``tensor`` is a 3-D array and ``vector`` a 1-D array whose length equals
    ``tensor.shape[1]``. The same vector is repeated for every index along the
    tensor's first axis, so the result has shape
    ``(tensor.shape[0], tensor.shape[1], tensor.shape[2] + 1)``.
    """
    n_layers = tensor.shape[0]
    # (rows,) -> (1, rows, 1): a single layer holding one column.
    column = np.asanyarray(vector)[np.newaxis, :, np.newaxis]
    # One copy of that layer per entry along the tensor's first axis.
    layers = np.repeat(column, n_layers, axis=0)
    return np.concatenate((tensor, layers), axis=2)


def add_aggregates_3d(tensor, cases_column_idx=None):
    """Append per-row window statistics of the cases feature to ``tensor``.

    ``tensor`` is indexed as ``[step, row, feature]``. For every row the
    statistics are taken over the step axis of the cases feature, and the same
    values are repeated for every step of that row.

    Args:
        tensor: 3-D array of windowed features.
        cases_column_idx: Index of the cases feature on the last axis.
            Defaults to the module-level ``CASES_COLUMN_IDX``.

    Returns:
        A new array with five extra trailing features, in this order:
        mean, min, max, median and (max - min).
    """
    if cases_column_idx is None:
        cases_column_idx = CASES_COLUMN_IDX

    # Slice the cases window once: shape (steps, rows).
    cases = tensor[:, :, cases_column_idx]

    mean_values = np.mean(cases, axis=0)
    min_values = np.min(cases, axis=0)
    max_values = np.max(cases, axis=0)
    median_values = np.median(cases, axis=0)
    delta = max_values - min_values

    # (rows, 5) block of statistics, viewed as (steps, rows, 5) so that a
    # single concatenate replaces five successive full-tensor copies.
    aggregates = np.stack(
        [mean_values, min_values, max_values, median_values, delta], axis=-1
    )
    repeated = np.broadcast_to(aggregates, (tensor.shape[0],) + aggregates.shape)
    return np.concatenate((tensor, repeated), axis=2)

# Window length: each sample holds the current row plus N_STEPS - 1 lagged rows.
N_STEPS = 27

# Toy data for debugging the windowing helpers:
# l = [[1, 0, 0], [2, 0, 0], [3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0], [8, 0, 0], [9, 0, 0], [10, 0, 0]]
# df = pd.DataFrame(l, columns=['cases', 'b', 'c'])
# df = df.drop(["b", "c"], axis=1)

df = read_data()
# Take an explicit copy of the filtered rows: the column assignment on the next
# line would otherwise write into a slice of the original frame and trigger
# pandas' SettingWithCopyWarning.
df = df[df["countryterritoryCode"] == "POL"].copy()
df['dateRep'] = pd.to_datetime(df['dateRep'], format="%d/%m/%Y")
# Chronological order is required because targets and lags are built by shifting rows.
df = df.sort_values("dateRep")
df = df.drop(
    ["countryterritoryCode", "continentExp", "geoId", "countriesAndTerritories", "dateRep", "day", "month", "year",
     "popData2020", "deaths"], axis=1)

# Number of feature columns per time step, counted before the target is added.
columns_number = len(df.columns)
# Assumes 'cases' is the first remaining column -- TODO confirm against read_data().
CASES_COLUMN_IDX = 0

# The target is derived before scaling, so it stays in raw units while the
# 'cases' feature is standardised.
add_target(df)
# NOTE(review): the scaler is fitted on the full series before the train/test
# split, so test-period statistics leak into the training features. Consider
# fitting on the training part only.
scale_data(df, columns=['cases'])
x, y = split_data(df, 'target')
# shuffle=False keeps the split chronological (first 70% train, last 30% test).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=False)
del x, y
x_train_2d, x_train_3d = add_windows(x_train)
x_test_2d, x_test_3d = add_windows(x_test)
del x_train, x_test
x_train_2d, x_train_3d, y_train = remove_rows_with_nan(x_train_2d, x_train_3d, y_train)
x_test_2d, x_test_3d, y_test = remove_rows_with_nan(x_test_2d, x_test_3d, y_test)

x_train_2d = add_aggregates_2d(x_train_2d)
x_test_2d = add_aggregates_2d(x_test_2d)

x_train_3d = add_aggregates_3d(x_train_3d)
x_test_3d = add_aggregates_3d(x_test_3d)

# Recurrent model: a bidirectional SimpleRNN over the window, followed by a
# small dense head that outputs a single regression value.
keras_model = keras.Sequential([
        keras.layers.Bidirectional(
            keras.layers.SimpleRNN(units=64, activation="relu"),
        ),
        keras.layers.Dense(units=4, activation="relu"),
        keras.layers.Dense(units=1)

    ])
# Configure training: MSE loss, Adam optimiser, MAE reported as an extra metric.
keras_model.compile(loss=keras.losses.MeanSquaredError(),
              optimizer=keras.optimizers.Adam(),
              metrics=[keras.metrics.MeanAbsoluteError()])
# The 3-D arrays are laid out as (steps, rows, features); swap the first two
# axes so that rows (samples) come first, as the recurrent layer expects.
xtensor_train = tf.convert_to_tensor(np.transpose(x_train_3d, (1, 0, 2)), dtype=tf.float16)
xtensor_test = tf.convert_to_tensor(np.transpose(x_test_3d, (1, 0, 2)), dtype=tf.float16)

keras_model.fit(xtensor_train, y_train, epochs=50)

predicted_keras = keras_model.predict(xtensor_test)

# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so the order
# matters. RMSE is computed as sqrt(MSE) because the `squared=False` keyword is
# deprecated in recent scikit-learn releases.
rmse_keras = np.sqrt(mean_squared_error(y_test, predicted_keras))
r2_keras = r2_score(y_test, predicted_keras)

print("nn", rmse_keras, r2_keras)

# Baseline: a decision tree trained on the flat (2-D) windowed features.
sk_model = DecisionTreeRegressor()

sk_model.fit(x_train_2d, y_train)

predicted_sk = sk_model.predict(x_test_2d)
# scikit-learn metrics take (y_true, y_pred). R^2 is not symmetric, so the order
# matters. RMSE is computed as sqrt(MSE) because the `squared=False` keyword is
# deprecated in recent scikit-learn releases.
rmse_sk = np.sqrt(mean_squared_error(y_test, predicted_sk))
r2_sk = r2_score(y_test, predicted_sk)

print("sk", rmse_sk, r2_sk)