In [1]:
import os
import sys

# Make the modules under ../scripts importable; the relative path is resolved
# against the current working directory and put first on the import path.
sys.path.insert(0, os.path.abspath("../scripts"))

In [73]:
import pickle

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from statsmodels.tsa.ar_model import AR
from statsmodels.tools.eval_measures import rmse

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/performance warnings raised by the
# libraries used below — confirm that blanket suppression is intended.
import warnings
warnings.filterwarnings("ignore")

from tube.tools.cleaner import Cleaner
from tube.tools import dataset

In [3]:
# Root directory of the tube dataset. The historical location stays the default,
# but it can be overridden with the TUBE_DATA_DIR environment variable so the
# notebook also runs on machines other than the original author's.
DATA = os.environ.get(
    "TUBE_DATA_DIR",
    "/home/ilya/iWorkspace/research/sibur-2020/data/tube")

# CSV inputs: training features, training targets and the features to predict for.
train_features = os.path.join(DATA, "dataset", "train_features.csv")
train_targets = os.path.join(DATA, "dataset", "train_targets.csv")
test_features = os.path.join(DATA, "dataset", "test_features.csv")

# Directory with the pickled per-component models (<NAME>.lr). Defaults to
# <DATA>/models and can be overridden with TUBE_MODELS_DIR.
MODELS = os.environ.get("TUBE_MODELS_DIR", os.path.join(DATA, "models"))

In [4]:
def _read_half_hourly(csv_path):
    """Read a CSV indexed by its parsed "timestamp" column and set the index frequency to "0.5H"."""
    frame = pd.read_csv(csv_path, index_col="timestamp", parse_dates=True)
    frame.index.freq = "0.5H"
    return frame


# Training features, training targets and the features to predict for.
df_X = _read_half_hourly(train_features)
df_Y = _read_half_hourly(train_targets)
df_test = _read_half_hourly(test_features)

In [5]:
# Run the project's training-set cleaning helper on the features and targets.
# Only the first element of the returned pair is kept (df_X is rebound); the
# second element is discarded.
# NOTE(review): re-running this cell feeds the already processed df_X back into
# the helper — confirm the helper is safe to apply twice.
df_X, _ = dataset.clear_train_dataset(df_X, df_Y)

In [6]:
# Length of one prediction window in samples:
# 2 half-hour samples per hour * 24 hours * 7 days (one week).
PREDICT_DEPTH = 2 * 24 * 7

# RESULT

## iC4H10

In [146]:
# Rolling weekly prediction of B_iC4H10 over the test set.
#
# For every window of PREDICT_DEPTH test rows:
#   1. forecast the future A_<NAME> values with an AR model fitted on the
#      history collected so far,
#   2. build the lagged feature matrix from history + forecast,
#   3. predict B_<NAME> with the pickled regression model,
#   4. append the now "known" test rows to the history and clean it.
NAME = "iC4H10"

# Per-window prediction frames; concatenated in the next cell.
RESULT = []

# Number of future rows appended after the history for each window: the window
# itself plus 300 + 1 extra rows.
# NOTE(review): the margin presumably keeps the forward-shifted lag features
# below (shifts up to 299) defined for the window's last rows — confirm.
FORECAST_DEPTH = PREDICT_DEPTH + 300 + 1

df = df_X.copy()
df_for_predict = None

cleaner = Cleaner()

length = len(df_test)

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
fpath = os.path.join(MODELS, NAME + ".lr")
with open(fpath, 'rb') as fp:
    lr = pickle.load(fp)

A_val = "A_{0}".format(NAME)

# Iterate over window starts strictly below len(df_test). The previous
# `if start > length: break` check let an empty trailing window through whenever
# len(df_test) was an exact multiple of PREDICT_DEPTH, and `known.index[0]`
# then raised IndexError.
for start in range(0, length, PREDICT_DEPTH):
    end = start + PREDICT_DEPTH

    known = df_test.iloc[start:end].copy()

    start_date = known.index[0]
    end_date = known.index[-1]
    print("START [{0} ; {1} )".format(start_date, end_date))

    # Zero-filled placeholder rows covering the forecast horizon, with the same
    # columns as the history frame.
    ext_indx = pd.date_range(
        start=start_date,
        freq="0.5H",
        periods=FORECAST_DEPTH)
    df_for_predict = pd.DataFrame(
        np.zeros((FORECAST_DEPTH, len(df.columns))),
        index=ext_indx,
        columns=df.columns)
    df_for_predict.index.name = 'timestamp'

    # FORECAST

    model = AR(df[A_val])
    ARfit = model.fit(method='mle')

    forecast_start = len(df)
    forecast_end = forecast_start + FORECAST_DEPTH

    fcasts = ARfit.predict(start=forecast_start, end=forecast_end, dynamic=False)

    # Replace the (still future, hence unknown) values with the forecast.
    # The column is assigned in one positional step: rows yielded by iterrows()
    # may be copies, so writing into them is not guaranteed to reach the frame.
    # Only the first FORECAST_DEPTH forecast values are used, as before.
    df_for_predict[A_val] = fcasts.iloc[:FORECAST_DEPTH].values

    df_X_for_predict = pd.concat([df, df_for_predict])

    # PREDICT

    ds = dataset.prepare_eval_dataset(NAME, df_X_for_predict.shift(5))

    # Backward (-i) and forward (+i) shifted copies of A_<NAME>. Insertion order
    # is kept unchanged because it defines the column order fed to `lr`.
    for i in range(1, 300):
        ds["{0}-{1}".format(A_val, i)] = ds[A_val].shift(i)
        ds["{0}+{1}".format(A_val, i)] = ds[A_val].shift(-i)

    X = ds.dropna().drop(["A_rate", "B_rate"], axis=1)
    X = X.loc[start_date:end_date]

    # NOTE(review): the scaler is re-fitted on every prediction window, so the
    # features are standardised with this window's own statistics — confirm this
    # matches how `lr` was trained.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    predictions = lr.predict(X_scaled)

    # RESULT

    new_result = pd.DataFrame(predictions,
        index=X.index,
        columns=['B_' + NAME])
    new_result.index.name = 'timestamp'

    RESULT.append(new_result)

    # The window's real feature values are known from now on: extend the history.
    df = pd.concat([df, known])
    df = cleaner.clean(df)

    print("DONE [{0} ; {1} )".format(start_date, end_date))
    print()

print("DONE")



START [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )
DONE [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )

START [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )
DONE [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )

START [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )
DONE [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )

START [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )
DONE [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )

START [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )
DONE [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )

START [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )
DONE [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )

START [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )
DONE [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )

START [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )
DONE [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )

START [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )
DONE [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )

START [2020-07-03 00:00:00 ; 2020-07-09 23:30:00 )
DONE [2020-07-03 00:00:00 ; 202

In [147]:
# Stack the per-window prediction frames collected in RESULT into one frame.
# RESULT is rebound by every component section, so run this right after the
# iC4H10 loop.
iC4H10 = pd.concat(RESULT)

In [None]:
# SAVE
# NOTE(review): every component section writes to this same ./submission.csv,
# so each save overwrites the file written by the previous one.
iC4H10.to_csv("./submission.csv")

## C2H6

In [162]:
# Rolling weekly prediction of B_C2H6 over the test set.
#
# For every window of PREDICT_DEPTH test rows:
#   1. forecast the future A_<NAME>, A_rate and B_rate values with AR models
#      fitted on the history collected so far,
#   2. build the lagged feature matrix from history + forecast,
#   3. predict B_<NAME> with the pickled regression model,
#   4. append the now "known" test rows to the history and clean it.
NAME = "C2H6"

# Per-window prediction frames; concatenated in the next cell.
RESULT = []

# Number of future rows appended after the history for each window: the window
# itself plus 300 + 1 extra rows.
# NOTE(review): the margin presumably keeps the forward-shifted lag features
# below (shifts up to 299) defined for the window's last rows — confirm.
FORECAST_DEPTH = PREDICT_DEPTH + 300 + 1

df = df_X.copy()
df_for_predict = None

cleaner = Cleaner()

length = len(df_test)

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
fpath = os.path.join(MODELS, NAME + ".lr")
with open(fpath, 'rb') as fp:
    lr = pickle.load(fp)

A_val = "A_{0}".format(NAME)

# Columns whose future values are filled in from a univariate AR forecast.
forecast_columns = (A_val, 'A_rate', 'B_rate')

# Iterate over window starts strictly below len(df_test). The previous
# `if start > length: break` check let an empty trailing window through whenever
# len(df_test) was an exact multiple of PREDICT_DEPTH, and `known.index[0]`
# then raised IndexError.
for start in range(0, length, PREDICT_DEPTH):
    end = start + PREDICT_DEPTH

    known = df_test.iloc[start:end].copy()

    start_date = known.index[0]
    end_date = known.index[-1]
    print("START [{0} ; {1} )".format(start_date, end_date))

    # Zero-filled placeholder rows covering the forecast horizon, with the same
    # columns as the history frame.
    ext_indx = pd.date_range(
        start=start_date,
        freq="0.5H",
        periods=FORECAST_DEPTH)
    df_for_predict = pd.DataFrame(
        np.zeros((FORECAST_DEPTH, len(df.columns))),
        index=ext_indx,
        columns=df.columns)
    df_for_predict.index.name = 'timestamp'

    # FORECAST

    forecast_start = len(df)
    forecast_end = forecast_start + FORECAST_DEPTH

    for forecast_column in forecast_columns:
        model = AR(df[forecast_column])
        ARfit = model.fit(method='mle')

        fcasts = ARfit.predict(start=forecast_start, end=forecast_end, dynamic=False)

        # Replace the (still future, hence unknown) values with the forecast.
        # The column is assigned in one positional step: rows yielded by
        # iterrows() may be copies, so writing into them is not guaranteed to
        # reach the frame. Only the first FORECAST_DEPTH values are used, as before.
        df_for_predict[forecast_column] = fcasts.iloc[:FORECAST_DEPTH].values

    df_X_for_predict = pd.concat([df, df_for_predict])

    # PREDICT

    ds = dataset.prepare_eval_dataset(NAME, df_X_for_predict.shift(10))

    # Backward (-i) and forward (+i) shifted copies of A_<NAME>. Insertion order
    # is kept unchanged because it defines the column order fed to `lr`.
    for i in range(170, 300):
        ds["{0}-{1}".format(A_val, i)] = ds[A_val].shift(i)
        ds["{0}+{1}".format(A_val, i)] = ds[A_val].shift(-i)

    # Unlike the other component sections, A_rate / B_rate stay in the features.
    X = ds.dropna()
    X = X.loc[start_date:end_date]

    # NOTE(review): the scaler is re-fitted on every prediction window, so the
    # features are standardised with this window's own statistics — confirm this
    # matches how `lr` was trained.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    predictions = lr.predict(X_scaled)

    # RESULT

    new_result = pd.DataFrame(predictions,
        index=X.index,
        columns=['B_' + NAME])
    new_result.index.name = 'timestamp'

    RESULT.append(new_result)

    # The window's real feature values are known from now on: extend the history.
    df = pd.concat([df, known])
    df = cleaner.clean(df)

    print("DONE [{0} ; {1} )".format(start_date, end_date))
    print()

print("DONE")



START [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )
DONE [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )

START [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )
DONE [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )

START [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )
DONE [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )

START [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )
DONE [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )

START [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )
DONE [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )

START [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )
DONE [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )

START [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )
DONE [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )

START [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )
DONE [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )

START [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )
DONE [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )

START [2020-07-03 00:00:00 ; 2020-07-09 23:30:00 )
DONE [2020-07-03 00:00:00 ; 202

In [163]:
# Stack the per-window prediction frames collected in RESULT into one frame.
# RESULT is rebound by every component section, so run this right after the
# C2H6 loop.
C2H6 = pd.concat(RESULT)

In [164]:
# SAVE
# NOTE(review): every component section writes to this same ./submission.csv,
# so each save overwrites the file written by the previous one.
C2H6.to_csv("./submission.csv")

## C3H8

In [171]:
# Rolling weekly prediction of B_C3H8 over the test set.
#
# For every window of PREDICT_DEPTH test rows:
#   1. forecast the future A_<NAME> values with an AR model fitted on the
#      history collected so far,
#   2. build the lagged feature matrix from history + forecast,
#   3. predict B_<NAME> with the pickled regression model,
#   4. append the now "known" test rows to the history and clean it.
NAME = "C3H8"

# Per-window prediction frames; concatenated in the next cell.
RESULT = []

# Number of future rows appended after the history for each window: the window
# itself plus 20 extra rows.
FORECAST_DEPTH = PREDICT_DEPTH + 20

df = df_X.copy()
df_for_predict = None

cleaner = Cleaner()

length = len(df_test)

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
fpath = os.path.join(MODELS, NAME + ".lr")
with open(fpath, 'rb') as fp:
    lr = pickle.load(fp)

A_val = "A_{0}".format(NAME)

# Iterate over window starts strictly below len(df_test). The previous
# `if start > length: break` check let an empty trailing window through whenever
# len(df_test) was an exact multiple of PREDICT_DEPTH, and `known.index[0]`
# then raised IndexError.
for start in range(0, length, PREDICT_DEPTH):
    end = start + PREDICT_DEPTH

    known = df_test.iloc[start:end].copy()

    start_date = known.index[0]
    end_date = known.index[-1]
    print("START [{0} ; {1} )".format(start_date, end_date))

    # Zero-filled placeholder rows covering the forecast horizon, with the same
    # columns as the history frame.
    ext_indx = pd.date_range(
        start=start_date,
        freq="0.5H",
        periods=FORECAST_DEPTH)
    df_for_predict = pd.DataFrame(
        np.zeros((FORECAST_DEPTH, len(df.columns))),
        index=ext_indx,
        columns=df.columns)
    df_for_predict.index.name = 'timestamp'

    # FORECAST

    model = AR(df[A_val])
    ARfit = model.fit(method='mle')

    forecast_start = len(df)
    forecast_end = forecast_start + FORECAST_DEPTH

    fcasts = ARfit.predict(start=forecast_start, end=forecast_end, dynamic=False)

    # Replace the (still future, hence unknown) values with the forecast.
    # The column is assigned in one positional step: rows yielded by iterrows()
    # may be copies, so writing into them is not guaranteed to reach the frame.
    # Only the first FORECAST_DEPTH forecast values are used, as before.
    df_for_predict[A_val] = fcasts.iloc[:FORECAST_DEPTH].values

    df_X_for_predict = pd.concat([df, df_for_predict])

    # PREDICT

    ds = dataset.prepare_eval_dataset(NAME, df_X_for_predict.shift(55))

    # Only backward (-i) shifted copies of A_<NAME> are used for this component.
    # Insertion order is kept unchanged because it defines the column order fed
    # to `lr`.
    for i in range(45, 100):
        ds["{0}-{1}".format(A_val, i)] = ds[A_val].shift(i)

    X = ds.dropna().drop(["A_rate", "B_rate"], axis=1)
    X = X.loc[start_date:end_date]

    # NOTE(review): the scaler is re-fitted on every prediction window, so the
    # features are standardised with this window's own statistics — confirm this
    # matches how `lr` was trained.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    predictions = lr.predict(X_scaled)

    # RESULT

    new_result = pd.DataFrame(predictions,
        index=X.index,
        columns=['B_' + NAME])
    new_result.index.name = 'timestamp'

    RESULT.append(new_result)

    # The window's real feature values are known from now on: extend the history.
    df = pd.concat([df, known])
    df = cleaner.clean(df)

    print("DONE [{0} ; {1} )".format(start_date, end_date))
    print()

print("DONE")



START [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )
DONE [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )

START [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )
DONE [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )

START [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )
DONE [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )

START [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )
DONE [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )

START [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )
DONE [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )

START [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )
DONE [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )

START [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )
DONE [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )

START [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )
DONE [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )

START [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )
DONE [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )

START [2020-07-03 00:00:00 ; 2020-07-09 23:30:00 )
DONE [2020-07-03 00:00:00 ; 202

In [172]:
# Stack the per-window prediction frames collected in RESULT into one frame.
# RESULT is rebound by every component section, so run this right after the
# C3H8 loop.
C3H8 = pd.concat(RESULT)

In [173]:
# SAVE
# NOTE(review): every component section writes to this same ./submission.csv,
# so each save overwrites the file written by the previous one.
C3H8.to_csv("./submission.csv")

## nC4H10

In [181]:
# Rolling weekly prediction of B_nC4H10 over the test set.
#
# For every window of PREDICT_DEPTH test rows:
#   1. forecast the future A_<NAME> values with an AR model fitted on the
#      history collected so far,
#   2. build the lagged feature matrix from history + forecast,
#   3. predict B_<NAME> with the pickled regression model,
#   4. append the now "known" test rows to the history and clean it.
NAME = "nC4H10"

# Per-window prediction frames; concatenated in the next cell.
RESULT = []

# Number of future rows appended after the history for each window: the window
# itself plus 350 + 1 extra rows.
# NOTE(review): the margin presumably keeps the forward-shifted lag features
# below (shifts up to 349) defined for the window's last rows — confirm.
FORECAST_DEPTH = PREDICT_DEPTH + 350 + 1

df = df_X.copy()
df_for_predict = None

cleaner = Cleaner()

length = len(df_test)

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
fpath = os.path.join(MODELS, NAME + ".lr")
with open(fpath, 'rb') as fp:
    lr = pickle.load(fp)

A_val = "A_{0}".format(NAME)

# Iterate over window starts strictly below len(df_test). The previous
# `if start > length: break` check let an empty trailing window through whenever
# len(df_test) was an exact multiple of PREDICT_DEPTH, and `known.index[0]`
# then raised IndexError.
for start in range(0, length, PREDICT_DEPTH):
    end = start + PREDICT_DEPTH

    known = df_test.iloc[start:end].copy()

    start_date = known.index[0]
    end_date = known.index[-1]
    print("START [{0} ; {1} )".format(start_date, end_date))

    # Zero-filled placeholder rows covering the forecast horizon, with the same
    # columns as the history frame.
    ext_indx = pd.date_range(
        start=start_date,
        freq="0.5H",
        periods=FORECAST_DEPTH)
    df_for_predict = pd.DataFrame(
        np.zeros((FORECAST_DEPTH, len(df.columns))),
        index=ext_indx,
        columns=df.columns)
    df_for_predict.index.name = 'timestamp'

    # FORECAST

    model = AR(df[A_val])
    ARfit = model.fit(method='mle')

    forecast_start = len(df)
    forecast_end = forecast_start + FORECAST_DEPTH

    fcasts = ARfit.predict(start=forecast_start, end=forecast_end, dynamic=False)

    # Replace the (still future, hence unknown) values with the forecast.
    # The column is assigned in one positional step: rows yielded by iterrows()
    # may be copies, so writing into them is not guaranteed to reach the frame.
    # Only the first FORECAST_DEPTH forecast values are used, as before.
    df_for_predict[A_val] = fcasts.iloc[:FORECAST_DEPTH].values

    df_X_for_predict = pd.concat([df, df_for_predict])

    # PREDICT

    ds = dataset.prepare_eval_dataset(NAME, df_X_for_predict)

    # Backward (-i) and forward (+i) shifted copies of A_<NAME>. Insertion order
    # is kept unchanged because it defines the column order fed to `lr`.
    for i in range(1, 350):
        ds["{0}-{1}".format(A_val, i)] = ds[A_val].shift(i)
        ds["{0}+{1}".format(A_val, i)] = ds[A_val].shift(-i)

    X = ds.dropna().drop(["A_rate", "B_rate"], axis=1)
    X = X.loc[start_date:end_date]

    # NOTE(review): the scaler is re-fitted on every prediction window, so the
    # features are standardised with this window's own statistics — confirm this
    # matches how `lr` was trained.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    predictions = lr.predict(X_scaled)

    # RESULT

    new_result = pd.DataFrame(predictions,
        index=X.index,
        columns=['B_' + NAME])
    new_result.index.name = 'timestamp'

    RESULT.append(new_result)

    # The window's real feature values are known from now on: extend the history.
    df = pd.concat([df, known])
    df = cleaner.clean(df)

    print("DONE [{0} ; {1} )".format(start_date, end_date))
    print()

print("DONE")



START [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )
DONE [2020-05-01 00:00:00 ; 2020-05-07 23:30:00 )

START [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )
DONE [2020-05-08 00:00:00 ; 2020-05-14 23:30:00 )

START [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )
DONE [2020-05-15 00:00:00 ; 2020-05-21 23:30:00 )

START [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )
DONE [2020-05-22 00:00:00 ; 2020-05-28 23:30:00 )

START [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )
DONE [2020-05-29 00:00:00 ; 2020-06-04 23:30:00 )

START [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )
DONE [2020-06-05 00:00:00 ; 2020-06-11 23:30:00 )

START [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )
DONE [2020-06-12 00:00:00 ; 2020-06-18 23:30:00 )

START [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )
DONE [2020-06-19 00:00:00 ; 2020-06-25 23:30:00 )

START [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )
DONE [2020-06-26 00:00:00 ; 2020-07-02 23:30:00 )

START [2020-07-03 00:00:00 ; 2020-07-09 23:30:00 )
DONE [2020-07-03 00:00:00 ; 202

In [182]:
# Stack the per-window prediction frames collected in RESULT into one frame.
# RESULT is rebound by every component section, so run this right after the
# nC4H10 loop.
nC4H10 = pd.concat(RESULT)

In [183]:
# SAVE
# NOTE(review): every component section writes to this same ./submission.csv,
# so each save overwrites the file written by the previous one.
nC4H10.to_csv("./submission.csv")

# RESULT

In [184]:
# Join the four per-component prediction frames side by side (one B_<NAME>
# column each), aligned on their timestamp index.
# The list is passed inline instead of being bound to a variable named `all`,
# which would shadow the built-in all() for the rest of the kernel session.
result = pd.concat([iC4H10, C2H6, C3H8, nC4H10], axis=1)

In [185]:
# Write the combined per-component predictions to ./result.csv (relative to the
# current working directory).
result.to_csv("./result.csv")