In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd

In [2]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [3]:
df_50 = pd.read_csv('data_50_final.csv')
df_60 = pd.read_csv('data_60_final.csv')

In [4]:
pred_idx = df_50[df_50.day_diff == 17].index    # indices AFTER which we have to predict
pred_idx

Int64Index([  105,   211,   317,   423,   529,   635,   741,   847,   953,
             1059,
            ...
            16535, 16641, 16747, 16853, 16959, 17065, 17171, 17277, 17383,
            17489],
           dtype='int64', length=165)

In [5]:
def train_interval_data(df, data_idx, win):
    xdata, ydata = [], []

    for idx in range(data_idx-90+1+win,data_idx+1):
    
        y_out = df.water_level.iloc[idx]#.to_numpy()
        x_ip = df[['discharge', 'incoming', 'RS', 'NSH_TAG']].iloc[idx].to_numpy()
        # y_ip = y[idx-win:idx]
        y_ip = df.water_level.iloc[idx-win:idx].to_numpy()

        all_ip = np.append(x_ip, y_ip)
        xdata.append(all_ip)
        ydata.append(y_out)
    # print(y_out)

    return xdata, ydata

def train_data(df, win=5):
    xtrain, ytrain = [], []
    for index in pred_idx:
        xtr, ytr = train_interval_data(df, index, win)
        xtrain.append(xtr)
        ytrain.append(ytr)

    return np.array(xtrain), np.array(ytrain)

In [46]:
XT, YT = train_data(df_50, win=20)

x_train = XT.reshape(-1, XT.shape[-1])
y_train = YT.reshape(-1)

In [8]:
def test_parser(df, model, win=5):
    test_pred = []
    diff_pred = []
    for index in pred_idx:
        y_ip = df.water_level.iloc[index-win+1:index+1].to_numpy() # For the 1st day
        y0 = df.water_level.iloc[index]  # Value with which to compare
        for days in range(3):
            x_ip = df[['discharge', 'incoming', 'RS', 'NSH_TAG']].iloc[index+days].to_numpy()
            all_ip = np.append(x_ip, y_ip).reshape(1,-1)
            
            y_out = model.predict( all_ip )
            test_pred.append(y_out.flatten())

            diff_pred.append(y_out.flatten()[0] - y0)
            y_ip = np.append(y_ip, y_out)[1:]     # updating with new y_out and removing the 1st element
  
    return diff_pred

# predictions = test_parser(df_50 ,linear_m, win=10)

def write_output(fname, predictions):
    with open(fname, 'a+') as f:
        for pred in predictions:
            f.write(str(pred)+'\n')

In [39]:
y_train.shape

(13200,)

In [18]:
linear_m.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [19]:
y_pred = linear_m.predict(x_train)
y_pred

array([364.03925861, 355.53626534, 346.40730781, ..., 338.44422002,
       317.62901406, 297.1658992 ])

In [24]:
y_pred[:12]

array([364.03925861, 355.53626534, 346.40730781, 344.13185678,
       333.37262391, 375.40591484, 488.34653811, 539.04736554,
       501.54020659, 464.62889459, 440.29267391, 410.73790201])

In [29]:
y_train[:12]

array([368., 359., 353., 334., 331., 416., 516., 536., 506., 473., 441.,
       416.])

4400.0

In [22]:
def get_values(y):
    y1, y2, y3 = [], [], []
    for i, y in enumerate(y):
        if i%3 == 0:
            y1.append(y)
        elif i%3 == 1:
            y2.append(y)
        else:
            y3.append(y)

    y1 = np.array(y1)
    y2 = np.array(y2)       
    y3 = np.array(y3)       

    return y1, y2, y3

In [41]:
def find_R2(Ypred, Yact):
    
    Y_mean = sum(Yact) / len(Yact)    
    y_a1, y_a2, y_a3 = get_values(Yact)
    
    y_p1, y_p2, y_p3 = get_values(Ypred)  
    
    Nr = sum((y_a1-y_p1)**2) + sum((y_a2-y_p2)**2) + sum((y_a2-y_p2)**2) 
    Dr = sum((y_a1-Y_mean)**2) + sum((y_a2-Y_mean)**2) + sum((y_a2-Y_mean)**2)     
    
    r2 = 1 - Nr/Dr
    
    return r2

In [42]:
find_R2(y_pred, y_train)

0.995060809672045

**Validated model**

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_val, Y_train, Y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42)

linear_m.fit(X_train,Y_train)

y_pred = linear_m.predict(X_val)

find_R2(y_pred, Y_val)

0.995787608528998

In [49]:
X_train.shape

(9240, 24)

In [36]:
a1 = np.arange(10)
a2 = np.arange(10)
a3 = np.arange(10)

avg = (a1+a2+a3)/3
avg

array([0., 1., 2., 3., 4., 5., 6., 7., 8., 9.])

## 1. SVR

In [None]:
from sklearn import svm
svr = svm.SVR()

## 2. Linear

In [1]:
from sklearn import linear_model
linear_m = linear_model.LinearRegression()

In [86]:
from sklearn.linear_model import PassiveAggressiveRegressor
PAR = PassiveAggressiveRegressor(max_iter=100, random_state=0, tol=1e-3)

from sklearn.linear_model import TheilSenRegressor
TSR = TheilSenRegressor(random_state=0)

## 3. Ensemble

In [105]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

reg1 = GradientBoostingRegressor(random_state=1, n_estimators=100)
reg2 = RandomForestRegressor(random_state=1, n_estimators=100)
reg3 = LinearRegression()
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])

## 4. Probabilistic

In [75]:
from sklearn import linear_model
bayes = linear_model.BayesianRidge()

In [94]:
def whole_routine(df, model, fname, win= 10):
    XT, YT = train_data(df, win)

    x_train = XT.reshape(-1, XT.shape[-1])
    y_train = YT.reshape(-1)
    
    # Initiate model class only
    
    model.fit(x_train, y_train) 
    predictions = test_parser(df ,model, win) 
    
    write_output(fname, predictions)

In [114]:
whole_routine(df_50, linear_m, 'linear_m_5.csv', 5)

In [115]:
whole_routine(df_60, linear_m, 'linear_m_5.csv', 5)

In [122]:
whole_routine(df_50, reg1, 'reg2_5.csv', 5)

In [123]:
whole_routine(df_60, reg2, 'reg2_5.csv', 5)

In [124]:
whole_routine(df_50, bayes, 'bayes_5.csv', 5)

In [125]:
whole_routine(df_60, bayes, 'bayes_5.csv', 5)

In [118]:
whole_routine(df_50, PAR, 'PAR_5.csv', 5)

In [119]:
whole_routine(df_60, PAR, 'PAR_5.csv', 5)

In [120]:
whole_routine(df_50, TSR, 'TSR_5.csv', 5)

In [121]:
whole_routine(df_60, TSR, 'TSR_5.csv', 5)