## Importing libraries

In [617]:
# loading data
import urllib.request

# data manipulation
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
# enable interactive plot in the notebook
%matplotlib notebook

# machine learning methods
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

## Loading and formatting the data

In [708]:
# API URL to underground water levels in Ljubljana aquifer
# NOTE(review): this comment used to name station 85076
# (1279238400Lj - RTV (0261) Ljubljansko polje), but the request below asks for
# station_id=85012 -- confirm which station is intended.
url = "http://atena.ijs.si:8080/CollectorAPIServer/undergroundWater?station_id=85012"
# the context manager closes the HTTP response as soon as the body has been read
with urllib.request.urlopen(url) as response:
    jsonStr = response.read().decode('utf-8')
df = pd.read_json(jsonStr)

# converting unix timestamp (milliseconds) to date-time object
df['Date'] = pd.to_datetime(df['LastUpdatedEpoch'], unit='ms')

# remove unnecessary fields
# (named via `columns=`: passing the axis positionally, as in drop('x', 1), is rejected by pandas 2.x)
df = df.drop(columns=['LastUpdated', 'LastUpdatedEpoch', 'Region_id', 'Region_name',
                      'Station_id', 'Station_name', 'SystemCodeNumber'])

## Checking loaded data

In [709]:
# string/tabular view
# len(jsonStr), jsonStr[0:100] and df[0:5]
df[0:5]

Unnamed: 0,Value,Date
0,289.27,2001-01-01
1,289.24,2001-01-02
2,289.22,2001-01-03
3,289.29,2001-01-04
4,289.25,2001-01-05


In [710]:
# plotting the data
fig, ax = plt.subplots()
# plot the groundwater level against the measurement date
ax.plot(df['Date'], df['Value'])
# label the axes so the figure can be read on its own
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level (m)')
# format date axis (applied once the date ticks exist, i.e. after plotting)
fig.autofmt_xdate()
# show plot
plt.show()

<IPython.core.display.Javascript object>

## Loading some additional data for modeling (weather)

In [635]:
# API URL to weather data for the city of Ljubljana,
# requested for the window 31/12/2014 - 01/01/2017
url = "http://atena.ijs.si:8080/CollectorAPIServer/weather?time_from=31/12/2014&time_to=01/01/2017&city=Ljubljana"
# the context manager closes the HTTP response as soon as the body has been read
with urllib.request.urlopen(url) as response:
    jsonStr = response.read().decode('utf-8')
dw = pd.read_json(jsonStr)

# converting unix timestamp (milliseconds) to date-time object
# NOTE(review): two hours (in ms) are added before rounding to whole days -- presumably a
# timezone correction so that each reading lands on its calendar day; confirm against the API.
dw['Date'] = pd.to_datetime(dw['LastUpdatedEpoch'] + 2 * 60 * 60 * 1000, unit='ms').dt.round("1d")
dw = dw.set_index('Date')

# remove unnecessary fields
# (named via `columns=`: passing the axis positionally, as in drop('x', 1), is rejected by pandas 2.x)
dw = dw.drop(columns=['City', 'LastUpdated', 'LastUpdatedEpoch',
                      'Sensor_id', 'Sensor_name', 'SystemCodeNumber'])


In [636]:
dw[0:5]

Unnamed: 0_level_0,CloudCover,New_snow_blanket,Percipitation,Snow_blanket,Sun_duration,TemperatureAvg,TemperatureMax,TemperatureMin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2015-01-01,70,0,0.0,13,2.1,-5.8,-3.0,-9.6
2015-01-02,83,0,0.0,12,2.1,-0.6,1.5,-5.5
2015-01-03,80,0,0.0,10,0.0,-1.0,0.6,-4.5
2015-01-04,20,0,0.2,8,7.9,3.1,10.3,-0.5
2015-01-05,50,0,0.0,4,5.9,3.0,8.5,0.0


## Data Fusion

In [711]:
# use the measurement date as the index (key) of the groundwater frame
df = df.set_index('Date')

In [712]:
# keep only the rows dated 2014-12-31 through 2017-01-01, the same window that the
# weather request uses (time_from=31/12/2014, time_to=01/01/2017)
df = df.loc['2014-12-31':'2017-01-01']

In [713]:
df[0:5]

Unnamed: 0_level_0,Value
Date,Unnamed: 1_level_1
2014-12-31,288.2
2015-01-01,288.18
2015-01-02,288.16
2015-01-03,288.13
2015-01-04,288.12


In [714]:
# generate fused dataset
# column-wise concatenation: the groundwater frame (df) and the weather frame (dw)
# are both indexed by Date, so their rows are aligned on that index
ds = pd.concat([df, dw], axis=1);

In [715]:
# let's check it
ds[-10:]

Unnamed: 0_level_0,Value,CloudCover,New_snow_blanket,Percipitation,Snow_blanket,Sun_duration,TemperatureAvg,TemperatureMax,TemperatureMin
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-12-22,288.18,100.0,0.0,0.0,0.0,0.0,-3.3,-2.7,-4.1
2016-12-23,288.2,50.0,0.0,0.0,0.0,2.0,-3.3,-0.8,-4.0
2016-12-24,288.17,83.0,0.0,0.0,0.0,0.5,-2.5,0.5,-5.5
2016-12-25,288.14,97.0,0.0,0.0,0.0,0.0,-2.3,0.3,-3.7
2016-12-26,288.09,63.0,0.0,0.0,0.0,5.5,0.3,6.8,-3.5
2016-12-27,288.07,30.0,0.0,0.0,0.0,6.8,3.6,8.9,-1.2
2016-12-28,288.05,70.0,0.0,0.0,0.0,6.2,5.3,13.1,-1.0
2016-12-29,288.07,13.0,0.0,0.1,0.0,8.1,-0.5,4.2,-2.4
2016-12-30,288.07,0.0,0.0,0.0,0.0,5.7,-2.9,2.0,-6.6
2016-12-31,288.13,3.0,0.0,0.0,0.0,7.5,-3.5,3.0,-7.6


## Feature engineering: daily change, shifted values and rolling averages

In [716]:
# day-over-day change of the groundwater level; the first row has no predecessor, so it is NaN
ds['DValue'] = ds['Value'].diff()

In [717]:
#dnew = ds
#for i in range(26):
#    dnew['Sun_duration' + '_shift_' + str(i)+'d'] = ds['Sun_duration'].shift(i)
#for i in range(50,101,10):
#    dnew['Sun_duration' + '_shift_' + str(i)+'d'] = ds['Sun_duration'].shift(i)

In [718]:
#shift of data for n days
def shift_of_data(string, data, new_data, lags=None):
    """Add lagged copies of one column of ``data`` to ``new_data``.

    For every lag ``n`` a column named ``<string>_shift_<n>d`` is written into
    ``new_data`` (modified in place); it holds ``data[string]`` shifted down by
    ``n`` rows, so the first ``n`` entries are NaN.

    Parameters
    ----------
    string : str
        Name of the source column in ``data``.
    data : pandas.DataFrame
        Frame the source column is read from.
    new_data : pandas.DataFrame
        Frame that receives the new columns.
    lags : iterable of int, optional
        Row offsets to generate. Defaults to 0..25 followed by 30, 40, ..., 100,
        which is the set this function has always produced.

    Returns
    -------
    None
    """
    if lags is None:
        lags = list(range(26)) + list(range(30, 101, 10))
    # look the source column up once instead of once per lag
    column = data[string]
    for lag in lags:
        new_data[string + '_shift_' + str(lag) + 'd'] = column.shift(lag)

In [719]:
import copy  # kept here: a later cell also calls copy.deepcopy

# weather columns from which shifted features are derived
data_names = ['CloudCover', 'New_snow_blanket', 'Percipitation','Snow_blanket',
              'Sun_duration', 'TemperatureAvg', 'TemperatureMax', 'TemperatureMin']

# work on an independent deep copy so that ds itself does not receive the derived columns
dnew = ds.copy(deep=True)
for string in data_names:
    shift_of_data(string, ds, dnew)

In [720]:
dnew

Unnamed: 0_level_0,Value,CloudCover,New_snow_blanket,Percipitation,Snow_blanket,Sun_duration,TemperatureAvg,TemperatureMax,TemperatureMin,DValue,...,TemperatureMin_shift_24d,TemperatureMin_shift_25d,TemperatureMin_shift_30d,TemperatureMin_shift_40d,TemperatureMin_shift_50d,TemperatureMin_shift_60d,TemperatureMin_shift_70d,TemperatureMin_shift_80d,TemperatureMin_shift_90d,TemperatureMin_shift_100d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-12-31,288.2,,,,,,,,,,...,,,,,,,,,,
2015-01-01,288.18,70.0,0.0,0.0,13.0,2.1,-5.8,-3.0,-9.6,-0.02,...,,,,,,,,,,
2015-01-02,288.16,83.0,0.0,0.0,12.0,2.1,-0.6,1.5,-5.5,-0.02,...,,,,,,,,,,
2015-01-03,288.13,80.0,0.0,0.0,10.0,0.0,-1.0,0.6,-4.5,-0.03,...,,,,,,,,,,
2015-01-04,288.12,20.0,0.0,0.2,8.0,7.9,3.1,10.3,-0.5,-0.01,...,,,,,,,,,,
2015-01-05,288.13,50.0,0.0,0.0,4.0,5.9,3.0,8.5,0.0,0.01,...,,,,,,,,,,
2015-01-06,288.12,33.0,0.0,0.0,3.0,4.4,-0.4,4.9,-3.6,-0.01,...,,,,,,,,,,
2015-01-07,288.13,50.0,0.0,0.0,3.0,1.1,-0.9,2.7,-2.7,0.01,...,,,,,,,,,,
2015-01-08,288.11,83.0,0.0,0.0,3.0,0.2,0.2,2.3,-4.1,-0.02,...,,,,,,,,,,
2015-01-09,288.1,90.0,0.0,0.0,2.0,0.6,4.9,7.5,1.0,-0.01,...,,,,,,,,,,


In [721]:
dnew.corr()

Unnamed: 0,Value,CloudCover,New_snow_blanket,Percipitation,Snow_blanket,Sun_duration,TemperatureAvg,TemperatureMax,TemperatureMin,DValue,...,TemperatureMin_shift_24d,TemperatureMin_shift_25d,TemperatureMin_shift_30d,TemperatureMin_shift_40d,TemperatureMin_shift_50d,TemperatureMin_shift_60d,TemperatureMin_shift_70d,TemperatureMin_shift_80d,TemperatureMin_shift_90d,TemperatureMin_shift_100d
Value,1.0,0.167822,-0.04442,0.349605,-0.151059,-0.12005,-0.057861,-0.074827,-0.019444,0.22688,...,-0.092661,-0.091242,-0.083809,-0.14344,-0.154921,-0.108603,-0.062075,-0.126568,-0.092199,-0.091713
CloudCover,0.167822,1.0,0.122452,0.219788,0.162879,-0.83274,-0.31504,-0.429054,-0.138423,0.088799,...,-0.173422,-0.166982,-0.119626,-0.11279,-0.065826,-0.011018,0.005857,0.032737,0.060471,0.078929
New_snow_blanket,-0.04442,0.122452,1.0,0.182088,0.518093,-0.111978,-0.162496,-0.171834,-0.143569,-0.006344,...,-0.139878,-0.128508,-0.123973,-0.103793,-0.059714,-0.067943,-0.050597,-0.022405,0.000624,-0.030393
Percipitation,0.349605,0.219788,0.182088,1.0,0.029835,-0.178188,-0.010982,-0.044188,0.062845,0.478686,...,0.053165,0.021458,0.015628,0.028124,0.040628,0.011282,0.010889,-0.026575,0.007282,-0.050968
Snow_blanket,-0.151059,0.162879,0.518093,0.029835,1.0,-0.178759,-0.305313,-0.317375,-0.294889,-0.004379,...,-0.197011,-0.201968,-0.231135,-0.232043,-0.065711,-0.087937,-0.050658,-0.043067,0.028343,-0.007481
Sun_duration,-0.12005,-0.83274,-0.111978,-0.178188,-0.178759,1.0,0.596189,0.683709,0.407157,-0.091099,...,0.356407,0.348565,0.285265,0.21985,0.155838,0.056702,-0.025394,-0.114992,-0.196478,-0.267138
TemperatureAvg,-0.057861,-0.31504,-0.162496,-0.010982,-0.305313,0.596189,1.0,0.979318,0.957507,0.013542,...,0.714557,0.706291,0.643605,0.588929,0.433107,0.329767,0.167204,0.033856,-0.164478,-0.289784
TemperatureMax,-0.074827,-0.429054,-0.171834,-0.044188,-0.317375,0.683709,0.979318,1.0,0.906927,-0.002506,...,0.695485,0.688403,0.623542,0.568491,0.421246,0.304561,0.156162,0.020409,-0.160841,-0.291403
TemperatureMin,-0.019444,-0.138423,-0.143569,0.062845,-0.294889,0.407157,0.957507,0.906927,1.0,0.050005,...,0.717036,0.71313,0.6652,0.622597,0.462529,0.380717,0.229227,0.091839,-0.1134,-0.232108
DValue,0.22688,0.088799,-0.006344,0.478686,-0.004379,-0.091099,0.013542,-0.002506,0.050005,1.0,...,0.00016,0.005039,-0.006214,0.004523,0.019728,0.021107,0.00077,-0.000258,0.014908,-0.011631


In [722]:
#correlation for shifts
#plt.matshow(dnew.corr())

In [723]:
#average for last n days
def average_last_n_days(string, data, new_data, windows=None):
    """Add trailing rolling averages of one column of ``data`` to ``new_data``.

    For every window length ``n`` a column named ``<string>_average_<n>d`` is
    written into ``new_data`` (modified in place); it holds the mean of the
    current row and the ``n - 1`` rows before it. Rows that do not yet have
    ``n`` observations are NaN.

    Parameters
    ----------
    string : str
        Name of the source column in ``data``.
    data : pandas.DataFrame
        Frame the source column is read from.
    new_data : pandas.DataFrame
        Frame that receives the new columns.
    windows : iterable of int, optional
        Window lengths in rows. Defaults to 1..25 followed by 30, 40, ..., 100.
        The default starts at 1: a window of 0 rows has no average (it used to
        be computed as a sum divided by zero, producing a meaningless column).

    Raises
    ------
    ValueError
        If a window length is smaller than 1.
    """
    if windows is None:
        windows = list(range(1, 26)) + list(range(30, 101, 10))
    # look the source column up once instead of once per window
    column = data[string]
    for window in windows:
        if window < 1:
            raise ValueError("window length must be at least 1, got %r" % (window,))
        new_data[string + '_average_' + str(window) + 'd'] = column.rolling(window).mean()

In [724]:
#for i in range(26):
#    dnew['Sun_duration' + '_sum_' + str(i)+'d'] = ds['Sun_duration'].rolling(i).sum()

In [725]:
#data_names = ['CloudCover', 'New_snow_blanket', 'Percipitation','Snow_blanket',
#              'Sun_duration', 'TemperatureAvg', 'TemperatureMax', 'TemperatureMin']

#dnew2 = copy.deepcopy(ds)
# add the rolling-average columns for every weather column; they are written into dnew,
# the same frame that already received the shifted columns
for string in data_names:
    average_last_n_days(string, ds, dnew)  

In [726]:
#dnew2.corr()

In [727]:
# correlation for shifts and average
plt.matshow(dnew.corr())

<IPython.core.display.Javascript object>

<matplotlib.image.AxesImage at 0x1a236e5748>

In [728]:
dnew.corr()['DValue']

Value                            0.226880
CloudCover                       0.088799
New_snow_blanket                -0.006344
Percipitation                    0.478686
Snow_blanket                    -0.004379
Sun_duration                    -0.091099
TemperatureAvg                   0.013542
TemperatureMax                  -0.002506
TemperatureMin                   0.050005
DValue                           1.000000
CloudCover_shift_0d              0.088799
CloudCover_shift_1d              0.194654
CloudCover_shift_2d              0.081718
CloudCover_shift_3d             -0.054066
CloudCover_shift_4d             -0.049916
CloudCover_shift_5d             -0.023459
CloudCover_shift_6d             -0.038149
CloudCover_shift_7d             -0.004832
CloudCover_shift_8d              0.000362
CloudCover_shift_9d             -0.002000
CloudCover_shift_10d            -0.010688
CloudCover_shift_11d            -0.001660
CloudCover_shift_12d            -0.008034
CloudCover_shift_13d             0

In [729]:
# order the rows of the correlation matrix by their correlation with 'DValue',
# from the largest value down to the smallest
dsort = dnew.corr().sort_values(by='DValue', ascending=False)

In [730]:
pd.set_option('display.max_rows', 1000)
dsort['DValue']

DValue                           1.000000
Percipitation_average_1d         0.478686
Percipitation                    0.478686
Percipitation_shift_0d           0.478686
Percipitation_average_2d         0.338331
Value                            0.226880
CloudCover_shift_1d              0.194654
Percipitation_average_3d         0.188273
CloudCover_average_2d            0.161681
CloudCover_average_3d            0.151894
New_snow_blanket_shift_9d        0.139826
CloudCover_average_4d            0.104281
Percipitation_average_4d         0.095581
Percipitation_shift_50d          0.095539
CloudCover_shift_0d              0.088799
CloudCover                       0.088799
CloudCover_average_1d            0.088799
CloudCover_shift_2d              0.081718
Snow_blanket_shift_4d            0.075783
CloudCover_average_5d            0.074084
Snow_blanket_shift_3d            0.068422
TemperatureMin_shift_1d          0.061639
CloudCover_average_6d            0.058858
New_snow_blanket_average_10d     0

In [731]:
# plot the groundwater change together with two correlated weather quantities
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
# the x axis is the row (day) number, taken from the data instead of a hard-coded 732;
# date tick formatting is not applied because this axis holds integers, not dates
days = range(len(dnew))
series_to_plot = [
    ('DValue', "Groundwater change"),
    ('Percipitation_average_6d', "Precipitation avg. (6d)"),
    ('CloudCover_average_9d', "Cloud cover avg. (9d)"),
]
for column, label in series_to_plot:
    # divide by the largest absolute value so every series lies within [-1, 1]
    ax.plot(days, dnew[column] / dnew[column].abs().max(), label=label)
ax.legend(loc=4, borderaxespad=1)
ax.set_xlabel('Time (days)')
ax.set_ylabel('Normalized value')
plt.show()

<IPython.core.display.Javascript object>

In [732]:
fig.savefig('correlations.png', dpi=300)

In [658]:
# create a new data frame holding 'DValue' plus the chosen correlated quantities
data_corelated = pd.DataFrame()
# names of the feature columns copied from dnew
corelated = ['Percipitation_average_6d',
             'Percipitation_average_10d',
             'Percipitation_average_25d',
             'CloudCover_average_9d',
             'Sun_duration_average_9d'
            ]
# 'DValue' is inserted first, so it is column 0 and the features follow it
data_corelated['DValue'] = copy.deepcopy(dnew['DValue'])

# NOTE(review): the loop variable reuses the name `corelated`, so once the loop has finished
# `corelated` is the last column name (a string) and no longer the list defined above.
# Any later use of `corelated` (for example len(corelated)) sees that string.
for corelated in corelated:
    data_corelated[corelated] = copy.deepcopy(dnew[corelated])


In [733]:
data_corelated

Unnamed: 0_level_0,DValue,Percipitation_average_6d,Percipitation_average_10d,Percipitation_average_25d,CloudCover_average_9d,Sun_duration_average_9d
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-12-31,,,,,,
2015-01-01,,,,,,
2015-01-02,,,,,,
2015-01-03,,,,,,
2015-01-04,,,,,,
2015-01-05,,,,,,
2015-01-06,,0.03333333,,,,
2015-01-07,,0.03333333,,,,
2015-01-08,,0.03333333,,,,
2015-01-09,,0.03333333,,,62.111111,2.7


## Learning a model of the groundwater level change

In [734]:
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model
from sklearn import tree
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import Normalizer

from sklearn.metrics import mean_squared_error, r2_score

In [661]:
#create arrays X (correlated features) and y ('DValue')
#y = dnew['Value']
# The first 30 rows are skipped -- presumably because the rolling averages (up to 25 days)
# are still NaN there; TODO confirm the intended offset.
# The builtin `float` replaces `np.float`, an alias that has been removed from NumPy.
y = data_corelated['DValue'].values.astype(float)[30:]
# 'DValue' is column 0 of data_corelated, so every column after it is a feature.
# Slicing with 1: does not depend on `corelated`, whose value is a string (not the list
# of names) once the loop that fills data_corelated has run.
X = data_corelated.iloc[:, 1:].values.astype(float)[30:]

In [735]:
y.shape

(702,)

In [736]:
X.shape

(702, 5)

In [737]:
#choose regressor
# Exactly one model is active. The alternatives stay as comments so that switching is a
# one-line change and no estimator is constructed only to be overwritten immediately.

# regressor = linear_model.LinearRegression()
# regressor = DecisionTreeRegressor()
# regressor = RandomForestRegressor()
# regressor = SVR()
# fixed random_state so that re-running the notebook reproduces the same fit
regressor = GradientBoostingRegressor(random_state=0)

In [738]:
# predict: cross-validated predictions for every row of X using the chosen regressor
# and 3 folds (cv = 3)
predicted = cross_val_predict(regressor, X, y, cv = 3)

In [751]:
#plot predicted values and real value
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
# x axis is the sample (day) number, taken from y instead of a hard-coded 702
days = range(len(y))
ax.plot(days, y, label="True value")
ax.plot(days, predicted, label="Prediction")
ax.legend(loc=1, borderaxespad=1)
ax.set_xlabel('Time (days)')
ax.set_ylabel('Groundwater level change (m)')
plt.show()

<IPython.core.display.Javascript object>

In [550]:
fig.savefig('prediction.png', dpi=300)

In [740]:
#evaluate results
# mean squared error and R^2 of the cross-validated predictions against the true values
scores = (mean_squared_error(y, predicted), r2_score(y, predicted))
mse, r2 = scores
scores

(0.00027860748557629526, 0.44601319884917334)

In [741]:
#evaluation for all regressors
regressor_list = (linear_model.LinearRegression(),
            DecisionTreeRegressor(),
            RandomForestRegressor(),
            SVR(),
            MLPRegressor(),
            GradientBoostingRegressor())

# The loop uses its own variable names so that it does not overwrite `regressor`,
# `predicted`, `mse` and `r2`, which belong to the model chosen earlier and are used by
# later cells. Each output line starts with the model's class name so the scores can be
# attributed without counting lines.
# NOTE(review): the features are passed to every model unscaled; the recorded output shows a
# very poor score for one model (fifth line), which may be related -- confirm before
# comparing models on these numbers.
for candidate in regressor_list:
    candidate_predicted = cross_val_predict(candidate, X, y, cv = 3)
    candidate_mse = mean_squared_error(y, candidate_predicted)
    candidate_r2 = r2_score(y, candidate_predicted)
    print(type(candidate).__name__, candidate_mse, candidate_r2)

0.0002842619130699275 0.4347698605983825
0.0005118091168089618 -0.017688002295910055
0.0003246023736328884 0.3545563564349925
0.0011229515669520498 -1.232891715512487
1.999281665921732 -3974.3980493828903
0.0002804034444136253 0.4424420905954486


In [742]:
#sum
# Rebuild absolute groundwater levels from the predicted daily changes.
# The builtin `float` replaces `np.float`, an alias that has been removed from NumPy.
value = ds['Value'].values.astype(float)[30:]

# The series is anchored on the first observed level value[0]; predicted[0] is skipped and
# every later predicted change is added to the running total. np.cumsum performs the same
# left-to-right additions as a manual loop; the result is a NumPy array.
predicted_value = np.cumsum(np.concatenate(([value[0]], predicted[1:])))

In [743]:
# plot the observed groundwater level against the level rebuilt from predicted changes
fig, ax = plt.subplots()
fig.set_size_inches(10, 5)
# x axis is the sample (day) number, taken from the data instead of a hard-coded 702;
# date tick formatting is not applied because this axis holds integers, not dates
days = range(len(value))
ax.plot(days, value, label="True value")
ax.plot(days, predicted_value, label="Prediction")
ax.legend(loc=1, borderaxespad=1)
ax.set_xlabel('Time (days)')
ax.set_ylabel('Groundwater level (m)')
plt.show()

<IPython.core.display.Javascript object>

In [555]:
fig.savefig('prediction_cum.png', dpi=300)

In [750]:
# daily groundwater change (from row 30 onward) with its 5- and 100-row rolling means
fig, ax = plt.subplots()
dvalue_tail = ds['DValue'].iloc[30:]
ax.plot(dvalue_tail, label="Daily change")
ax.plot(dvalue_tail.rolling(5).mean(), label="Rolling mean (5d)")
ax.plot(dvalue_tail.rolling(100).mean(), label="Rolling mean (100d)")
# label the lines and axes so the figure can be read on its own
ax.legend(loc=1, borderaxespad=1)
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level change (m)')
plt.show()

<IPython.core.display.Javascript object>

In [745]:
ds['DValue'][30:]


Date
2015-01-30   -0.01
2015-01-31   -0.02
2015-02-01   -0.03
2015-02-02   -0.04
2015-02-03   -0.04
2015-02-04   -0.03
2015-02-05   -0.01
2015-02-06    0.00
2015-02-07    0.04
2015-02-08   -0.01
2015-02-09    0.01
2015-02-10   -0.01
2015-02-11   -0.01
2015-02-12    0.00
2015-02-13    0.02
2015-02-14    0.01
2015-02-15   -0.02
2015-02-16    0.00
2015-02-17    0.02
2015-02-18    0.01
2015-02-19    0.01
2015-02-20    0.00
2015-02-21   -0.01
2015-02-22    0.02
2015-02-23    0.10
2015-02-24    0.02
2015-02-25    0.03
2015-02-26   -0.01
2015-02-27   -0.01
2015-02-28   -0.03
2015-03-01   -0.04
2015-03-02   -0.03
2015-03-03    0.01
2015-03-04   -0.01
2015-03-05   -0.01
2015-03-06    0.02
2015-03-07   -0.04
2015-03-08   -0.02
2015-03-09   -0.01
2015-03-10   -0.02
2015-03-11   -0.01
2015-03-12   -0.02
2015-03-13   -0.01
2015-03-14    0.00
2015-03-15   -0.01
2015-03-16   -0.01
2015-03-17    0.00
2015-03-18   -0.01
2015-03-19    0.01
2015-03-20    0.01
2015-03-21    0.00
2015-03-22    0.00
2015-03