# Water4Cities Infrastructure to support Data Mining
### W4C Webinar #1
Klemen Kenda, Matej Senožetnik, JSI @ SingularLogic, Nov 24th 2017

## Importing libraries

In [None]:
# loading data
import io
import urllib.request

# data manipulation
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
# enable interactive plot in the notebook
%matplotlib notebook

# machine learning methods
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

## Loading and formatting the data

In [None]:
# API URL to underground water levels in Ljubljana aquifer
# NOTE(review): the original comment described station id 85076
# ("Lj - RTV (0261) Ljubljansko polje"), but the request below asks for
# station_id=85069 - confirm which station is intended.
url = "http://atena.ijs.si:8080/CollectorAPIServer/undergroundWater?station_id=85069"

# read the whole payload and close the HTTP response deterministically
with urllib.request.urlopen(url) as response:
    jsonStr = response.read().decode('utf-8')

# wrap the payload in StringIO: passing a literal JSON string to read_json
# is deprecated in recent pandas versions
df = pd.read_json(io.StringIO(jsonStr))

# converting unix timestamp (given in milliseconds) to date-time object
df['Date'] = pd.to_datetime(df['LastUpdatedEpoch'], unit='ms')

# remove unnecessary fields; `columns=` is used because the positional
# axis argument of DataFrame.drop is no longer accepted by pandas >= 2.0
df = df.drop(columns=[
    'LastUpdated',
    'LastUpdatedEpoch',
    'Region_id',
    'Region_name',
    'Station_id',
    'Station_name',
    'SystemCodeNumber',
])

## Checking loaded data

In [None]:
# tabular preview of the first five rows
# (for the raw string view try: len(jsonStr), jsonStr[0:100])
df.head(5)

In [None]:
# plotting the groundwater level time series
fig, ax = plt.subplots()
# rotate and right-align the date tick labels so they do not overlap
fig.autofmt_xdate()
ax.plot(df['Date'], df['Value'])
# label the figure so it can be read on its own
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level')
ax.set_title('Underground water level')
# show plot
plt.show()

## Loading some additional data for modeling (weather)

In [None]:
# API URL to weather data for Ljubljana (31/12/2014 - 01/01/2017)
url = "http://atena.ijs.si:8080/CollectorAPIServer/weather?time_from=31/12/2014&time_to=01/01/2017&city=Ljubljana"

# read the whole payload and close the HTTP response deterministically
with urllib.request.urlopen(url) as response:
    jsonStr = response.read().decode('utf-8')

# wrap the payload in StringIO: passing a literal JSON string to read_json
# is deprecated in recent pandas versions
dw = pd.read_json(io.StringIO(jsonStr))

# converting unix timestamp (milliseconds) to date-time object; the timestamp
# is shifted by two hours (2 * 60 * 60 * 1000 ms) and rounded to a whole day
# NOTE(review): the two-hour shift presumably compensates a timezone offset
# so that readings land on the intended calendar day - confirm
dw['Date'] = pd.to_datetime(dw['LastUpdatedEpoch'] + 2 * 60 * 60 * 1000, unit='ms').dt.round("1D")
dw = dw.set_index('Date')

# remove metadata fields that are not used as features; `columns=` is used
# because the positional axis argument of DataFrame.drop is no longer
# accepted by pandas >= 2.0
dw = dw.drop(columns=[
    'City',
    'LastUpdated',
    'LastUpdatedEpoch',
    'Sensor_id',
    'Sensor_name',
    'SystemCodeNumber',
])


In [None]:
# first five rows of the weather data
dw.head(5)

## Data Fusion

In [None]:
# make date a key in our data
# guarded so that re-running this cell (when 'Date' is already the index)
# is a no-op instead of raising a KeyError
if 'Date' in df.columns:
    df = df.set_index('Date')

In [None]:
# first five rows of the groundwater data
df.head(5)

In [None]:
# fused dataset: groundwater and weather columns placed side by side,
# aligned on their indexes
ds = pd.concat([df, dw], axis='columns')

In [None]:
# inspect the first ten rows of the fused dataset
ds.head(10)

In [None]:
# precipitation and snow blanket on one figure; the legend tells the two
# series apart
plt.figure(4)
plt.plot(dw['Percipitation'], label='Precipitation')
plt.plot(dw['Snow_blanket'], label='Snow blanket')
plt.xlabel('Date')
# trailing semicolon suppresses the object repr in the cell output
plt.legend();

## Exploratory data analysis
This is a very big deal for successful data mining; we will not go deep into this. We will just do some very basic visualizations.

In [None]:
# two stacked panels on figure 2: groundwater level (red, top) and
# precipitation (blue, bottom), both against the index of the fused dataset
plt.figure(2)
for panel, column, line_style in ((211, 'Value', 'r'), (212, 'Percipitation', 'b')):
    plt.subplot(panel)
    plt.plot(ds.index, ds[column], line_style)
plt.show()

## Let the fun begin!
# Data-driven modeling
How well can we describe groundwater level with the available features?

In [None]:
# Target vector y: the groundwater level, i.e. the 'Value' column.
# Feature matrix X: the columns at positions 1..8 of the fused dataset.
# NOTE(review): assumes 'Value' is the first column and the weather features
# follow it - confirm against the actual column layout of ds.
# NOTE(review): ds may contain NaN rows where the two sources do not share
# a date - verify before fitting.
y = ds.loc[:, 'Value']
X = ds.iloc[:, slice(1, 9)]

In [None]:
# model selection helpers, estimators and metrics used in this cell and in
# the evaluation cells below
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# pick one regressor; the alternatives are kept for comparison
#regressor = linear_model.LinearRegression()
#regressor = DecisionTreeRegressor()
#regressor = SVR()
#regressor = RandomForestRegressor()
# fixed random_state so that repeated runs produce the same predictions
regressor = GradientBoostingRegressor(random_state=0)

# cross_val_predict returns an array of the same size as `y` where each entry
# is a prediction obtained by cross validation (10 folds here):
predicted = cross_val_predict(regressor, X, y, cv=10)

In [None]:
# plot modeling results: cross-validated predictions (red) against the
# measured target (blue)
fig, ax = plt.subplots()
ax.plot(y.index, predicted, 'r', label='Predicted')
ax.plot(y, 'b', label='Measured')
# label the figure so the two lines can be told apart
ax.set_xlabel('Date')
ax.set_ylabel('Groundwater level')
ax.legend()
plt.show()

In [None]:
# model quality: mean squared error and R^2 score of the cross-validated
# predictions against the measured target
mse, r2 = mean_squared_error(y, predicted), r2_score(y, predicted)
mse, r2


In [None]:
# measured vs. predicted scatterplot; the dashed diagonal marks where
# predicted equals measured
fig, ax = plt.subplots()
ax.scatter(y, predicted, s=2)
lowest, highest = y.min(), y.max()
ax.plot([lowest, highest], [lowest, highest], 'k--', lw=1)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()

## Are we looking at a good target variable?

In [None]:
# new target and feature matrix
# y: full-length slice of the 'Value' column
# X: every row except the first, columns at positions 1..9
# NOTE(review): X has one row fewer than y at this point; the following cell
# trims y accordingly - confirm the two cells are always run together
y = ds['Value'].iloc[:]
X = ds.iloc[1:].iloc[:, 1:10]

In [None]:
# target becomes the change in 'Value' relative to the previous row; the first
# row has no predecessor (its difference is NaN), so it is dropped
y = ds['Value'].diff().iloc[1:]

## How about proper feature engineering?
Domain knowledge can help!

See data fusion / stream modeling demo!