In [75]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [76]:
# Google Cloud Datastore client library; used by the query helpers below.
from google.cloud import datastore

# Module-level client bound to the 'rareyetem' project. fetchRain and fetchSLA
# read this name as a global when they build their queries.
datastore_client = datastore.Client(project='rareyetem')

In [77]:
# Parsed station tables keyed by CSV path, so the file is read once per kernel
# session instead of once per fetchRain call. Re-run this cell to pick up edits
# made to the CSV on disk.
_STATION_TABLES = {}


def _stationIDForProvince(province, stationFile='static/thailand_metstation.csv', default='327501'):
    """Return the ``stncode`` of the first row in ``stationFile`` whose province matches.

    Falls back to ``default`` when no row of the station file has that province.
    """
    stations = _STATION_TABLES.get(stationFile)
    if stations is None:
        stations = pd.read_csv(stationFile, dtype={"province": str, "region":str, "stncode":str})
        _STATION_TABLES[stationFile] = stations
    codes = stations.loc[stations["province"] == province, "stncode"]
    return default if codes.empty else codes.iloc[0]


def fetchRain(startDate, endDate, province):
    """Query the 'stndate' kind for one province's station within a date window.

    Parameters
    ----------
    startDate, endDate : values compared against the entities' 'date' property.
        Both bounds are exclusive ('>' and '<'). The notebook passes
        'YYYY-MM-DD' strings.
    province : province name as it appears in the station CSV. It is mapped to
        the first matching station code; station '327501' is used when the
        province is not in the file.

    Returns
    -------
    The result of ``query.fetch()``, unmodified. Callers iterate it and read
    the 'stn', 'date' and 'rainmm' properties of each entity.
    """
    stationID = _stationIDForProvince(province)

    query = datastore_client.query(kind='stndate')
    query.add_filter('stn', '=', stationID)
    query.add_filter('date','>', startDate)
    query.add_filter('date','<', endDate)
    return query.fetch()

def fetchSLA(startDate, endDate):
    """Query the 'date' kind for entities dated strictly between the two bounds.

    Both bounds are exclusive ('>' and '<'). Results are requested in
    descending 'date' order, and the value of ``query.fetch()`` is returned
    as-is.
    """
    slaQuery = datastore_client.query(kind='date')
    # Lower bound first, then upper bound, both on the 'date' property.
    for operator, bound in (('>', startDate), ('<', endDate)):
        slaQuery.add_filter('date', operator, bound)
    slaQuery.order = ['-date']
    return slaQuery.fetch()

# Training window: the 365 days ending now, formatted as 'YYYY-MM-DD' strings.
# These strings are what the Datastore 'date' filters are compared against.
sDate = (datetime.now() - timedelta(days=365)).strftime('%Y-%m-%d')
eDate = datetime.now().strftime('%Y-%m-%d')

# Station table used to populate the training data; province, region and
# stncode are kept as strings.
provinces = pd.read_csv(
    'static/thailand_metstation.csv',
    dtype={"province": str, "region":str, "stncode":str},
)

# Query the bob and s-index values from Google Cloud Datastore and key them by
# date: date -> [bpos - bneg, spos - sneg].
sla = fetchSLA(sDate, eDate)
bsIndex = {
    entity['date']: [entity['bpos'] - entity['bneg'], entity['spos'] - entity['sneg']]
    for entity in sla
}

#print(bsIndex)
# Build one training row per (province, date):
# [MMDD, bob, smt, lat, lng, rainmm]. MMDD is kept as a string here and is
# converted to a number when the rows are turned into a float array.
data = []
for p in provinces.province:
    # Collect this province's rainfall by date, remembering which station the
    # entities came from so its coordinates can be looked up.
    stnId = None
    rainInfo = {}
    for r in fetchRain(sDate, eDate, p):
        stnId = r['stn']
        rainInfo[r['date']] = r['rainmm']

    pLocation = provinces[provinces.stncode == stnId]
    if pLocation.empty:
        # Either no rain entities came back for this province in the window,
        # or their station code is not in the station table. Without
        # coordinates there is no usable feature row, so skip the province
        # instead of failing on iloc[0].
        continue
    lat = pLocation.iloc[0].lat
    lng = pLocation.iloc[0].lng

    # One row per date that has index values; dates without a rain record for
    # this station default to 0.0 mm.
    for date, (bob, smt) in bsIndex.items():
        monthDay = ''.join(date.split('-')[1:])
        data.append([monthDay, bob, smt, lat, lng, rainInfo.get(date, 0.0)])

# Assemble the training frame from the collected rows. The row count is
# inferred (-1) rather than hard-coded, because it depends on how many
# provinces and dates the queries returned for the current window.
# NOTE: any lag feature (e.g. a shifted 'rainmm') has to be derived from `df`
# after this point; the frame does not exist before it is created here.
trainColumns = ['date','bob','smt','lat','lng','rainmm']
df = pd.DataFrame(np.array(data, dtype=float).reshape(-1, len(trainColumns)), columns = trainColumns)
df = df.drop_duplicates()

# normalisation
# Features are every column but the last; the target is the last column (rainmm).
data = df.values #returns a numpy array
x = data[:,:-1]
y = data[:,-1]

# Optional min-max scaling of the features, currently disabled.
#min_max_scaler = preprocessing.MinMaxScaler()
#x = min_max_scaler.fit_transform(x)

In [36]:
# creating lag

In [82]:
# model training
# `loss` is deliberately left at its default. Least squares is the default loss
# of GradientBoostingRegressor, while the old alias 'ls' was deprecated in
# scikit-learn 1.0 and removed in 1.2, so passing it fails on current releases.
# `random_state` is fixed so that re-running the notebook fits the same model.
params = {'n_estimators': 500,
          'max_depth': 13,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'random_state': 42}

reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(x, y)

GradientBoostingRegressor(learning_rate=0.01, max_depth=13, min_samples_split=5,
                          n_estimators=500)

In [81]:
# x and y are the same arrays the model was fitted on, so this is the training
# (in-sample) error, not a score on held-out data.
mse = mean_squared_error(y, reg.predict(x))
print("The mean squared error (MSE) on training set: {:.4f}".format(mse))

The mean squared error (MSE) on test set: 21.8175


In [72]:
# Display the model's predictions for the training features. The raw values
# can be slightly negative, as the rendered output shows.
reg.predict(x)

array([-0.01821496, -0.06748218, -0.01224248, ...,  0.16235412,
        0.37266203,  0.0244207 ])

In [83]:
from joblib import dump, load

# Persist the fitted regressor to disk; the prediction cell reloads it from
# this same path with load().
dump(reg, 'static/gboost.joblib')

['static/gboost.joblib']

In [74]:
# Predict rainfall for every row of the station table, for a single date and a
# single pair of index values.
date='2021-02-01'
bob=1
smt=1

reg = load('static/gboost.joblib')
stationFile = 'static/thailand_metstation.csv'
provinces = pd.read_csv(stationFile, dtype={"province": str, "region":str, "stncode":str})

# Feature order matches training: [MMDD, bob, smt, lat, lng]. Everything is
# built as float explicitly; mixing the 'MMDD' string with numbers in one
# np.array would produce a string array and rely on implicit conversion.
monthDay = float(''.join(date.split('-')[1:]))
nStations = len(provinces)
x_test = np.column_stack([
    np.full(nStations, monthDay),
    np.full(nStations, float(bob)),
    np.full(nStations, float(smt)),
    provinces.lat.to_numpy(dtype=float),
    provinces.lng.to_numpy(dtype=float),
])

# One batched predict call for all stations. An empty station table yields an
# empty result instead of asking the model to predict on zero rows.
predictions = reg.predict(x_test) if nStations else []

# Negative predictions are clamped to 0; the list is in station-table row order.
rainPredict = [0 if pred < 0 else pred for pred in predictions]

rainPredict

[4.017584761322311,
 0,
 1.1553557283881621,
 27.727729717168636,
 10.409966946520061,
 6.547492355874908,
 8.899371088250671,
 8.327206086341034,
 47.40919648179861,
 0.3504317694389019,
 0.2276845261160947,
 11.441439099623752,
 8.78076001746648,
 4.017584761322311,
 4.017584761322311,
 0.5353130296081874,
 2.410767771403711,
 8.245599888666263,
 0.8845572899627272,
 3.545339693781684,
 3.545339693781684,
 2.54677955541033,
 32.96872615715629,
 19.8060315008078,
 42.63377413882235,
 0,
 2.1836438110444667,
 0,
 32.059081969400026,
 8.17822035342323,
 0,
 2.4751694762623244,
 8.9855354482419,
 0.2801746830050911,
 1.5959449344802474,
 5.071591496301955,
 0.14325357678013476,
 2.368950923894357,
 0,
 6.413655413752093,
 4.636307590366567,
 1.6314731490404522,
 3.620886675566674,
 2.4318230841184643,
 1.9772132611644797,
 1.0563372605575154,
 3.911212697919157,
 0,
 1.8762331358192599,
 0,
 10.240683076798813,
 0,
 0,
 0,
 20.523824264122627,
 0.6647318343476063,
 4.1998812623626804,
 5