In [36]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm
import scipy.stats  as stats
import xgboost as xgb


datapath_aod = "../data_local/MOD08_E3.A2017105.006.2017118135856_AoD.npy"
datapath_vap = "../data_local/MOD08_E3.A2017105.006.2017118135856_Vapor.npy"

print("Loading data from: " + datapath_aod)
aod =  np.load(datapath_aod)
vap =  np.load(datapath_vap)

pred_datapath_aod =  "../data_local/MYD08_E3.A2017105.006.2017118135357_AoD.npy"
pred_datapath_vap = "../data_local/MYD08_E3.A2017105.006.2017118135357_Vapor.npy"

print("Loading data from: " + pred_datapath_aod)
pred_aod =  np.load(pred_datapath_aod)
pred_vap =  np.load(pred_datapath_vap)


ols_df  = pd.DataFrame({"vap" : vap.ravel(), "A" : aod[0,:,:].ravel()
                        , "B" : aod[1,:,:].ravel(),"C" : aod[2,:,:].ravel() })
pred_df = pd.DataFrame({"vap" : pred_vap.ravel(),"A" : pred_aod[0,:,:].ravel()
                       , "B" : pred_aod[1,:,:].ravel(),"C" : pred_aod[2,:,:].ravel() })

dtrain = xgb.DMatrix(ols_df[["A","B","C"]].values, label=ols_df["vap"])
dtarget = xgb.DMatrix(pred_df[["A","B","C"]].values, label=pred_df["vap"])

param = {'max_depth':8, 'eta':1, 'silent':1, 'objective':'reg:linear',
        'subsample':0.8, 'min_child_weight':5}
param['nthread'] = 4
param['eval_metric'] = 'mae'
evallist  = [(dtarget,'eval'), (dtrain,'train')]
num_round = 200
bst = xgb.train(param,  dtrain, num_round, evallist )
print(bst)


Loading data from: ../data_local/MOD08_E3.A2017105.006.2017118135856_AoD.npy
Loading data from: ../data_local/MYD08_E3.A2017105.006.2017118135357_AoD.npy
[0]	eval-mae:0.907156	train-mae:0.652314
[1]	eval-mae:0.900439	train-mae:0.640548
[2]	eval-mae:0.900786	train-mae:0.636656
[3]	eval-mae:0.893179	train-mae:0.626113
[4]	eval-mae:0.892232	train-mae:0.623672
[5]	eval-mae:0.892641	train-mae:0.620283
[6]	eval-mae:0.892912	train-mae:0.6184
[7]	eval-mae:0.894092	train-mae:0.614484
[8]	eval-mae:0.894246	train-mae:0.613258
[9]	eval-mae:0.893609	train-mae:0.609959
[10]	eval-mae:0.893534	train-mae:0.609907
[11]	eval-mae:0.894033	train-mae:0.608837
[12]	eval-mae:0.894729	train-mae:0.606733
[13]	eval-mae:0.895039	train-mae:0.604725
[14]	eval-mae:0.896727	train-mae:0.60239
[15]	eval-mae:0.896825	train-mae:0.602044
[16]	eval-mae:0.897454	train-mae:0.601249
[17]	eval-mae:0.89763	train-mae:0.600391
[18]	eval-mae:0.898241	train-mae:0.598672
[19]	eval-mae:0.898355	train-mae:0.598107
[20]	eval-mae:0.8992

[191]	eval-mae:0.952552	train-mae:0.499319
[192]	eval-mae:0.952019	train-mae:0.498663
[193]	eval-mae:0.952319	train-mae:0.498289
[194]	eval-mae:0.953083	train-mae:0.497751
[195]	eval-mae:0.953007	train-mae:0.497285
[196]	eval-mae:0.952577	train-mae:0.496881
[197]	eval-mae:0.952751	train-mae:0.496961
[198]	eval-mae:0.952768	train-mae:0.497145
[199]	eval-mae:0.953109	train-mae:0.496355
<xgboost.core.Booster object at 0x7fa8c0439590>


In [22]:
dtrain.feature_names

[u'A', u'B', u'C']

In [25]:
ols_df["C"].values


array([-9.99900047, -9.99900047, -9.99900047, ..., -9.99900047,
       -9.99900047, -9.99900047])

In [102]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as sm
import scipy.stats  as stats
import xgboost as xgb


datapath_aod = "../data_local/datatable"
print("Loading data from: " + datapath_aod)
dat =  pd.read_pickle(datapath_aod)
dat.train = dat.head( dat.shape[0]/90)
dat.oot = dat.tail( dat.shape[0]/10)
dtrain = xgb.DMatrix(dat.train[["aod","cloud","long","lat","year","day"]].values, label=dat.train["vapor"])
dtarget = xgb.DMatrix(dat.oot[["aod","cloud","long","lat","year","day"]].values, label=dat.oot["vapor"])

param = {'max_depth':15, 'eta':1, 'silent':1, 'objective':'reg:linear',
        'subsample':0.8, 'min_child_weight':5}
param['nthread'] = 4
param['eval_metric'] = 'rmse'
evallist  = [(dtarget,'eval'), (dtrain,'train')]
num_round = 500

bst = xgb.train(param,  dtrain, num_round, evallist,verbose_eval=False )

dtrain_bench = xgb.DMatrix(dat.train[["cloud","long","lat","year","day"]].values, label=dat.train["vapor"])
dtarget_bench = xgb.DMatrix(dat.oot[["cloud","long","lat","year","day"]].values, label=dat.oot["vapor"])
evallist  = [(dtarget_bench,'eval'), (dtrain_bench,'train')]
bst_eval = xgb.train(param,  dtrain_bench, num_round, evallist,verbose_eval=False )

print("Eval Model",bst.eval(dtarget))
print("Bench Model",bst_eval.eval(dtarget_bench))
print("NULL model", np.mean(np.abs(dat["vapor"] - dat["vapor"].mean())))

Loading data from: ../data_local/datatable
('Eval Model', '[0]\teval-rmse:1.683164')
('Bench Model', '[0]\teval-rmse:1.659055')
('NULL model', 1.2789737659928424)


In [95]:
1.131383 - 1.130546

0.0008369999999999767