In [24]:
explanatory_cols = ['DOY', 'lat', 'lon', 'season', 'site', 'vegetation', 'year']
data_cols = ['P_ERA', 'TA_ERA', 'PA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'WS_ERA', 'VPD_ERA']
igbp_cols = [col for col in df.columns if col.startswith('igbp')]
regression_cols = data_cols + igbp_cols + [y_variable]


## TODO move this to a different notebook.

To speed up the rest of the computations, we'll take a sample (10%) of the observations. We'll also remove some variables that we don't want to use in the linear regression.

In [25]:
df_sample = df.sample(frac=0.10)

explanatory_df = df_sample[explanatory_cols]
regression_df = df_sample[regression_cols]
regression_df = regression_df.rename(columns={y_variable:'y'})

print("{} observations and {} variables".format(*regression_df.shape))
print("Generating a prediction with these variables: \n  {}".format(
    "\n  ".join(list(
        regression_df.columns
    ))
))

51611 observations and 23 variables
Generating a prediction with these variables: 
  P_ERA
  TA_ERA
  PA_ERA
  SW_IN_ERA
  LW_IN_ERA
  WS_ERA
  VPD_ERA
  igbp_BSV
  igbp_CRO
  igbp_CSH
  igbp_DBF
  igbp_DNF
  igbp_EBF
  igbp_ENF
  igbp_GRA
  igbp_MF
  igbp_OSH
  igbp_SAV
  igbp_SNO
  igbp_WAT
  igbp_WET
  igbp_WSA
  y


We'll use these variables to predict y. Note the exclusion of lat/lon and day of year from the variables that we are using. 

## Fluxnet Structure Exploration

Linear models work well *at one site* but this is confounded by:

* lat/lon
* day of year
* environment type

We want to explore the relationship between the linear model an these site identifiers. So basically if we are able to reduce the dimensionality of the variables and still see a relationship between the reduced dimensions and these site characteristics, then we stand a good chance of being able to create a reasonable model. We'll use [umap](https://github.com/lmcinnes/umap) - Uniform Manifold Approximation and Projection.

In [27]:
import umap
reduct = umap.UMAP(verbose=True, n_epochs=None)#, n_neighbors=30)

In [28]:
reduct.fit(regression_df.values)

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=15, negative_sample_rate=5, random_state=None,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=True)
Construct fuzzy simplicial set
	 0  /  16
	 1  /  16
	 2  /  16
Construct embedding
	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs


UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
   learning_rate=1.0, local_connectivity=1.0, metric='euclidean',
   metric_kwds=None, min_dist=0.1, n_components=2, n_epochs=None,
   n_neighbors=15, negative_sample_rate=5, random_state=None,
   repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
   target_metric='categorical', target_metric_kwds=None,
   target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
   transform_seed=42, verbose=True)

In [29]:
embedding = reduct.embedding_
embedding

array([[ -7.250101 ,   5.139064 ],
       [  7.202752 ,  -6.0317073],
       [  6.5984445,   6.518487 ],
       ...,
       [  1.5203034, -12.884718 ],
       [  3.4749465,   8.07705  ],
       [ -6.2676167,   8.975188 ]], dtype=float32)

In [30]:
umapped_df = explanatory_df.join(pd.DataFrame(embedding, index=df_sample.index, columns=['x0', 'x1']))

We can explore these reduced dimensions by coloring a scatter plot according to different variables that we believe should have structure in this space.

In [31]:
from holoviews.operation.datashader import rasterize, datashade
from datashader import transfer_functions as tf, reductions as rd

We believe that `lat` will be strongly correlated with the new dimensions:

In [35]:
by = 'lat'
title = 'Observations colored by {}'

scat = hv.Scatter(umapped_df, kdims=['x0', 'x1'])
p = scat.options(color_index=by, cmap='viridis', colorbar=True, 
                 alpha=0.2, size=1, width=350).relabel(title.format(by))

p + rasterize(p, x_sampling=1, y_sampling=1, aggregator=rd.std(by))\
        .options(colorbar=True, width=350).relabel('Aggregated by standard deviation')

We believe that `lon` on the other hand will not show a strong relationship.

In [36]:
by = 'lon'

p = scat.options(color_index=by, cmap='viridis',colorbar=True, 
                 alpha=0.2, size=1, width=350).relabel(title.format(by))

p + rasterize(p, x_sampling=1, y_sampling=1, aggregator=rd.std(by))\
        .options(colorbar=True, width=350).relabel('Aggregated by standard deviation')

Day of year should show a relationship. Note that aggregating by day of year is a little tricky since 366 == 0. So we'll use a circular colormap instead.

In [37]:
import colorcet as cc

by = 'DOY'

scat.options(color_index=by, cmap=cc.cm['cyclic_mrybm_35_75_c68'], 
             alpha=0.2, height=500, width=550, colorbar=True).relabel(title.format(by))

Similarly season should show an even stronger relationship since we used lat and day of year to set it.

In [38]:
by = 'season'
title = 'Observation counts grouped by {}'

datashade(scat.groupby(by), x_sampling=1, y_sampling=1).layout().relabel(title.format(by)).cols(2)

### Taking a closer look at vegetation

Since we included vegetation in our variables, we would expect the relationship between it and the reduced dimensions to be strong.

In [39]:
by = 'vegetation'
datashade(scat.groupby(by), x_sampling=1, y_sampling=1).layout().relabel(title.format(by)).cols(3)