In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeavePOut, RepeatedKFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler # robust_scale
from sklearn.svm import SVR

In [2]:
# Read the training table from disk; every later cell derives its data from `raw`.
raw = pd.read_csv(
    filepath_or_buffer='../data/training_final_latcorr.csv',
)

In [3]:
# Columns carried forward from the raw table: the CO2 flux measurement first,
# followed by the columns that are later used as model features.
co2_raw_cols = [
    'CO2 (mg C m¯² d¯¹)',
    'Age',
    'org_c',
    'temp_annual_avg',
    'temp_diff_summer_winter_lc',
    'NDVI_annual_avg',
    'npp_annual_avg',
    'erosion',
    'precip',
]

# Work on an independent copy so that adding columns to `clean` never touches `raw`.
clean = raw.loc[:, co2_raw_cols].copy()

# Sanity check: the row count must be unchanged, only the column count shrinks.
print("raw.shape   =", raw.shape)
print("clean.shape =", clean.shape)

raw.shape   = (154, 44)
clean.shape = (154, 9)


In [4]:
# The natural log is only defined for strictly positive fluxes. Mask zero and
# negative measurements to NaN *before* taking the log so that:
#   * NumPy no longer emits a RuntimeWarning for those rows, and
#   * a zero flux becomes NaN (removed by the dropna() in the next cell)
#     instead of -inf, which dropna() keeps and which scikit-learn estimators
#     reject as a non-finite target.
# Rows with a negative flux end up as NaN exactly as before, but now explicitly.
co2_flux = clean['CO2 (mg C m¯² d¯¹)']
clean['log_co2_emissions'] = np.log(co2_flux.where(co2_flux > 0))


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
# Keep only complete rows: a missing value in any column (including the
# log-transformed target) removes the whole row.
reduced = clean.dropna(axis=0, how='any')
reduced.shape

(120, 10)

In [6]:
# Model the log-transformed flux. The untransformed alternative would be the
# raw 'CO2 (mg C m¯² d¯¹)' column.
target = 'log_co2_emissions'

# Predictor columns, in the order they are fed to the estimators.
features = [
    'Age',
    'org_c',
    'temp_annual_avg',
    'temp_diff_summer_winter_lc',
    'NDVI_annual_avg',
    'npp_annual_avg',
    'erosion',
    'precip',
]

# Design matrix (DataFrame) and target vector (Series) taken from the complete rows.
X = reduced.loc[:, features]
y = reduced.loc[:, target]

In [7]:
# Hold out 25% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Report the size of each piece of the train/test split.
for label, part in (
    ("X_train.shape =", X_train),
    ("X_test.shape  =", X_test),
    ("y_train.shape =", y_train),
    ("y_test.shape  =", y_test),
):
    print(label, part.shape)

X_train.shape = (90, 8)
X_test.shape  = (30, 8)
y_train.shape = (90,)
y_test.shape  = (30,)


In [9]:
# Baseline model: standardise the features, then k-nearest-neighbours regression
# with scikit-learn's default hyper-parameters. The step names ('scaler', 'knr')
# are the prefixes used for the grid-search parameter names further down.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('knr', KNeighborsRegressor()),
]
pipeline = Pipeline(steps=baseline_steps)

pipeline.fit(X_train, y_train)

# Pipeline.score delegates to the final regressor's default score.
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)

print("Train score =", train_score)
print("Test score  =", test_score)

Train score = 0.6089452371310761
Test score  = 0.49710373213557973


In [10]:
# Hyper-parameter grid for the 'knr' step of the pipeline.
# The neighbour count is capped just below half the training rows; presumably
# this is because each 2-fold training split only holds about half of them.
neighbor_cap = round(len(y_train) / 2)
parameters = {
    'knr__n_neighbors': range(1, neighbor_cap),
    'knr__weights': ['uniform', 'distance'],
}

# 2-fold cross-validation repeated 10 times with a fixed seed for reproducibility.
my_cv = RepeatedKFold(n_splits=2, n_repeats=10, random_state=0)

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=my_cv,
    n_jobs=-1,
    scoring='r2',
)
grid_search.fit(X_train, y_train)

# Mean cross-validated R² of the winning parameter combination.
print("Best score: %0.3f" % grid_search.best_score_)

# Full parameter dict of the refitted pipeline; only the tuned entries are printed.
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

# Score of the refitted best pipeline on the held-out test set (displayed as the cell output).
grid_search.best_estimator_.score(X_test, y_test)

Best score: 0.245
	knr__n_neighbors: 5
	knr__weights: 'distance'


0.6083936912199917