In [136]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [137]:
# Load the training table used throughout the notebook.
# Alternative source, currently disabled:
#df = pd.read_csv('../elena/barros_2011_training.csv')
df = pd.read_csv(filepath_or_buffer='../elena/data/training_final_latcorr.csv')

## Clean / Reverse Engineer Features
- Electricity generated (kWh)
- Drop redundant columns
- Drop non-lc columns

In [138]:
# Recover generated electricity from the reservoir area and the area-per-electricity
# ratio, then drop the columns that are not used as model inputs.
df = (
    df.assign(kWh=df['Area_km2'] / df['Area / Electricity'])
    .drop(
        columns=[
            # location, identifiers and per-kWh emission columns
            'Longitude', 'Latitude', 'CO2 (g/kWh)', 'CH4 (g/kWh)',
            'Area / Electricity', 'Name', 'field_1',
            # seasonal averages without the `_lc` suffix
            'temp_spring_avg', 'temp_summer_avg', 'temp_fall_avg', 'temp_winter_avg',
            'NDVI_spring_avg', 'NDVI_summer_avg', 'NDVI_fall_avg', 'NDVI_winter_avg',
            'npp_spring_avg', 'npp_summer_avg', 'npp_fall_avg', 'npp_winter_avg',
        ]
    )
)

In [139]:
# Display the column labels of `df` as a sanity check on the feature set.
df.columns

Index(['CO2 (mg C m¯² d¯¹)', 'CH4 (mg C m-2 d-1)', 'Area_km2', 'Age',
       'Volume_km3', 'Areakm2_div_Volkm3', 'org_c', 'temp_annual_avg',
       'temp_spring_avg_lc', 'temp_summer_avg_lc', 'temp_fall_avg_lc',
       'temp_winter_avg_lc', 'NDVI_annual_avg', 'NDVI_spring_avg_lc',
       'NDVI_summer_avg_lc', 'NDVI_fall_avg_lc', 'NDVI_winter_avg_lc',
       'npp_annual_avg', 'npp_spring_avg_lc', 'npp_summer_avg_lc',
       'npp_fall_avg_lc', 'npp_winter_avg_lc', 'erosion', 'kWh'],
      dtype='object')

In [140]:
# Display (rows, columns) of `df`.
df.shape

(154, 24)

In [None]:
# Create the CO2 data frame

In [141]:
# CO2 modelling frame: remove the CH4 column and keep only the rows
# where the CO2 value is present.
co2 = (
    df.drop(columns=['CH4 (mg C m-2 d-1)'])
    .dropna(subset=['CO2 (mg C m¯² d¯¹)'])
)

## XGBoost

In [146]:
from sklearn.model_selection import train_test_split

# Target is the CO2 column; every other column of `co2` is a predictor.
Y = co2['CO2 (mg C m¯² d¯¹)']
X = co2.drop(columns='CO2 (mg C m¯² d¯¹)')

# Hold out 20% of the rows for testing ...
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# ... then 25% of the remaining rows for validation (about 60/20/20 overall).
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.25, random_state=42
)

In [147]:
import xgboost

In [148]:
# Baseline: XGBRegressor with default hyper-parameters, fitted on the training split.
xgb_reg = xgboost.XGBRegressor().fit(x_train, y_train)

# In-sample predictions on the training rows.
y_predict = xgb_reg.predict(x_train)

In [149]:
# R² of the baseline model on the training rows and on the held-out test rows.
print('train r2:', xgb_reg.score(X=x_train, y=y_train))
print('test r2 :', xgb_reg.score(X=x_test, y=y_test))

train r2: 0.9896469773364932
test r2 : -0.049770611473818116


In [150]:
# Refit the same estimator while evaluating on the validation split after every
# boosting round; training stops once the validation metric has not improved
# for 2 consecutive rounds.
xgb_reg.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=2,
)

[0]	validation_0-rmse:720.45819
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:581.60626
[2]	validation_0-rmse:506.72189
[3]	validation_0-rmse:486.00159
[4]	validation_0-rmse:459.15906
[5]	validation_0-rmse:445.06494
[6]	validation_0-rmse:440.24454
[7]	validation_0-rmse:440.47867
[8]	validation_0-rmse:437.98999
[9]	validation_0-rmse:438.61774
[10]	validation_0-rmse:439.39511
Stopping. Best iteration:
[8]	validation_0-rmse:437.98999



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [151]:
# R² of the early-stopped model on the training rows and on the held-out test rows.
print('train r2:', xgb_reg.score(X=x_train, y=y_train))
print('test r2 :', xgb_reg.score(X=x_test, y=y_test))

train r2: 0.9462169162814192
test r2 : 0.16952930228720975


#### Comparing predictions to actual values
- Are there patterns in the features where the residuals are low or high?

In [70]:
# Pair each test-set target with the model's prediction for the same row
# (row order follows x_test / y_test).
predictions = pd.DataFrame(
    data=list(zip(y_test, xgb_reg.predict(x_test))),
    columns=['actual', 'predicted'],
)

In [78]:
# Test features renumbered 0..n-1 so they line up with `predictions`,
# with the actual and predicted targets appended as the last two columns.
comparing = x_test.reset_index(drop=True).assign(
    actual=predictions['actual'],
    predicted=predictions['predicted'],
)

### scaling values

In [152]:
from sklearn.preprocessing import StandardScaler

# Standardise every column of the CO2 frame with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before any train/test split, so
# statistics of the held-out rows leak into the scaled features. Fit it on the
# training rows only if a scale-sensitive model is ever trained on this frame.
data = co2.copy()
scaler = StandardScaler()

# Keep co2's row labels. `co2` was row-filtered and kept its original labels;
# without `index=` the scaled frame would be renumbered 0..n-1 and would no longer
# align by label with `co2` (or with a target Series taken from it).
co2_scaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=co2.columns,
    index=co2.index,
)

In [168]:
from sklearn.model_selection import train_test_split

# Predictors come from the scaled frame; the target is taken from the
# unscaled `co2` frame.
X = co2_scaled.drop(columns='CO2 (mg C m¯² d¯¹)')
Y = co2['CO2 (mg C m¯² d¯¹)']

# Hold out 20% of the rows for testing, then 20% of the remainder for validation.
# NOTE(review): the unscaled run used test_size=0.25 for the validation split, so
# the two runs do not train on the same rows — confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Explicit DMatrix conversion, currently disabled:
#x_train = xgboost.DMatrix(x_train)
#x_test = xgboost.DMatrix(x_test)

In [165]:
# Display (rows, columns) of the training features.
x_train.shape

(90, 22)

In [169]:
# Default-parameter XGBRegressor trained on the scaled features. The validation
# split is evaluated after every boosting round and training stops once the
# validation metric has not improved for 5 consecutive rounds.
xgb_reg2 = xgboost.XGBRegressor()
xgb_reg2.fit(
    x_train,
    y_train,
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=5,
)

[0]	validation_0-rmse:775.38696
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:638.68597
[2]	validation_0-rmse:587.67779
[3]	validation_0-rmse:533.64447
[4]	validation_0-rmse:507.27310
[5]	validation_0-rmse:483.58633
[6]	validation_0-rmse:475.38309
[7]	validation_0-rmse:467.63269
[8]	validation_0-rmse:462.73908
[9]	validation_0-rmse:461.95099
[10]	validation_0-rmse:461.06979
[11]	validation_0-rmse:462.17993
[12]	validation_0-rmse:465.94910
[13]	validation_0-rmse:465.09018
[14]	validation_0-rmse:465.47409
[15]	validation_0-rmse:468.98828
Stopping. Best iteration:
[10]	validation_0-rmse:461.06979



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [170]:
# R² of the scaled-feature model on the training rows and on the held-out test rows.
print('train r2:', xgb_reg2.score(X=x_train, y=y_train))
print('test  r2:', xgb_reg2.score(X=x_test, y=y_test))

train r2: 0.9646711432046045
test  r2: 0.36061999960760294


## grid search

In [188]:
# Single 80/20 split with no separate validation set: the grid search below
# cross-validates inside the training rows.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [222]:
from sklearn.model_selection import GridSearchCV

grid_values = {
    "n_estimators": range(0,50,10),
    "learning_rate" : np.linspace(.001,1,10)
}

grid_search_xgb = GridSearchCV(xgb_reg2, grid_values, scoring='r2', cv=2, n_jobs=-1)
%time grid_search_xgb.fit(x_train, y_train)

CPU times: user 359 ms, sys: 12.3 ms, total: 372 ms
Wall time: 699 ms


GridSearchCV(cv=2, error_score=nan,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    colsample_bylevel=1, colsample_bynode=1,
                                    colsample_bytree=1, gamma=0, gpu_id=-1,
                                    importance_type='gain',
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_delta_step=0,
                                    max_depth=6, min_child_weight=1,
                                    missing=nan, monotone_constraints='()',
                                    n_estimators=100, n_jobs...
                                    random_state=0, reg_alpha=0, reg_lambda=1,
                                    scale_pos_weight=1, subsample=1,
                                    tree_method='exact', validate_parameters=1,
                                    verbosity=None),
             iid='deprecated', n_jobs=-1,
    

In [223]:
# Hyper-parameter combination with the highest mean cross-validated score.
grid_search_xgb.best_params_

{'learning_rate': 0.112, 'n_estimators': 20}

In [224]:
# Mean cross-validated score (R², per the `scoring` argument) of the best combination.
grid_search_xgb.best_score_

0.24784459905708656

In [225]:
# Refit a fresh model with the hyper-parameters selected by the grid search.
# Reading them from `best_params_` (rather than copying the numbers by hand)
# keeps this cell correct when the search is re-run with different data or grid.
xgb_best = xgboost.XGBRegressor(**grid_search_xgb.best_params_)
xgb_best.fit(x_train, y_train)

# Print both scores: a notebook cell only displays its last bare expression,
# so an unprinted train score would be computed and silently discarded.
print('train r2:', xgb_best.score(x_train, y_train))
print('test  r2:', xgb_best.score(x_test, y_test))

0.18643214998674895