In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
#df = pd.read_csv('../elena/barros_2011_training.csv')
df = pd.read_csv('../elena/data/training_final_latcorr.csv')

## Clean / Reverse Engineer Features
- Electricity generated (kWh)
- Drop redundant columns
- Drop non-lc columns

In [3]:
df['kWh'] = df['Area_km2'] / df['Area / Electricity']
df = df.drop(['Longitude', 'Latitude','CO2 (g/kWh)', 'CH4 (g/kWh)', 'Area / Electricity', 'Name', 'field_1',
              'temp_spring_avg', 'temp_summer_avg', 'temp_fall_avg', 'temp_winter_avg', 
              'NDVI_spring_avg', 'NDVI_summer_avg', 'NDVI_fall_avg', 'NDVI_winter_avg', 
              'npp_spring_avg' , 'npp_summer_avg' , 'npp_fall_avg' , 'npp_winter_avg'], axis=1)

In [4]:
df.columns

Index(['CO2 (mg C m¯² d¯¹)', 'CH4 (mg C m-2 d-1)', 'Area_km2', 'Age',
       'Volume_km3', 'Areakm2_div_Volkm3', 'org_c', 'temp_annual_avg',
       'temp_spring_avg_lc', 'temp_summer_avg_lc', 'temp_fall_avg_lc',
       'temp_winter_avg_lc', 'NDVI_annual_avg', 'NDVI_spring_avg_lc',
       'NDVI_summer_avg_lc', 'NDVI_fall_avg_lc', 'NDVI_winter_avg_lc',
       'npp_annual_avg', 'npp_spring_avg_lc', 'npp_summer_avg_lc',
       'npp_fall_avg_lc', 'npp_winter_avg_lc', 'erosion', 'kWh'],
      dtype='object')

In [5]:
df.shape

(154, 24)

In [None]:
#create co2 df

In [6]:
co2 = df.drop(['CH4 (mg C m-2 d-1)'], axis=1)
co2 = co2[co2['CO2 (mg C m¯² d¯¹)'].notna()]

### Pre-processing

#### knn impute missing values

In [7]:
from sklearn.impute import KNNImputer
model_impute = KNNImputer(n_neighbors=int(np.sqrt(co2.shape[0])))
co2_imputed = model_impute.fit_transform(co2)

In [8]:
co2_imputed = pd.DataFrame(columns=co2.columns, data=co2_imputed)

In [9]:
#check
for column in co2_imputed:
    print(column, ':',  co2_imputed[column].isna().sum())

CO2 (mg C m¯² d¯¹) : 0
Area_km2 : 0
Age : 0
Volume_km3 : 0
Areakm2_div_Volkm3 : 0
org_c : 0
temp_annual_avg : 0
temp_spring_avg_lc : 0
temp_summer_avg_lc : 0
temp_fall_avg_lc : 0
temp_winter_avg_lc : 0
NDVI_annual_avg : 0
NDVI_spring_avg_lc : 0
NDVI_summer_avg_lc : 0
NDVI_fall_avg_lc : 0
NDVI_winter_avg_lc : 0
npp_annual_avg : 0
npp_spring_avg_lc : 0
npp_summer_avg_lc : 0
npp_fall_avg_lc : 0
npp_winter_avg_lc : 0
erosion : 0
kWh : 0


#### scaling features

In [10]:
from sklearn.preprocessing import StandardScaler
data = co2_imputed.drop(['CO2 (mg C m¯² d¯¹)'], axis=1).copy()
scaler = StandardScaler()
co2_scaled = pd.DataFrame(scaler.fit_transform(data),columns=co2.columns[1:])

In [11]:
co2_scaled.head()

Unnamed: 0,Area_km2,Age,Volume_km3,Areakm2_div_Volkm3,org_c,temp_annual_avg,temp_spring_avg_lc,temp_summer_avg_lc,temp_fall_avg_lc,temp_winter_avg_lc,...,NDVI_summer_avg_lc,NDVI_fall_avg_lc,NDVI_winter_avg_lc,npp_annual_avg,npp_spring_avg_lc,npp_summer_avg_lc,npp_fall_avg_lc,npp_winter_avg_lc,erosion,kWh
0,-0.637106,-0.91379,-0.488456,-0.363661,-0.555711,0.684757,0.777851,0.513276,0.615875,0.749077,...,0.739486,0.548385,0.941081,3.695553,3.309828,1.087479,3.217878,2.141876,1.884674,-0.022636
1,-0.637106,-0.91379,-0.488456,-0.363661,-0.555711,0.684757,0.777851,0.513276,0.615875,0.749077,...,0.739486,0.548385,0.941081,3.695553,3.309828,1.087479,3.217878,2.141876,1.884674,-0.022636
2,1.101332,-0.87182,4.37729,-0.42603,-0.657505,1.069519,1.054485,1.342929,1.045985,0.972523,...,1.251663,1.331399,1.399686,-0.34727,-0.013138,-1.220722,-0.072482,0.495821,3.018306,9.01033
3,-0.708398,2.527707,-0.584999,-0.242877,-0.274721,0.998409,0.964513,0.939314,1.014878,1.020211,...,0.708128,1.0368,0.992513,1.947225,1.844059,0.177579,2.455805,1.951695,-0.348399,-0.40907
4,-0.305322,0.303325,-0.449839,-0.165231,-0.830814,1.043744,1.06977,1.029762,1.007827,1.030781,...,-0.870217,-0.451701,0.071016,1.282326,0.361633,-0.357656,2.097183,1.368034,0.443458,-0.466019


## Support Vector Machine

In [59]:
X = co2_scaled
y = co2['CO2 (mg C m¯² d¯¹)']

In [60]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
from sklearn import svm

regr = svm.SVR()
regr.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [65]:
print('train score:', regr.score(x_train, y_train))
print('test  score:', regr.score(x_test, y_test))

train score: -0.069229035617141
test  score: 0.006544677233467189


### linear kernel

In [67]:
regr2 = svm.SVR(kernel='linear')
regr2.fit(x_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [68]:
print('train score:', regr2.score(x_train, y_train))
print('test  score:', regr2.score(x_test, y_test))

train score: 0.12034293778309113
test  score: -0.04167438641993537


In [69]:
pd.DataFrame(columns=x_train.columns, data=regr2.coef_).T.sort_values(by=0)

Unnamed: 0,0
Age,-33.784747
npp_spring_avg_lc,-18.056804
Volume_km3,-15.942009
npp_summer_avg_lc,-15.745941
Areakm2_div_Volkm3,-12.205813
NDVI_spring_avg_lc,-3.844591
erosion,-1.091901
kWh,1.045439
NDVI_summer_avg_lc,1.152633
npp_annual_avg,1.522614


### scaling target variable:

In [75]:
y_train_scaled = scaler.fit_transform(np.array(y_train).reshape(-1,1))
y_test_scaled = scaler.fit_transform(np.array(y_test).reshape(-1,1))

In [133]:
regr_rbf_scaled = svm.SVR(kernel='rbf', C=20)
regr_rbf_scaled.fit(x_train, y_train_scaled)

  y = column_or_1d(y, warn=True)


SVR(C=20, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [134]:
print('train score:', regr_rbf_scaled.score(x_train, y_train_scaled))
print('test  score:', regr_rbf_scaled.score(x_test, y_test_scaled))

train score: 0.9035324038615531
test  score: 0.31581936686938217


### linear SVM & scaled target variable

In [121]:
regr4 = svm.SVR(kernel='linear')
regr4.fit(x_train, y_train_scaled)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [122]:
print('train score:', regr4.score(x_train, y_train_scaled))
print('test  score:', regr4.score(x_test, y_test_scaled))

train score: 0.36195698718903047
test  score: 0.15078547562485756


In [None]:
#linear kernel reduces accuracy, as expected
#scaling target variable improved r2...lets now test out different kernels and params

### ploynomial kernel

In [143]:
regr_poly = svm.SVR(kernel='poly', C=12, degree=3)
regr_poly.fit(x_train, y_train_scaled)

  y = column_or_1d(y, warn=True)


SVR(C=12, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [144]:
print('train score:', regr_poly.score(x_train, y_train_scaled))
print('test  score:', regr_poly.score(x_test, y_test_scaled))

train score: 0.8640441171460039
test  score: 0.305617410571498


In [None]:
# tuning C increases the accuracy, likely because of significant overlap.
# C in the range of 10-20 seems to be most accurate without overfitting

### sigmoid kernel

In [146]:
regr_sigmoid = svm.SVR(kernel='sigmoid')
regr_sigmoid.fit(x_train, y_train_scaled)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='sigmoid', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [147]:
print('train score:', regr_sigmoid.score(x_train, y_train_scaled))
print('test  score:', regr_sigmoid.score(x_test, y_test_scaled))

train score: -1.143397687281352
test  score: -2.086415757309359


In [None]:
#uhh wut