In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read data/training_final_latcorr.csv (path relative to the working directory) into `df`.
df = pd.read_csv(filepath_or_buffer='data/training_final_latcorr.csv')

In [None]:
# Lift pandas' column-display cap so every column is rendered, then show the frame.
pd.options.display.max_columns = None
df

In [None]:
# Derive kWh as Area_km2 divided by the 'Area / Electricity' column. This has to
# happen first, because that column is among the ones removed below.
df['kWh'] = df['Area_km2'].div(df['Area / Electricity'])

# Drop the columns we're not using: a handful of individual columns plus the
# seasonal temp / NDVI / npp averages that carry no `_lc` suffix.
df = df.drop(columns=[
    'Longitude', 'CO2 (g/kWh)', 'CH4 (g/kWh)', 'Area / Electricity', 'Name', 'field_1',
    *(f'{variable}_{season}_avg'
      for variable in ('temp', 'NDVI', 'npp')
      for season in ('spring', 'summer', 'fall', 'winter')),
])

In [None]:
# Remove the `_lc`-suffixed seasonal averages as well (temp, NDVI and npp for
# each of the four seasons: 12 columns in total).
df = df.drop(columns=[
    f'{variable}_{season}_avg_lc'
    for variable in ('temp', 'NDVI', 'npp')
    for season in ('spring', 'summer', 'fall', 'winter')
])

In [None]:
# Give the CH4 and CO2 columns short names. rename() skips keys that are not
# present by default, so the keys must match the CSV headers exactly (note the
# non-ASCII superscript characters in the CO2 header).
df = df.rename(columns={
    'CH4 (mg C m-2 d-1)': 'CH4',
    'CO2 (mg C m¯² d¯¹)': 'CO2',
})

In [None]:
# CH4 table: every column except CO2, restricted to the rows that have a CH4 value.
ch4 = df.drop(columns='CO2').dropna(subset=['CH4'])

In [None]:
# (number of rows, number of columns) of the CH4 table.
ch4.shape

In [None]:
# Print the number of missing values in each column of `ch4`.
for column, n_missing in ch4.isna().sum().items():
    print(column, ':',  n_missing)

In [None]:
from sklearn.impute import KNNImputer

# Fill missing feature values with a k-nearest-neighbours imputer, with k set to
# the integer part of sqrt(number of rows). KNNImputer needs n_neighbors >= 1,
# so clamp the value for very small tables.
model_impute = KNNImputer(n_neighbors=max(1, int(np.sqrt(ch4.shape[0]))))

# Impute the predictors only. The target column 'CH4' is kept out of the
# neighbour search so that imputed feature values are not informed by the very
# quantity the model is later asked to predict (target leakage). 'CH4' itself
# is copied through unchanged.
feature_mask = ch4.columns != 'CH4'
ch4_imputed = ch4.to_numpy(dtype=float, copy=True)  # same shape / column order as ch4
ch4_imputed[:, feature_mask] = model_impute.fit_transform(ch4.loc[:, feature_mask])
# NOTE(review): the imputer is still fitted on all rows, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# imputation of training rows. Confirm whether that is acceptable here.

In [None]:
# Re-wrap the imputed values as a DataFrame carrying ch4's column labels.
# No index is passed, so the new frame gets a fresh default index.
ch4_imputed = pd.DataFrame(ch4_imputed, columns=ch4.columns)

In [None]:
# Print the number of missing values in each column of `ch4_imputed`.
for column, n_missing in ch4_imputed.isna().sum().items():
    print(column, ':',  n_missing)

## Random forest (RF) regression for CH4

In [None]:
# Separate the CH4 target from the predictors (every other column).
Y = ch4_imputed['CH4']
X = ch4_imputed.drop(columns='CH4')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn import ensemble

# Baseline forest: 100 trees, max_features=5, fixed seed.
randomForest = ensemble.RandomForestRegressor(
    n_estimators=100,
    max_features=5,
    random_state=42,
)
randomForest.fit(x_train, y_train)

# Cell output: score of the fitted forest on the training data itself.
randomForest.score(x_train, y_train)

In [None]:
# Report 1 - score() on each split. For scikit-learn regressors score() is the
# R^2 coefficient, so the printed "error" values are 1 - R^2.
print(f"The training error is: {1 - randomForest.score(x_train, y_train):.5f}")
print(f"The test     error is: {1 - randomForest.score(x_test, y_test):.5f}")

In [None]:
# Horizontal bar chart of the 15 most important features, as a percentage of
# the forest's total importance.
#
# The rows are sorted ascending, so the most important features are the *last*
# rows: tail(15) selects the top 15 (head(15) would pick the 15 least
# important). barh draws rows bottom-to-top, so the largest bar ends up on top.
(
    pd.DataFrame(
        index=x_train.columns,
        data=randomForest.feature_importances_ * 100,
        columns=['Importance'],
    )
    .sort_values(by='Importance', ascending=True)
    .tail(15)
    .plot(kind='barh')
)


In [None]:
from sklearn.model_selection import GridSearchCV

grid_values = {
    "n_estimators": range(200,500,50),
    "max_depth" : range(7,16),
    "max_features" : range(7,16)
}

grid_search_forest = GridSearchCV(randomForest, grid_values, scoring='r2', cv=2, n_jobs=-1)
%time grid_search_forest.fit(x_train, y_train)

In [None]:
# Parameter combination from `grid_values` that the grid search selected as best.
grid_search_forest.best_params_

In [None]:
# Score of that best combination under the search's scoring ('r2').
grid_search_forest.best_score_