In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, LeaveOneOut, LeavePOut, RepeatedKFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler # robust_scale
from sklearn.svm import SVR

In [5]:
raw = pd.read_csv('data/training_final_latcorr.csv')

In [6]:
co2_raw_cols = [
    'CO2 (mg C m¯² d¯¹)',
    'Age', 
    'org_c', 
    'temp_annual_avg',
    'temp_diff_summer_winter_lc', 
    'NDVI_annual_avg', 
    'npp_annual_avg', 
    'erosion',
    'precip'
]

clean = raw[co2_raw_cols].copy()

print("raw.shape   =", raw.shape)
print("clean.shape =", clean.shape)

raw.shape   = (154, 44)
clean.shape = (154, 9)


In [7]:
clean['log_co2_emissions'] = np.log(clean['CO2 (mg C m¯² d¯¹)'])


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
reduced = clean.dropna()
reduced.shape

(120, 10)

In [16]:
reduced.describe()


Unnamed: 0,CO2 (mg C m¯² d¯¹),Age,org_c,temp_annual_avg,temp_diff_summer_winter_lc,NDVI_annual_avg,npp_annual_avg,erosion,precip,log_co2_emissions
count,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0,120.0
mean,617.94,34.691667,71.589661,10.094683,16.87459,177.953472,97.432639,4.288679,636.758333,5.922503
std,626.375194,25.116132,41.278424,11.020708,12.1326,51.338768,28.637932,10.112684,290.828229,1.136253
min,12.3,1.0,6.547675,-5.253,-0.937332,93.416664,50.666668,0.000198,258.0,2.509599
25%,204.7,14.75,41.144838,1.176167,3.863999,136.75,77.916664,0.02974,391.0,5.321043
50%,383.85,28.5,59.661419,4.404,20.368001,169.5,93.333336,0.124527,585.0,5.950252
75%,836.175,46.25,90.360537,22.435501,29.009666,229.291668,115.333334,1.121719,798.0,6.728815
max,3800.0,95.0,177.533341,26.893333,33.606668,255.0,200.5,45.775703,1510.0,8.242756


In [17]:
#target = 'CO2 (mg C m¯² d¯¹)'
target = 'log_co2_emissions'

features = ['Age', 'org_c', 'temp_annual_avg', 'temp_diff_summer_winter_lc', 
            'NDVI_annual_avg', 'npp_annual_avg', 'erosion', 'precip']

X = reduced[features]
y = reduced[target]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [19]:
print("X_train.shape =", X_train.shape)
print("X_test.shape  =", X_test.shape)
print("y_train.shape =", y_train.shape)
print("y_test.shape  =", y_test.shape)

X_train.shape = (90, 8)
X_test.shape  = (30, 8)
y_train.shape = (90,)
y_test.shape  = (30,)


In [9]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knr', KNeighborsRegressor())
])

pipeline.fit(X_train, y_train)

print("Train score =", pipeline.score(X_train, y_train))
print("Test score  =", pipeline.score(X_test, y_test))

Train score = 0.6089452371310761
Test score  = 0.49710373213557973


In [11]:
parameters = {
    'knr__n_neighbors': range(1, round(len(y_train) / 2)),
    'knr__weights': ['uniform', 'distance']
}

my_cv = RepeatedKFold(n_splits=2, n_repeats=10, random_state=0)

best_co2_model = GridSearchCV(pipeline, parameters, cv=my_cv, n_jobs=-1, scoring='r2') 

best_co2_model.fit(X_train, y_train)

print("Best score: %0.3f" % best_co2_model.best_score_)
best_parameters = best_co2_model.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

best_co2_model.best_estimator_.score(X_test, y_test)

Best score: 0.245
	knr__n_neighbors: 5
	knr__weights: 'distance'


0.6083936912199917

## Use this model to predict CO2 emissions in USA dataset

In [56]:
present = pd.read_csv('../data_predict/merged_PRESENT.csv')
best_case = pd.read_csv('../data_predict/merged_2100ssp126.csv')
worst_case = pd.read_csv('../data_predict/merged_2100ssp585.csv')

In [57]:
for column in present[features]:
    print(column, ':',  present[features][column].isna().sum())

Age : 0
org_c : 0
temp_annual_avg : 0
temp_diff_summer_winter_lc : 0
NDVI_annual_avg : 0
npp_annual_avg : 33
erosion : 0
precip : 0


In [58]:
present['npp_annual_avg'] = present['npp_annual_avg'].fillna(present['npp_annual_avg'].mean())
best_case['npp_annual_avg'] = best_case['npp_annual_avg'].fillna(best_case['npp_annual_avg'].mean())
worst_case['npp_annual_avg'] = worst_case['npp_annual_avg'].fillna(worst_case['npp_annual_avg'].mean())

In [59]:
present['co2_emissions_logscale'] = best_co2_model.predict(present[features])
best_case['co2_emissions_logscale'] = best_co2_model.predict(best_case[features])
worst_case['co2_emissions_logscale'] = best_co2_model.predict(worst_case[features])

In [60]:
present['co2_emissions'] = 10**present['co2_emissions_logscale']
best_case['co2_emissions'] = 10**best_case['co2_emissions_logscale']
worst_case['co2_emissions'] = 10**worst_case['co2_emissions_logscale']

In [66]:
present.to_csv('../data_predict/merged_PRESENT_co2predicted.csv')
best_case.to_csv('../data_predict/merged_2100ssp126_co2predicted.csv')
worst_case.to_csv('../data_predict/merged_2100ssp585_co2predicted.csv')

In [70]:
present['co2_emissions'].median()

148721.69675618425

In [71]:
best_case['co2_emissions'].median()

168073.49382113252

In [72]:
worst_case['co2_emissions'].median()

201921.0311567986