### Gaussian Kernel Regression


In [None]:
# load dependencies
import pandas as pd
pd.set_option('display.max_columns', None)# pandas show all columns of table instead of restricted#
pd.options.mode.chained_assignment = None  # default='warn' surpresses warnings at spatial distribution
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import numpy as np
import math
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
from envirocar import correction as correct
from envirocar import inspection as inspect
from envirocar import manipulation as manipulate
from envirocar import GKR


# Build the API client step by step: an initial (optional) configuration feeds the
# download client, which in turn backs the TrackAPI used for the track query below.
config = ECConfig()
download_client = DownloadClient(config=config)
track_api = TrackAPI(api_client=download_client)



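# Gaussian Kernel Regression -- commented-out sketch of a GKR class, kept for reference only.
# The notebook itself uses the GKR class imported from envirocar above
# (NOTE(review): confirm that the imported class matches this sketch).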
# class GKR:
    
#     def __init__(self, x, y, b):
#         self.x = x
#         self.y = y
#         self.b = b
    
#     '''Implement the Gaussian Kernel'''
#     def gaussian_kernel(self, z):
#         return (1/math.sqrt(2*math.pi))*math.exp(-0.5*z**2)
    
#     '''Calculate weights and return prediction'''
#     def predict(self, X):
#         kernels = [self.gaussian_kernel((xi-X)/self.b) for xi in self.x]
#         weights = [len(self.x) * (kernel/np.sum(kernels)) for kernel in kernels]
#         return np.dot(weights, self.y)/len(self.x)
 
    
#     def visualize_kernels(self, precision):
#         plt.figure(figsize = (10,5))
#         for xi in self.x:
#             x_normal = np.linspace(xi - 3*self.b, xi + 3*self.b, precision)
#             y_normal = stats.norm.pdf(x_normal, xi, self.b)
#             plt.plot(x_normal, y_normal)#, label='Kernel at xi=' + str(xi))
            
#         plt.ylabel('Kernel Weights wi')
#         plt.xlabel('x')
#         #plt.legend()
    
#     def visualize_predictions(self, precision, X):
#         plt.figure(figsize = (10,5))
#         max_y = 0
#         for xi in self.x:
#             x_normal = np.linspace(xi - 3*self.b, xi + 3*self.b, precision)
#             y_normal = stats.norm.pdf(x_normal, xi, self.b)
#             max_y = max(max(y_normal), max_y)
#             plt.plot(x_normal, y_normal, label='Kernel at xi=' + str(xi))
            
#         plt.plot([X,X], [0, max_y], 'k-', lw=1,dashes=[2, 2])
#         plt.ylabel('Kernel Weights wi')
#         plt.xlabel('x')
#         #plt.legend()


In [None]:
# Bounding box of the area of interest, given as [min_x, min_y, max_x, max_y].
bbox = BboxSelector([
    7.554130554199218, 51.95590322041212,  # min_x, min_y
    7.590351104736328, 51.97874790276371,  # max_x, max_y
])

# Query the tracks inside the bounding box; num_results=20 is passed to the API
# (presumably an upper bound on the number of tracks returned -- TODO confirm).
track_df = track_api.get_tracks(bbox=bbox, num_results=20)

In [None]:
# Presumably removes duplicated rows from the downloaded tracks (helper from envirocar.correction).
# NOTE(review): the return value is only displayed as cell output and never assigned, so the
# following cells rely on the helper modifying track_df in place -- confirm against the helper.
correct.drop_dublicates(track_df)

In [None]:
# Keep only the numerical variables of the tracks; the missing-value and correlation
# inspections in the following cells operate on this frame.
track_df_numeric = manipulate.get_numerical(track_df)

Inspect the missing values in order to choose a variable that has many of them.
We will try to impute `CO2 Emission (GPS-based).value`.

In [None]:
# Disabled alternative that used a different inspection helper:
#missingValues=inspect.sum_missing_values(track_df_numeric)
# Missing values per variable of the numeric frame.
# NOTE(review): the exact effect of dropCol=True is defined in envirocar.inspection -- confirm there.
missingValues=inspect.missing_values_per_variable(track_df_numeric, dropCol=True)
# Display the result as the cell output.
missingValues

Just to get an impression, choose the variable which has the strongest monotonic relationship (Spearman correlation) with `CO2 Emission (GPS-based).value`.
Here it seems to be `Speed.value`.
So we will try to impute `CO2 Emission (GPS-based).value` based on `Speed.value`.

In [None]:
# Correlation coefficients of the numeric variables using the 'spearman' method. The helper
# also returns four further results (very_strong, strong, moderate, weak) that are not used
# in the cells shown here.
allCoeffs, very_strong, strong, moderate, weak = inspect.get_classified_correlations(track_df_numeric, 'spearman')
# Show the coefficients of the variable we want to impute. This notebook models
# 'CO2 Emission (GPS-based).value' (see the cells that follow), so filter on that column
# rather than on 'Consumption (GPS-based).value'.
allCoeffs.loc[allCoeffs['column'] == 'CO2 Emission (GPS-based).value']

In [None]:
# Get an impression of the two variables we want to relate: Speed.value as the predictor
# and CO2 Emission (GPS-based).value as the variable to impute (track.id is kept alongside).
# .copy() makes `relation` an explicit, independent frame, so modifying it below no longer
# depends on pandas' chained-assignment warning being switched off globally.
relation = track_df[["track.id","Speed.value", "CO2 Emission (GPS-based).value"]].copy()
# NOTE(review): the return value is discarded, so this presumably flags outliers and sets
# them to NaN in place on `relation` -- confirm against envirocar.correction.
correct.flag_outlier_in_sample(relation, dropOutlierColumn=True, setOutlierToNan=True, dropFlag=True)
relation

In [None]:
# Scatter plot of the raw relationship: speed on the x-axis, CO2 emission on the y-axis.
fig = px.scatter(
    relation,
    x="Speed.value",
    y="CO2 Emission (GPS-based).value",
)
fig.show()

In [None]:
# Keep only complete rows (no NaN in any column) and renumber them 0..n-1,
# discarding the old index.
relation2 = relation.dropna().reset_index(drop=True)
relation2

In [None]:
# Plot a linear regression for the cleaned value pairs via the envirocar helper
# (first argument: speed, second argument: CO2 emission).
inspect.plot_linear_regression(
    relation2["Speed.value"],
    relation2["CO2 Emission (GPS-based).value"],
)

In [None]:
# Set up the Gaussian kernel regression with speed as input and CO2 emission as target.
# The third argument (10) is presumably the kernel bandwidth -- TODO confirm against envirocar.GKR.
gaussianKernelRegression = GKR(
    relation2['Speed.value'],
    relation2['CO2 Emission (GPS-based).value'],
    10,
)
# Plot the kernels; 100 is presumably the number of points drawn per kernel curve -- TODO confirm.
gaussianKernelRegression.visualize_kernels(100)

In [None]:
# Predict the CO2 emission for one single speed value, using the kernel regression
# object created in the previous cell.
gaussianKernelRegression.predict(15.978929817676544)

In [None]:
# Predict the CO2 emission for every observed speed value and collect one record per row;
# the DataFrame is built once from the complete list of records.
speed_series = relation2['Speed.value']
prediction_records = [
    {
        'Speed.value': speed_series.at[row_label],
        'predicted CO2 Emission (GPS-based).value': gaussianKernelRegression.predict(speed_series.at[row_label]),
    }
    for row_label in speed_series.index
]
predictedDF = pd.DataFrame(prediction_records)

In [None]:
# Show the speed values together with their predicted CO2 emissions.
predictedDF

In [None]:
# Observed and predicted CO2 emissions, each stacked into a 2-D column array via np.c_.
y = np.c_[relation2["CO2 Emission (GPS-based).value"]]
y_predicted = np.c_[predictedDF["predicted CO2 Emission (GPS-based).value"]]

# Root mean squared error = square root of the mean squared error.
# Note that the predictions were made for the same rows the regression was built from.
mse_n = mean_squared_error(y, y_predicted)
rmse_n = math.sqrt(mse_n)
rmse_n

As this is an RMSE of about 3.5, it turns out that `Speed.value` in this form may not be a good single predictor for the CO2 emission. However, this was a rough analysis without further cleaning or transformation of the data, which may affect the result.