### Gaussian Kernel Regression


In [1]:
# Load dependencies.
import pandas as pd
pd.set_option('display.max_columns', None)  # show all columns of a table instead of a truncated view
pd.options.mode.chained_assignment = None  # default='warn'; turns off pandas' chained-assignment (SettingWithCopy) warning
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import numpy as np
import math
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
from envirocar import correction as correct
from envirocar import inspection as inspect
from envirocar import manipulation as manipulate
from envirocar import GKR


# Build the API client step by step: an initial (optional) configuration
# feeds the download client, which in turn backs the track API.
config = ECConfig()
download_client = DownloadClient(config=config)
track_api = TrackAPI(api_client=download_client)


In [3]:
# Corners of the query area, ordered as [min_x, min_y, max_x, max_y].
bbox_corners = [
    7.554130554199218,  # min_x
    51.95590322041212,  # min_y
    7.590351104736328,  # max_x
    51.97874790276371,  # max_y
]
bbox = BboxSelector(bbox_corners)

# Issue the query for tracks inside the bounding box (num_results=2).
track_df = track_api.get_tracks(bbox=bbox, num_results=2)

Unnamed: 0,id,time,geometry,Engine Load.value,Engine Load.unit,Calculated MAF.value,Calculated MAF.unit,Speed.value,Speed.unit,CO2.value,CO2.unit,Intake Pressure.value,Intake Pressure.unit,Rpm.value,Rpm.unit,Intake Temperature.value,Intake Temperature.unit,Consumption (GPS-based).value,Consumption (GPS-based).unit,GPS Altitude.value,GPS Altitude.unit,Throttle Position.value,Throttle Position.unit,GPS Bearing.value,GPS Bearing.unit,Consumption.value,Consumption.unit,GPS Accuracy.value,GPS Accuracy.unit,CO2 Emission (GPS-based).value,CO2 Emission (GPS-based).unit,GPS Speed.value,GPS Speed.unit,track.id,track.length,track.begin,track.end,sensor.type,sensor.engineDisplacement,sensor.model,sensor.id,sensor.fuelType,sensor.constructionYear,sensor.manufacturer
0,5f0ef89c00375c5a2641ef86,2020-07-15T12:37:03+00:00,POINT (7.57939 51.96766),30.459892,%,3.113889,g/s,15.97893,km/h,2.40547,kg/h,29.667201,kPa,748.952252,u/min,26.0,c,0.936199,l/h,115.671012,m,13.0,%,136.590329,deg,1.023604,l/h,6.0,%,2.200068,kg/h,17.50266,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
1,5f0ef89c00375c5a2641ef88,2020-07-15T12:37:09+00:00,POINT (7.57955 51.96757),49.230105,%,9.778811,g/s,11.134565,km/h,7.554102,kg/h,47.232322,kPa,1475.604745,u/min,25.652838,c,0.937457,l/h,115.280639,m,16.642229,%,134.479803,deg,3.214511,l/h,6.0,%,2.203025,kg/h,10.324164,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
2,5f0ef89c00375c5a2641ef89,2020-07-15T12:37:14+00:00,POINT (7.57988 51.96740),78.649652,%,25.066406,g/s,33.97633,km/h,19.363722,kg/h,74.040426,kPa,2397.395931,u/min,23.728013,c,5.102906,l/h,114.613231,m,23.862069,%,123.313954,deg,8.239881,l/h,6.193485,%,11.99183,kg/h,30.967132,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
3,5f0ef89c00375c5a2641ef8a,2020-07-15T12:37:19+00:00,POINT (7.58049 51.96715),31.2004,%,7.419664,g/s,39.0,km/h,5.731668,kg/h,30.075758,kPa,1748.565672,u/min,24.0,c,2.280491,l/h,113.835218,m,15.351261,%,125.03578,deg,2.439007,l/h,6.0,%,5.359154,kg/h,36.898346,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
4,5f0ef89c00375c5a2641ef8b,2020-07-15T12:37:24+00:00,POINT (7.58107 51.96682),29.735773,%,3.106679,g/s,34.321667,km/h,2.3999,kg/h,29.0,kPa,761.854074,u/min,25.0,c,0.946257,l/h,113.502384,m,13.0,%,133.482068,deg,1.021234,l/h,7.489919,%,2.223704,kg/h,34.447545,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
5,5f0ef89c00375c5a2641ef8c,2020-07-15T12:37:29+00:00,POINT (7.58154 51.96654),62.537844,%,9.954607,g/s,30.291912,km/h,7.689903,kg/h,52.25488,kPa,1359.329813,u/min,26.0,c,0.906864,l/h,113.527143,m,21.752623,%,135.315739,deg,3.272299,l/h,6.0,%,2.131129,kg/h,29.708616,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
6,5f0ef89c00375c5a2641ef8d,2020-07-15T12:37:34+00:00,POINT (7.58205 51.96621),35.909973,%,8.377679,g/s,40.000001,km/h,6.471731,kg/h,35.3082,kPa,1681.753849,u/min,24.0,c,4.515173,l/h,113.791613,m,16.25261,%,136.94889,deg,2.753928,l/h,6.0,%,10.610658,kg/h,38.087859,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
7,5f0ef89c00375c5a2641ef8e,2020-07-15T12:37:39+00:00,POINT (7.58258 51.96582),28.092692,%,3.479236,g/s,42.127695,km/h,2.687699,kg/h,28.612699,kPa,861.865287,u/min,24.0,c,2.548193,l/h,113.882165,m,13.479936,%,141.448571,deg,1.143702,l/h,6.0,%,5.988254,kg/h,42.070909,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
8,5f0ef89c00375c5a2641ef8f,2020-07-15T12:37:44+00:00,POINT (7.58307 51.96541),31.254127,%,3.914605,g/s,37.932298,km/h,3.02402,kg/h,31.334737,kPa,891.434491,u/min,26.0,c,0.896855,l/h,114.05582,m,13.702211,%,144.598237,deg,1.286817,l/h,6.0,%,2.107609,kg/h,38.032289,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz
9,5f0ef89c00375c5a2641ef90,2020-07-15T12:37:49+00:00,POINT (7.58373 51.96520),39.30876,%,8.327728,g/s,44.0,km/h,6.433144,kg/h,38.888603,kPa,1517.813778,u/min,24.0,c,4.167594,l/h,114.421844,m,14.572755,%,88.174089,deg,2.737508,l/h,8.0,%,9.793846,kg/h,37.130406,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz


In [None]:
# Drop duplicates from the downloaded tracks (going by the helper's name).
# NOTE(review): the return value is discarded, so this presumably modifies
# track_df in place — confirm against the helper.
correct.drop_dublicates(track_df)

In [None]:
# Keep only the numerical variables of the track table for the inspections below.
track_df_numeric = manipulate.get_numerical(track_df)

Inspect the missing values to choose a variable that has many missing values.
We will try to impute CO2 Emission (GPS-based).value.

In [None]:
# Missing values per variable of the numerical table; the result is displayed below.
# NOTE(review): the effect of dropCol=True is not visible from this notebook — confirm against the helper.
# Alternative helper, kept for reference: inspect.sum_missing_values(track_df_numeric)
missingValues=inspect.missing_values_per_variable(track_df_numeric, dropCol=True)
missingValues

Just to get an impression, choose the variable which has the strongest relationship (Spearman correlation) with CO2 Emission (GPS-based).value.
Here it seems to be Speed.value.
So we will try to impute CO2 Emission (GPS-based).value based on Speed.value.

In [None]:
# Spearman correlations between the numerical variables. Besides the full
# table, the helper returns four subsets which, going by their names, are
# presumably grouped by correlation strength — confirm against the helper.
allCoeffs, very_strong, strong, moderate, weak = inspect.get_classified_correlations(track_df_numeric, 'spearman')

# Show the coefficients of the variable we actually want to impute
# (CO2 Emission (GPS-based).value), so a predictor can be picked for it.
allCoeffs.loc[allCoeffs['column'] == 'CO2 Emission (GPS-based).value']

In [None]:
# Get an impression of the two variables we want to relate: Speed.value as the
# predictor and CO2 Emission (GPS-based).value as the target (track.id is kept
# alongside). Work on an explicit copy so the outlier handling never writes
# into a selection of track_df.
relation = track_df[["track.id","Speed.value", "CO2 Emission (GPS-based).value"]].copy()
# NOTE(review): the return value is discarded, so this presumably modifies
# `relation` in place and sets flagged outliers to NaN — confirm against the helper.
correct.flag_outlier_in_sample(relation, dropOutlierColumn=True, setOutlierToNan=True, dropFlag=True)
relation

In [None]:
# Scatter plot of the target (CO2 emission) against the predictor (speed).
fig = px.scatter(
    data_frame=relation,
    x="Speed.value",
    y="CO2 Emission (GPS-based).value",
)
fig.show()

In [None]:
# Remove every row that contains a NaN and renumber the remaining rows from 0.
relation2 = relation.dropna().reset_index(drop=True)
relation2

In [None]:
# Plot a linear regression for the cleaned sample, passing speed first and CO2 emission second.
# NOTE(review): the (x, y) meaning of the two arguments is assumed from the call — confirm.
inspect.plot_linear_regression(relation2["Speed.value"], relation2["CO2 Emission (GPS-based).value"])

In [None]:
# Set up the Gaussian kernel regression with Speed.value as input and
# CO2 Emission (GPS-based).value as target, then visualize its kernels.
# NOTE(review): the third argument (10) is presumably the kernel bandwidth and
# the 100 passed to visualize_kernels presumably a plot resolution — confirm against GKR.
gaussianKernelRegression=GKR(relation2['Speed.value'],relation2['CO2 Emission (GPS-based).value'], 10)
gaussianKernelRegression.visualize_kernels(100)

In [None]:
# Predict a single value for one given speed.
# The input matches the Speed.value of the first measurement in the track
# table shown above (15.97893 km/h at the displayed precision).
gaussianKernelRegression.predict(15.978929817676544)

In [None]:
# Predict a value for every observed speed and collect the
# (speed, prediction) pairs as the rows of a data frame.
speeds = relation2['Speed.value']
predictedDF = pd.DataFrame([
    {
        'Speed.value': speed,
        'predicted CO2 Emission (GPS-based).value': gaussianKernelRegression.predict(speed),
    }
    for speed in (speeds.at[row] for row in speeds.index)
])

In [None]:
predictedDF

In [None]:
# Root-mean-square error between the observed and the predicted values,
# both arranged as column vectors.
y = np.asarray(relation2["CO2 Emission (GPS-based).value"]).reshape(-1, 1)
y_predicted = np.asarray(predictedDF["predicted CO2 Emission (GPS-based).value"]).reshape(-1, 1)

mse = mean_squared_error(y, y_predicted)
rmse_n = math.sqrt(mse)
rmse_n

As this is an error (RMSE) of about 3.5 kg/h, it turns out that Speed.value in this form may not be a good single predictor of the GPS-based CO2 emission. Note that the RMSE was computed on the same data the model was fitted to. However, this was a rough analysis without further cleaning or transforming of the data, which may affect the result.