In [1]:
# load dependencies
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn' surpresses warnings at spatial distribution
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import numpy as np
import datetime
import plotly.express as px
import plotly.graph_objects as go

from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig

# Show every column when a DataFrame is displayed instead of truncating wide tables
pd.set_option('display.max_columns', None)

# Build the enviroCar client: an ECConfig created without arguments is handed to the
# DownloadClient, which TrackAPI uses as its api_client for the track query further down
config = ECConfig()
track_api = TrackAPI(api_client=DownloadClient(config=config))



# ------------------------ Functions --------------------------



def show_units(df):
    '''
        Aim: 
            get an overview of the variables and corresponding units
        
        Keyword Arguments: 
            df {Geodataframe} -- point input
        
        Output: one printed line per column whose name contains '.unit':
                the column name followed by the first non-missing unit found
                in that column (None if the column holds no unit at all)
    '''
    for unit_column in df.filter(like='.unit').columns:
        # The first row is not guaranteed to carry a unit (it can be NaN, and an
        # empty frame has no first row), so take the first non-missing entry
        known_units = df[unit_column].dropna()
        unit = known_units.iloc[0] if not known_units.empty else None
        print(unit_column, unit)
            


            
        
def plot_normality_with_qqplot(point_df, column):
    '''
        Aim: 
            create q-q plot to inspect normality of distribution of selected variable
        
        Keyword Arguments: 
            point_df {Geodataframe} -- points input
            column {str} -- variable name
        
        Output: Q-Q plot (shown, nothing is returned)
    '''
    # probplot derives the theoretical quantiles from the sample size, so missing
    # values must be removed first; otherwise they count as observations and shift
    # every plotted point
    observed = point_df[column].dropna()
    stats.probplot(observed, dist="norm", plot=plt, fit=False)
    plt.title(column)
    plt.show()
    
    
    
    
def plot_tracks(points_df, column):
    """ 
    Aim: 
        Visualize a phenomenon of the tracks as time series in a line chart, in which each line represents one single track
    
    Keyword Arguments: 
        points_df {Geodataframe} -- point input; the columns 'time', 'track.id' and `column` are read
        column {str} -- name of the column plotted on the y-axis, e.g. 'Speed.value'
        
    Returns:
        No return, the chart is shown. The input frame itself is not modified.
    
    """
    # Derive the helper columns on a new frame so they do not leak into the caller's data.
    # The x-axis is the row index of the frame; the timestamp is only shown on hover.
    plot_df = points_df.assign(
        datetime=pd.to_datetime(points_df['time']),
        index=points_df.index,
    )
    # Plot the frame that was passed in, not a notebook-level global
    fig = px.line(plot_df, x="index", y=column, color="track.id",
                  line_group="track.id", hover_name="datetime")
    fig.update_traces(mode='lines+markers')
    fig.show()


    

    
def plot_distribution_s(points_df, column, column_gps = None):
    """ 
    Aim:
        Draw the kernel density estimate of one measurement and, optionally, of a second
        measurement in the same figure so that both shapes can be compared visually
    
    Keyword Arguments: 
        points_df {GeoDataFrame} -- the GeoDataFrame containing the measurements
        column {str} -- the column name of measurement of interest, e.g. 'Speed.value'
        column_gps {str} -- optional column name of the same phenomenon measured based on GPS, e.g. 'GPS speed.value'
        
    Return:
        No Return, the density curves are drawn with seaborn
    """
    # The first curve is always drawn; the GPS-based one only when a column was given
    columns_to_plot = [column] if column_gps is None else [column, column_gps]
    for name in columns_to_plot:
        sns.kdeplot(points_df[name], shade=True)
    

    
    
    

def plot_point_values(points, value = None):
    """ This function is based on a function from the envirocar fork of the github user 'annaformaniuk'.

    Aim:
        show points on an OpenStreetMap background; 'CO2.value' is listed on hover
        and the 'id' column is used as hover title

    Keyword Arguments:
        points {GeoDataFrame} -- points input; the columns 'lat' and 'lng' are
                                 added to this frame from its 'geometry' column
        value {string} -- optional column used for colouring the points

    Returns:
        No Return, the map is shown
    """
    # Plotly is given plain coordinate columns, so derive them from the geometry
    geometries = points['geometry']
    points['lat'] = geometries.apply(lambda geom: geom.y)
    points['lng'] = geometries.apply(lambda geom: geom.x)

    # Options shared by the coloured and the uncoloured map
    map_options = dict(lat="lat", lon="lng", hover_data=["CO2.value"],
                       zoom=8, hover_name="id")
    if value is None:
        map_options["title"] = " Spatial distribution or requested tracks"
    else:
        # Colour the points by the selected variable
        map_options.update(color=value,
                           color_continuous_scale=px.colors.sequential.Reds,
                           title=value + " visualisation")

    fig = px.scatter_mapbox(points, **map_options)
    fig.update_layout(mapbox_style="open-street-map",
                      margin={"r": 5, "t": 50, "l": 10, "b": 5})
    fig.show()


def plot_linear_regression(df, variableName1, variableName2):
    '''
        Aim:
            draw two variables against each other with seaborn's regplot
        
        Keyword Arguments:
            df {Geodataframe} -- point input
            variableName1 {str} -- column name used for the x-axis
            variableName2 {str} -- column name used for the y-axis
        
        Output: regression plot, nothing is returned
    '''
    axes_mapping = {'x': variableName1, 'y': variableName2}
    sns.regplot(data=df, **axes_mapping)
    



    
# Percentages correction
def percentages_correction(df):
    '''
        Aim: Set faulty percentages (percentages below 0 and above 100) to nan
        
        Input: Geodataframe. A '<name>.value' column is treated as a percentage
               when the matching '<name>.unit' column contains "%". The frame is
               modified in place.
        
        Output: The same dataframe with corrected percentages. For every checked
                column one line is printed stating whether values were corrected.
    '''
    unit_suffix = '.unit'
    for unit_column in df.filter(like=unit_suffix).columns:
        if not unit_column.endswith(unit_suffix):
            continue
        # Inspect the whole unit column: the first row may have no unit at all
        if not (df[unit_column] == "%").any():
            continue
        # Strip only the trailing '.unit' so names containing dots stay intact
        name = unit_column[:-len(unit_suffix)] + '.value'
        if name not in df.columns:
            continue

        values = df[name]
        faulty = (values < 0) | (values > 100)
        corrected = int(faulty.sum())
        if corrected:
            # Replace the column as a whole instead of using chained assignment
            # (df[name][mask] = ...), which has no effect under pandas copy-on-write;
            # mask() also upcasts integer columns so they can hold nan
            df[name] = values.mask(faulty)
            print('Percentages corrected : ', name, ' Count corrected:', corrected)
        else:
            print('Percentages ok : ', name)
    return df
    




### Query tracks 

In [2]:
# Bounding box that restricts the query; the corner coordinates are passed
# in the order [min_x, min_y, max_x, max_y]
bbox = BboxSelector([
    7.554130554199218, # min_x
    51.95590322041212, # min_y
    7.590351104736328, # max_x
    51.97874790276371  # max_y
])


# Request tracks inside the bounding box; the result (track_df) is the point table
# used by all following cells.
# NOTE(review): num_results=25 presumably caps the number of returned tracks -- confirm
# against the envirocar TrackAPI
track_df = track_api.get_tracks(bbox=bbox, num_results=25) 

In [None]:
# Remove outliers with the interquartile-range (IQR) rule
listNonNegative = [
    'Speed.value',
    'CO2.value',
    'Rpm.value',
    'Consumption (GPS-based).value',
    'Consumption.value',
    'CO2 Emission (GPS-based).value',
]
df_NonNegatives = track_df[listNonNegative]
print(df_NonNegatives.shape)

# Quartiles per column and the resulting 1.5 * IQR fences
Q1, Q3 = (df_NonNegatives.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Keep only the rows in which no column lies outside its fences. The filtered frame
# is just displayed (last expression of the cell); track_df itself is not changed.
outside_fences = (df_NonNegatives < lower_fence) | (df_NonNegatives > upper_fence)
df_NonNegatives[~outside_fences.any(axis=1)]

In [None]:
# Scratch cell: rebuilds the same column subset and the same Q1/Q3/IQR values as the
# outlier cell above. The z-score experiments are kept below as disabled code.
#track_df.columns
listNonNegative=['Speed.value', 'CO2.value', 'Rpm.value', 'Consumption (GPS-based).value',
                     'Consumption.value','CO2 Emission (GPS-based).value']
#cols = list(track_df.columns)
df_NonNegatives = track_df[listNonNegative]
# NOTE(review): bare expression in the middle of the cell -- it is evaluated and
# discarded; wrap it in print() if the shape should be shown
df_NonNegatives.shape
#ZScore_Speed = stats.zscore(a,nan_policy='omit')

# Disabled alternative: flag values whose absolute z-score exceeds a threshold of 3
#z = np.abs(stats.zscore(df_NonNegatives, nan_policy='omit' ))
#print(z)
#threshold = 3
#print(np.where(z > 3))
#print(z[57][3])

# First and third quartile of every selected column and their difference
Q1 = df_NonNegatives.quantile(0.25)
Q3 = df_NonNegatives.quantile(0.75)
IQR = Q3 - Q1
#print(IQR)

#print(df_NonNegatives < (Q1 - 1.5 * IQR)) |(df_NonNegatives > (Q3 + 1.5 * IQR))



# Disabled alternative: z-score of the speed column stored as an extra column
#track_df['_speed_zscore'] = (track_df['Speed.value'] - track_df['Speed.value'].mean())/track_df['Speed.value'].std(ddof=0)
#track_df['_speed_zscore']

In [None]:
#sns.boxplot(x='CO2.value', data=track_df)
#z = np.abs(stats.zscore(track_df["CO2.value"]))


## Spatial distribution
Here you can view how the point data you just requested are spatially distributed. Zoom in, and hover over the points. This will give you the data point id, exact coordinates and the CO2 value measured at that point.

In [None]:
# No colouring column is passed, so the points are drawn uncoloured with 'CO2.value' on hover.
# Note: the call adds the columns 'lat' and 'lng' to track_df.
plot_point_values(track_df)

You can also add another variable to see how it is spatially distributed.
When you hover over the points, a box will appear which now also shows the added variable, letting you compare the CO2 emission values with the values of the added variable.
Before you apply this function, it is recommended to first remove outliers from the variables. <br>

In [None]:
# Colour the points by 'Consumption.value'; 'CO2.value' stays in the hover box
plot_point_values(track_df, 'Consumption.value')

## Inspect sample: attributes and corresponding values 
Here you can have a look at the data you just requested. Each row represents one point in time with its coordinates and a variety of measured phenomena, e.g. speed, CO2 emission, temperature, etc. In addition, each data point carries some information about the sensor.

In [None]:
# Bare expression: the notebook renders the frame because it is the last statement of the cell
track_df

### Get overview on units of variables

In [None]:
# Prints one line per '.unit' column of track_df: the column name and a unit taken from it
show_units(track_df)

## Inspect single datapoint from sample
Here we can inspect a single data point from the sample

In [None]:
# First row of the sample, selected by position
track_df.iloc[0]

## Inspect tracks as time series
You can see how the values change over the course of each track. When you hover over the graph, a menu is displayed at its top right which allows you to zoom in. If you would like to see just a single track, you can show and hide individual tracks by clicking on the relevant track ids in the legend.

In [None]:
# One line per 'track.id' with 'Speed.value' on the y-axis
plot_tracks(track_df, 'Speed.value')

## Descriptive statistics of sample
Here we can inspect the descriptive statistics, including mean, standard deviation, min, max and quartiles. The 50% quantile equals the median, i.e. 50% of the data lie below the median value and 50% above. 

In [None]:
# Summary statistics of track_df as computed by pandas' describe()
track_df.describe()

## Distribution

In [None]:
# Single density curve of 'CO2.value' (no GPS-based comparison column is passed)
plot_distribution_s(track_df,'CO2.value')

#### Normality of distribution
The Q-Q plot ("quantile-quantile plot") compares the theoretical quantiles expected under a normal distribution to the actual observed values (ordered). When the data are normally distributed, the points lie on a straight line. The more the points deviate from a straight line, the farther the distribution departs from normality.

In [None]:
# Q-Q plot of 'CO2.value' against a normal distribution
plot_normality_with_qqplot(track_df, 'CO2.value')

## Compare distributions of measurements of same phenomena
You can compare measurements from OBD with measurements based on GPS. 

In [None]:
# Overlay the density curves of 'CO2.value' and its GPS-based counterpart
plot_distribution_s(track_df, 'CO2.value', 'CO2 Emission (GPS-based).value' )

## Linear regression
Visualise the relationship between two variables with a simple linear regression line.

In [None]:
# 'CO2.value' on the x-axis, 'Speed.value' on the y-axis
plot_linear_regression(track_df, "CO2.value", "Speed.value" )