In [13]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
import plotly.express as px

# pandas show all columns of table instead of restricted
pd.set_option('display.max_columns', None)

# create an initial but optional config and an api client
config = ECConfig()
track_api = TrackAPI(api_client=DownloadClient(config=config))




# ------------------- data preprocessing functions ----------------


# Percentages correction
def percentages_correction(df):
    '''
        Aim: 
            Set faulty percentages (percentages below 0 and above 100) to nan
        
        Input: 
            Geodataframa
        
        Output: 
            Dataframe with corrected percentages
    '''
    df["faulty_percentages"] = np.nan
    units = df.filter(like='.unit').columns
    values = df.filter(like='.value').columns
    for col in df:
        if col in units:
            if df[col].iloc[0]== "%":
                name = col.split(".")[0] + '.value'
                if name in values:
                    if any(df[name] < 0) or any(df[name] > 100):
                        nanBefore = df[name].isna().sum(axis=0)
                        df[name][df[name] < 0] = np.nan
                        df[name][df[name] > 100] = np.nan
                        nanAfter = df[name].isna().sum(axis=0)
                        corrected = nanAfter - nanBefore
                        print( 'Percentages ok : ', df[name].name, ' Count corrected:', corrected)        
                    else:
                        print('Percentages ok : ', df[name].name )
    
    
    

def flag_faulty_percentages(df):
    '''
        Aim: 
            Inspect if there are faulty percentages (percentages below 0 and above 100)
        
        Input: 
            Geodataframa
        
        Output: 
            Geodataframe with added column which contains when percentages are faulty
    '''
    df["faulty_percentages"] = np.nan
    units = df_tracks.filter(like='.unit').columns
    # values = df.filter(like='.value').columns

    listNames =[]
    for col in units:
        if df_tracks[col].iloc[0]== '%':
            name = col.split(".")[0] + '.value'
            listNames.append(name)
        
    for variable in listNames:
        df_tracks.loc[df_tracks[variable] < 0, 'faulty_percentages'] = 1
        df_tracks.loc[df_tracks[variable] > 100, 'faulty_percentages'] = 1

    faultyPercentages = (df_tracks['faulty_percentages'].values == 1).sum()
    print('flagged faulty percentages: ', faultyPercentages)
    # df_tracks.loc[df_tracks['faulty_percentages'] == 1]        
        

        

def flag_implausible_negative_values(df,listOfVariableNames):
    '''
        Aim: Inspect if there are unexpected negative values
        
        Input: Geodataframa
        
        Output: Geodataframe with added column which contains 1 when percentages < 0 or percentages > 100
    '''   
    df["implausible_neg_value"] = np.nan
    for variable in listOfVariableNames:
        df.loc[df[variable] < 0, 'implausible_neg_value'] = 1
    implausibleNegativeValues = (df['implausible_neg_value'].values == 1).sum()
    print('flagged implausible negative values: ', implausibleNegativeValues)

    
    
    
def flag_outlier_in_sample(df, listOfVariableNames):
    '''
        Aim: Find outlier with regard to the sample's distribution 
        
        Input: Geodataframa
        
        Output: Geodataframe with added column which values are '1' 
                when a certain value of a variable in the list is considered to be an outlier regarding the samples's distribution
    '''
    df['outlier_in_sample'] = np.nan
    for variable in listOfVariableNames:
        #variableName='outlier_in_sample_'+ variable
        #df[variableName] = np.nan
        Q1 = df[variable].quantile(0.25)
        Q3 = df[variable].quantile(0.75)
        IQR = Q3 - Q1
        low_lim = Q1 - 1.5 * IQR 
        up_lim = Q3 + 1.5 * IQR  
        #df.loc[df[variable] < low_lim, variableName] = 1
        #df.loc[df[variable] > up_lim, variableName] = 1
        df.loc[df[variable] < low_lim, 'outlier_in_sample'] = 1
        df.loc[df[variable] > up_lim, 'outlier_in_sample'] = 1
    outlier = (df['outlier_in_sample'].values == 1).sum()
    print('Flagged outlier in sample: ', outlier)

    
    

def flag_outlier_in_track(df, listOfVariableNames):
    '''
        Aim: Inspect outlier with regard to the tracks' distribution 
        
        Input: Geodataframa
        
        Output: Geodataframe with added column which values are '1'
                when a certain value in of a variable in the list is considered to be an outlier regarding the samples's distribution
    '''
    def low_limit(x):
            q1 = x.quantile(0.25)
            q3 = x.quantile(0.75)
            iqr = q3 - q1
            lower_limit = q1 - 1.5 * iqr
            return lower_limit

    def upper_limit(x):
            q1 = x.quantile(0.25)
            q3 = x.quantile(0.75)
            iqr = q3 - q1
            upper_limit = q3 + 1.5 * iqr
            return upper_limit
        
    df['outlier_in_track'] = np.nan
    for variableName in listOfVariableNames:
        df_new = df.groupby(['track.id']).agg({variableName: [low_limit, upper_limit]})
        df_1 = df_new[variableName]
        result = pd.merge(df, df_1, how='inner', on = 'track.id')
        result.loc[result['upper_limit'] < result[variableName], "outlier_in_track"] = 1 
        result.loc[result['low_limit'] > result[variableName], "outlier_in_track"] =1
        result.drop(['low_limit', 'upper_limit'], axis=1, inplace=True)
    outlier = (result['outlier_in_track'].values == 1).sum()
    print('Flagged outlier in tracks:', outlier)
    return result


In [11]:
bbox = BboxSelector([
    7.554130554199218, # min_x
    51.95590322041212, # min_y
    7.590351104736328, # max_x
    51.97874790276371  # max_y
])


# issue a query
df_tracks = track_api.get_tracks(bbox=bbox, num_results=10) 


listNonNegative=['Speed.value', 'CO2.value','Rpm.value',
                 'Consumption (GPS-based).value',
                 'Consumption.value',
                 'CO2 Emission (GPS-based).value']

flag_outlier_in_sample(df_tracks, listNonNegative)
df_tracks = flag_outlier_in_track(df_tracks, listNonNegative)

Flagged outlier in sample:  21
Flagged outlier in tracks: 14


In [12]:
df_tracks[df_tracks['outlier_in_track'] == 1]

Unnamed: 0,id,time,geometry,Engine Load.value,Engine Load.unit,Calculated MAF.value,Calculated MAF.unit,Speed.value,Speed.unit,CO2.value,CO2.unit,Intake Pressure.value,Intake Pressure.unit,Rpm.value,Rpm.unit,Intake Temperature.value,Intake Temperature.unit,Consumption (GPS-based).value,Consumption (GPS-based).unit,GPS Altitude.value,GPS Altitude.unit,Throttle Position.value,Throttle Position.unit,GPS Bearing.value,GPS Bearing.unit,Consumption.value,Consumption.unit,GPS Accuracy.value,GPS Accuracy.unit,CO2 Emission (GPS-based).value,CO2 Emission (GPS-based).unit,GPS Speed.value,GPS Speed.unit,track.id,track.length,track.begin,track.end,sensor.type,sensor.engineDisplacement,sensor.model,sensor.id,sensor.fuelType,sensor.constructionYear,sensor.manufacturer,outlier_in_sample,outlier_in_track
20,5f0efab600375c5a2642243f,2020-07-15T12:35:37,POINT (7.57606 51.97343),58.107638,%,18.381914,g/s,31.943396,km/h,14.199972,kg/h,57.333334,kPa,2270.091696,u/min,23.68923,c,3.297252,l/h,110.918701,m,19.689291,%,186.912093,deg,6.042541,l/h,6.0,%,7.748543,kg/h,28.774716,km/h,5f0efab600375c5a26422439,0.145549,2020-07-15T12:35:22Z,2020-07-15T12:35:37Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,1.0
57,5f0ef5f800375c5a2641e0bb,2020-07-15T12:26:37,POINT (7.59655 51.96494),89.743641,%,34.181204,g/s,38.901514,km/h,26.404875,kg/h,87.849747,kPa,2757.789655,u/min,24.0,c,8.328809,l/h,113.697506,m,30.559156,%,273.046448,deg,11.236117,l/h,6.0,%,19.572702,kg/h,35.358382,km/h,5f0ef5f800375c5a2641e0a5,1.02845,2020-07-15T12:24:55Z,2020-07-15T12:28:14Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
58,5f0ef5f800375c5a2641e0bc,2020-07-15T12:26:42,POINT (7.59564 51.96499),31.471309,%,7.472338,g/s,49.019356,km/h,5.772358,kg/h,31.221791,kPa,1695.266091,u/min,23.811802,c,5.570882,l/h,113.748906,m,15.0,%,275.929725,deg,2.456323,l/h,4.918756,%,13.091573,kg/h,48.563614,km/h,5f0ef5f800375c5a2641e0a5,1.02845,2020-07-15T12:24:55Z,2020-07-15T12:28:14Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,1.0
72,5f0ef5f800375c5a2641e0ca,2020-07-15T12:27:53,POINT (7.58778 51.96551),29.626369,%,3.66368,g/s,43.000001,km/h,2.830181,kg/h,27.104804,kPa,958.044156,u/min,24.000001,c,5.428459,l/h,116.847857,m,13.830534,%,276.274658,deg,1.204332,l/h,5.715931,%,12.756878,kg/h,42.708304,km/h,5f0ef5f800375c5a2641e0a5,1.02845,2020-07-15T12:24:55Z,2020-07-15T12:28:14Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,1.0
137,5f0ef31300375c5a2641d7c6,2020-07-15T12:07:03,POINT (7.58298 51.96064),97.460171,%,23.31686,g/s,49.162081,km/h,18.012203,kg/h,95.671233,kPa,1710.0,u/min,21.0,c,8.925498,l/h,112.9929,m,34.645802,%,72.651041,deg,7.664767,l/h,6.0,%,20.974921,kg/h,47.16519,km/h,5f0ef31300375c5a2641d7a8,1.860981,2020-07-15T12:04:45Z,2020-07-15T12:08:59Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
148,5f0ef31300375c5a2641d7d1,2020-07-15T12:07:58,POINT (7.58751 51.96251),99.650115,%,34.780712,g/s,34.818182,km/h,26.867993,kg/h,98.0,kPa,2490.117289,u/min,21.0,c,8.974783,l/h,114.739409,m,54.747369,%,48.71374,deg,11.433188,l/h,4.0,%,21.090739,kg/h,31.309056,km/h,5f0ef31300375c5a2641d7a8,1.860981,2020-07-15T12:04:45Z,2020-07-15T12:08:59Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
149,5f0ef31300375c5a2641d7d2,2020-07-15T12:08:04,POINT (7.58813 51.96294),44.893976,%,10.893701,g/s,52.0,km/h,8.415351,kg/h,43.036605,kPa,1770.748866,u/min,20.128981,c,7.932498,l/h,115.289726,m,16.697204,%,36.173751,deg,3.581,l/h,4.0,%,18.64137,kg/h,48.764128,km/h,5f0ef31300375c5a2641d7a8,1.860981,2020-07-15T12:04:45Z,2020-07-15T12:08:59Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
156,5f0ef31300375c5a2641d7d9,2020-07-15T12:08:39,POINT (7.58812 51.96550),22.359308,%,6.170001,g/s,48.000001,km/h,4.766307,kg/h,22.921569,kPa,1874.462267,u/min,18.792453,c,8.427314,l/h,116.481078,m,13.729858,%,275.793385,deg,2.028216,l/h,4.0625,%,19.804188,kg/h,47.63804,km/h,5f0ef31300375c5a2641d7a8,1.860981,2020-07-15T12:04:45Z,2020-07-15T12:08:59Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
175,5f0ef1a400375c5a2641d580,2020-07-15T12:02:37,POINT (7.57620 51.97189),27.30006,%,5.566578,g/s,43.999999,km/h,4.300165,kg/h,26.456386,kPa,1450.964869,u/min,15.958274,c,6.033324,l/h,121.305272,m,14.814121,%,145.837581,deg,1.829857,l/h,4.49237,%,14.178311,kg/h,43.423582,km/h,5f0ef1a400375c5a2641d56f,0.884093,2020-07-15T12:01:26Z,2020-07-15T12:04:19Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0
197,5f0eef8200375c5a2641d3a5,2020-07-15T11:58:29,POINT (7.58738 51.96546),99.748802,%,37.084032,g/s,37.711892,km/h,28.6473,kg/h,96.105622,kPa,2670.541257,u/min,17.0,c,7.960837,l/h,126.549477,m,63.763513,%,99.574026,deg,12.190341,l/h,8.0,%,18.707966,kg/h,5.906879,km/h,5f0eef8200375c5a2641d36b,1.362954,2020-07-15T11:58:24Z,2020-07-15T12:00:41Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,1.0,1.0


In [None]:
#df_tracks.info()

In [None]:
#print(df_tracks[variable].skew())

In [None]:
def plot_tracks(points_df, column):
    """ 
    Aim: 
        Visualize phenomena of tracks as timeserie in Linechart, in which each line represents one single track
    
    Keyword Arguments: 
        df {Geodataframe} -- point input
        
    Returns:
        Chart is shown 
    
    """
    # Add datetime to data frame
    points_df['datetime'] = pd.to_datetime(points_df['time'])
    points_df.set_index(['track.id','datetime']).sort_index()
    points_df['index']=points_df.index
    fig = px.line(points_df, x="index", y=column, color="track.id",
                  line_group="track.id", hover_name="datetime")
    fig.update_traces(mode='lines+markers')
    fig.show()

plot_tracks(df_tracks, 'Consumption.value')