In [20]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
import plotly.express as px

# Display option: with max_columns set to None pandas prints every column of a
# DataFrame instead of truncating wide tables.
pd.set_option('display.max_columns', None)

# Build the enviroCar client objects used by the query cell further down:
# an ECConfig created with its defaults, wrapped in a DownloadClient, which is
# handed to the TrackAPI.
# NOTE(review): presumably the default ECConfig targets the public enviroCar
# server without credentials - confirm against the envirocar package docs.
config = ECConfig()
track_api = TrackAPI(api_client=DownloadClient(config=config))




# ------------------- data preprocessing functions ----------------


# Percentages correction
def percentages_correction(df):
    '''
        Aim:
            Set faulty percentages (values below 0 or above 100) to NaN.

            A "<name>.value" column is treated as a percentage when its
            partner column "<name>.unit" contains the unit "%".

        Input:
            (Geo)DataFrame with "<name>.value" / "<name>.unit" column pairs.
            The frame is modified in place.

        Output:
            The same frame, with out-of-range percentages replaced by NaN
            (returned for convenience so the call can be chained).

        Notes:
            - Only the percentage value columns are touched; an existing
              'faulty_percentages' flag column is left untouched.
            - One line per percentage column is printed, reporting either
              that it was ok or how many values were corrected.
    '''
    for unit_col in df.filter(like='.unit').columns:
        # Look at the whole unit column rather than only its first row, so
        # that empty frames and a missing first unit do not break detection.
        if not df[unit_col].eq('%').any():
            continue

        value_col = unit_col.split(".")[0] + '.value'
        if value_col not in df.columns:
            continue

        faulty = (df[value_col] < 0) | (df[value_col] > 100)
        corrected = int(faulty.sum())
        if corrected:
            # Series.mask returns a new column with NaN at the faulty
            # positions (upcasting integer columns to float). Assigning the
            # whole column avoids chained indexing, which pandas does not
            # guarantee to write back into the frame.
            df[value_col] = df[value_col].mask(faulty)
            print('Percentages corrected : ', value_col, ' Count corrected:', corrected)
        else:
            print('Percentages ok : ', value_col)
    return df
    
    
    

def flag_faulty_percentages(df):
    '''
        Aim:
            Inspect if there are faulty percentages (values below 0 or
            above 100) and flag the affected rows.

            A "<name>.value" column is treated as a percentage when its
            partner column "<name>.unit" contains the unit "%".

        Input:
            (Geo)DataFrame with "<name>.value" / "<name>.unit" column pairs.
            The frame is modified in place.

        Output:
            The same frame with an added (or reset) integer column
            'faulty_percentages' that is 1 for every row holding at least
            one faulty percentage and 0 otherwise. The number of flagged
            rows is printed.
    '''
    df["faulty_percentages"] = 0

    # Work on the frame that was passed in, not on a notebook-level global.
    percentage_columns = []
    for unit_col in df.filter(like='.unit').columns:
        # Look at the whole unit column rather than only its first row, so
        # that empty frames and a missing first unit do not break detection.
        if not df[unit_col].eq('%').any():
            continue
        value_col = unit_col.split(".")[0] + '.value'
        # Skip unit columns that have no matching value column.
        if value_col in df.columns:
            percentage_columns.append(value_col)

    for value_col in percentage_columns:
        out_of_range = (df[value_col] < 0) | (df[value_col] > 100)
        df.loc[out_of_range, 'faulty_percentages'] = 1

    faultyPercentages = int((df['faulty_percentages'] == 1).sum())
    print('Flagged faulty percentages: ', faultyPercentages)
    return df
        

        

def flag_implausible_negative_values(df,listOfVariableNames):
    '''
        Aim: Mark rows that hold an unexpected negative value.

        Input: (Geo)DataFrame and the names of the columns that must not
               be negative. The frame is modified in place.

        Output: Nothing is returned; the frame gains (or resets) the integer
                column 'implausible_neg_value', which is 1 for every row in
                which at least one listed column is below zero, else 0.
                The number of flagged rows is printed.
    '''
    flag_column = 'implausible_neg_value'
    df[flag_column] = 0

    # One row-wise mask over all requested columns instead of one pass each.
    checked = df[list(listOfVariableNames)]
    has_negative = checked.lt(0).any(axis=1)
    df.loc[has_negative, flag_column] = 1

    flagged_rows = df[flag_column].eq(1).sum()
    print('Flagged implausible negative values: ', flagged_rows)

    
    
    
def flag_outlier_in_sample(df, listOfVariableNames, keep = False):
    '''
        Aim: Flag values that are outliers with regard to the distribution
             of the whole sample, using the 1.5 * IQR rule.

        Input: (Geo)DataFrame and the names of the columns to inspect.
               The frame is modified in place. The parameter keep is
               accepted but not used.

        Output: Nothing is returned; the frame gains
                - 'outlier_in_sample': 1 when any listed column holds an
                  outlier in that row, else 0
                - 'outlier_in_sample_<variable>': the same flag per column
                The number of flagged rows is printed.
    '''
    combined_flag = 'outlier_in_sample'
    df[combined_flag] = 0

    for variable in listOfVariableNames:
        per_variable_flag = 'outlier_in_sample_' + variable
        df[per_variable_flag] = 0

        first_quartile = df[variable].quantile(0.25)
        third_quartile = df[variable].quantile(0.75)
        spread = third_quartile - first_quartile
        lower_fence = first_quartile - 1.5 * spread
        upper_fence = third_quartile + 1.5 * spread

        # Values strictly outside the fences are outliers; NaN never is.
        beyond_fences = (df[variable] < lower_fence) | (df[variable] > upper_fence)
        df.loc[beyond_fences, per_variable_flag] = 1
        df.loc[beyond_fences, combined_flag] = 1

    flagged_rows = df[combined_flag].eq(1).sum()
    print('Flagged outlier in sample: ', flagged_rows)

    
    

def flag_outlier_in_track(df, listOfVariableNames):
    '''
        Aim: Flag values that are outliers with regard to the distribution
             of their own track, using the 1.5 * IQR rule per 'track.id'.

        Input: (Geo)DataFrame with a 'track.id' column and the names of the
               columns to inspect. The input frame is not modified.

        Output: New frame (rows joined to their track limits, fresh index)
                with these added columns:
                - 'track_lowerLimit_<variable>' / 'track_upperLimit_<variable>':
                  the per-track fences Q1 - 1.5 * IQR and Q3 + 1.5 * IQR
                - 'outlier_in_track_<variable>': 1 when the value lies
                  outside the fences of its track, else 0
                - 'outlier_in_track': 1 when any listed variable is an
                  outlier in that row, else 0
                Per-variable counts and the number of flagged rows are printed.

        Notes:
            The function can be applied to its own output: limit columns
            left over from an earlier run are replaced instead of colliding
            in the merge.
    '''

    def quartile_fences(values):
        # Lower and upper 1.5 * IQR fence of one track's values.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return q1 - 1.5 * iqr, q3 + 1.5 * iqr

    def low_limit(x):
        return quartile_fences(x)[0]

    def upper_limit(x):
        return quartile_fences(x)[1]

    def calculate_limits(df, listOfVariableNames):
        for variable in listOfVariableNames:
            lowName = 'track_lowerLimit_' + variable
            upName = 'track_upperLimit_' + variable
            # Remove limits from a previous run (or a repeated variable);
            # otherwise the merge would create '_x'/'_y' suffixed duplicates
            # and the lookups below would fail.
            df = df.drop(columns=[lowName, upName], errors='ignore')
            limits = (
                df.groupby('track.id')[variable]
                .agg([low_limit, upper_limit])
                .rename(columns={'low_limit': lowName, 'upper_limit': upName})
            )
            df = pd.merge(df, limits, how='inner', on='track.id')
        return df

    # Copy first so the caller's frame is never touched, even when the list
    # of variables is empty and no merge creates a new frame.
    df_new = calculate_limits(df.copy(), listOfVariableNames)
    df_new['outlier_in_track'] = 0
    for variable in listOfVariableNames:
        above = df_new['track_upperLimit_' + variable] < df_new[variable]
        below = df_new['track_lowerLimit_' + variable] > df_new[variable]
        is_outlier = above | below

        variableName = 'outlier_in_track_' + variable
        df_new[variableName] = 0
        df_new.loc[is_outlier, variableName] = 1
        df_new.loc[is_outlier, 'outlier_in_track'] = 1
        print(variableName, int(is_outlier.sum()))
    outlier = int((df_new['outlier_in_track'] == 1).sum())
    print('Rows in track with outliers : ', outlier)
    return df_new





In [19]:
# Bounding box that spatially restricts the track query; the four values are
# given in the order min_x, min_y, max_x, max_y (see the inline annotations).
bbox = BboxSelector([
    7.554130554199218, # min_x
    51.95590322041212, # min_y
    7.590351104736328, # max_x
    51.97874790276371  # max_y
])

# issue a query
# NOTE(review): num_results=10 presumably caps the number of tracks returned,
# not the number of rows - confirm against the envirocar TrackAPI docs.
df_tracks = track_api.get_tracks(bbox=bbox, num_results=10) 



# Measurement columns that are checked by the flagging functions below.
listNonNegative=['Speed.value', 'CO2.value','Rpm.value',
                 'Consumption (GPS-based).value',
                 'Consumption.value',
                 'CO2 Emission (GPS-based).value']


# The first three checks are currently disabled; only the per-track outlier
# check runs. Its result replaces df_tracks, so the columns created by the
# disabled checks do not exist in df_tracks after this cell.
#flag_faulty_percentages(df_tracks)
#flag_implausible_negative_values(df_tracks,listNonNegative)
#flag_outlier_in_sample(df_tracks, listNonNegative)
df_tracks = flag_outlier_in_track(df_tracks, listNonNegative)
# Bare last expression: displays the resulting frame as the cell output.
df_tracks



outlier_in_track_Speed.value 3
outlier_in_track_CO2.value 19
outlier_in_track_Rpm.value 6
outlier_in_track_Consumption (GPS-based).value 14
outlier_in_track_Consumption.value 19
outlier_in_track_CO2 Emission (GPS-based).value 14
Rows in track with outliers :  34


Unnamed: 0,id,time,geometry,Engine Load.value,Engine Load.unit,Calculated MAF.value,Calculated MAF.unit,Speed.value,Speed.unit,CO2.value,CO2.unit,Intake Pressure.value,Intake Pressure.unit,Rpm.value,Rpm.unit,Intake Temperature.value,Intake Temperature.unit,Consumption (GPS-based).value,Consumption (GPS-based).unit,GPS Altitude.value,GPS Altitude.unit,Throttle Position.value,Throttle Position.unit,GPS Bearing.value,GPS Bearing.unit,Consumption.value,Consumption.unit,GPS Accuracy.value,GPS Accuracy.unit,CO2 Emission (GPS-based).value,CO2 Emission (GPS-based).unit,GPS Speed.value,GPS Speed.unit,track.id,track.length,track.begin,track.end,sensor.type,sensor.engineDisplacement,sensor.model,sensor.id,sensor.fuelType,sensor.constructionYear,sensor.manufacturer,track_lowerLimit_Speed.value,track_upperLimit_Speed.value,track_lowerLimit_CO2.value,track_upperLimit_CO2.value,track_lowerLimit_Rpm.value,track_upperLimit_Rpm.value,track_lowerLimit_Consumption (GPS-based).value,track_upperLimit_Consumption (GPS-based).value,track_lowerLimit_Consumption.value,track_upperLimit_Consumption.value,track_lowerLimit_CO2 Emission (GPS-based).value,track_upperLimit_CO2 Emission (GPS-based).value,outlier_in_track,outlier_in_track_Speed.value,outlier_in_track_CO2.value,outlier_in_track_Rpm.value,outlier_in_track_Consumption (GPS-based).value,outlier_in_track_Consumption.value,outlier_in_track_CO2 Emission (GPS-based).value
0,5f0ef89c00375c5a2641ef86,2020-07-15T12:37:03,POINT (7.57939 51.96766),30.459892,%,3.113889,g/s,15.978930,km/h,2.405470,kg/h,29.667201,kPa,748.952252,u/min,26.000000,c,0.936199,l/h,115.671012,m,13.000000,%,136.590329,deg,1.023604,l/h,6.000000,%,2.200068,kg/h,17.502660,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,-23.878290,79.178053,-4.103292,13.249693,-415.349501,2715.258818,-1.644844,5.240454,-1.746082,5.638167,-3.865384,12.315067,0,0,0,0,0,0,0
1,5f0ef89c00375c5a2641ef88,2020-07-15T12:37:09,POINT (7.57955 51.96757),49.230105,%,9.778811,g/s,11.134565,km/h,7.554102,kg/h,47.232322,kPa,1475.604745,u/min,25.652838,c,0.937457,l/h,115.280639,m,16.642229,%,134.479803,deg,3.214511,l/h,6.000000,%,2.203025,kg/h,10.324164,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,-23.878290,79.178053,-4.103292,13.249693,-415.349501,2715.258818,-1.644844,5.240454,-1.746082,5.638167,-3.865384,12.315067,0,0,0,0,0,0,0
2,5f0ef89c00375c5a2641ef89,2020-07-15T12:37:14,POINT (7.57988 51.96740),78.649652,%,25.066406,g/s,33.976330,km/h,19.363722,kg/h,74.040426,kPa,2397.395931,u/min,23.728013,c,5.102906,l/h,114.613231,m,23.862069,%,123.313954,deg,8.239881,l/h,6.193485,%,11.991830,kg/h,30.967132,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,-23.878290,79.178053,-4.103292,13.249693,-415.349501,2715.258818,-1.644844,5.240454,-1.746082,5.638167,-3.865384,12.315067,1,0,1,0,0,1,0
3,5f0ef89c00375c5a2641ef8a,2020-07-15T12:37:19,POINT (7.58049 51.96715),31.200400,%,7.419664,g/s,39.000000,km/h,5.731668,kg/h,30.075758,kPa,1748.565672,u/min,24.000000,c,2.280491,l/h,113.835218,m,15.351261,%,125.035780,deg,2.439007,l/h,6.000000,%,5.359154,kg/h,36.898346,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,-23.878290,79.178053,-4.103292,13.249693,-415.349501,2715.258818,-1.644844,5.240454,-1.746082,5.638167,-3.865384,12.315067,0,0,0,0,0,0,0
4,5f0ef89c00375c5a2641ef8b,2020-07-15T12:37:24,POINT (7.58107 51.96682),29.735773,%,3.106679,g/s,34.321667,km/h,2.399900,kg/h,29.000000,kPa,761.854074,u/min,25.000000,c,0.946257,l/h,113.502384,m,13.000000,%,133.482068,deg,1.021234,l/h,7.489919,%,2.223704,kg/h,34.447545,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,-23.878290,79.178053,-4.103292,13.249693,-415.349501,2715.258818,-1.644844,5.240454,-1.746082,5.638167,-3.865384,12.315067,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,5f0db61130d93d341d47203d,2020-07-14T13:40:46,POINT (7.58316 51.97153),30.962647,%,6.402287,g/s,42.291262,km/h,4.945747,kg/h,31.338150,kPa,1455.594663,u/min,25.553366,c,2.209139,l/h,111.865912,m,15.359328,%,283.212185,deg,2.104573,l/h,4.000000,%,5.191477,kg/h,42.250043,km/h,5f0db61130d93d341d47202d,0.917437,2020-07-14T13:39:39Z,2020-07-14T13:41:06Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,22.650072,56.126463,1.247298,9.445156,645.544391,2160.278957,-0.656395,3.453549,0.530765,4.019215,-1.542528,8.115840,0,0,0,0,0,0,0
241,5f0db61130d93d341d47203e,2020-07-14T13:40:51,POINT (7.58244 51.97177),28.418379,%,5.544750,g/s,40.169628,km/h,4.283302,kg/h,28.479041,kPa,1383.047682,u/min,24.661859,c,1.859896,l/h,112.745648,m,14.000000,%,310.772919,deg,1.822682,l/h,4.000000,%,4.370755,kg/h,40.301934,km/h,5f0db61130d93d341d47202d,0.917437,2020-07-14T13:39:39Z,2020-07-14T13:41:06Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,22.650072,56.126463,1.247298,9.445156,645.544391,2160.278957,-0.656395,3.453549,0.530765,4.019215,-1.542528,8.115840,0,0,0,0,0,0,0
242,5f0db61130d93d341d47203f,2020-07-14T13:40:56,POINT (7.58177 51.97199),62.891266,%,13.336113,g/s,35.814871,km/h,10.302106,kg/h,76.601553,kPa,1242.279307,u/min,25.999999,c,0.900534,l/h,113.544173,m,17.493421,%,280.989002,deg,4.383875,l/h,4.000000,%,2.116254,kg/h,35.451608,km/h,5f0db61130d93d341d47202d,0.917437,2020-07-14T13:39:39Z,2020-07-14T13:41:06Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,22.650072,56.126463,1.247298,9.445156,645.544391,2160.278957,-0.656395,3.453549,0.530765,4.019215,-1.542528,8.115840,1,0,1,0,0,1,0
243,5f0db61130d93d341d472040,2020-07-14T13:41:01,POINT (7.58105 51.97203),36.294177,%,6.047093,g/s,34.000000,km/h,4.671360,kg/h,36.070551,kPa,1192.249673,u/min,25.000000,c,0.862190,l/h,113.523253,m,14.220497,%,273.373427,deg,1.987813,l/h,4.000000,%,2.026147,kg/h,34.194837,km/h,5f0db61130d93d341d47202d,0.917437,2020-07-14T13:39:39Z,2020-07-14T13:41:06Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,22.650072,56.126463,1.247298,9.445156,645.544391,2160.278957,-0.656395,3.453549,0.530765,4.019215,-1.542528,8.115840,0,0,0,0,0,0,0


In [209]:
#df_tracks[df_tracks['outlier_in_track'] == 1]

In [47]:
# Sanity check: number of missing values in the 'outlier_in_track' flag column.
df_tracks['outlier_in_track'].isna().sum()

0

In [None]:
# Sets the columns 'col2' and 'col3' to NaN in every row flagged as holding an
# implausible negative value.
# NOTE(review): 'implausible_neg_value' is only created by
# flag_implausible_negative_values(), whose call is commented out in the query
# cell, so this line presumably raises a KeyError as the notebook stands.
# NOTE(review): 'col2' / 'col3' look like placeholder names - confirm which
# measurement columns should actually be blanked.
df_tracks.loc[df_tracks['implausible_neg_value']==1,['col2','col3']] = np.nan