In [9]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
import plotly.express as px

# Show every column when a DataFrame is displayed (no truncation of wide tables)
pd.options.display.max_columns = None

# Build the API client from an initial (optional) configuration object
config = ECConfig()
download_client = DownloadClient(config=config)
track_api = TrackAPI(api_client=download_client)




# ------------------- data preprocessing functions ----------------


# Percentages correction
def percentages_correction(df):
    '''
        Aim:
            Set faulty percentages (values below 0 or above 100) to NaN.

        Input:
            (Geo)DataFrame holding paired '<name>.value' / '<name>.unit'
            columns. The frame is modified in place.

        Output:
            The same (Geo)DataFrame with out-of-range percentage values
            replaced by NaN. A 'faulty_percentages' flag column, if present,
            is left untouched.
    '''
    unit_suffix = '.unit'
    for unit_column in df.columns:
        if not (isinstance(unit_column, str) and unit_column.endswith(unit_suffix)):
            continue
        # Look at the whole unit column: the first row may carry no unit
        # (NaN), and an empty frame has no first row at all.
        if not df[unit_column].eq('%').any():
            continue
        value_column = unit_column[:-len(unit_suffix)] + '.value'
        if value_column not in df.columns:
            continue

        values = df[value_column]
        out_of_range = (values < 0) | (values > 100)
        corrected = int(out_of_range.sum())
        if corrected:
            # Replace the whole column instead of using chained assignment
            # (df[name][mask] = ...), which may silently write to a copy.
            df[value_column] = values.mask(out_of_range)
            print('Percentages corrected: ', value_column, ' Count corrected:', corrected)
        else:
            print('Percentages ok : ', value_column)
    return df
    
    
    

def flag_faulty_percentages(df):
    '''
        Aim:
            Inspect if there are faulty percentages (values below 0 or above 100).

        Input:
            (Geo)DataFrame holding paired '<name>.value' / '<name>.unit'
            columns. The frame is modified in place.

        Output:
            The same (Geo)DataFrame with an added column 'faulty_percentages'
            that is 1 in every row where at least one percentage value is out
            of range, and 0 otherwise. The measured values are not changed.
    '''
    flag_column = 'faulty_percentages'
    unit_suffix = '.unit'
    df[flag_column] = 0

    for unit_column in df.columns:
        if not (isinstance(unit_column, str) and unit_column.endswith(unit_suffix)):
            continue
        # Look at the whole unit column: the first row may carry no unit
        # (NaN), and an empty frame has no first row at all.
        if not df[unit_column].eq('%').any():
            continue
        value_column = unit_column[:-len(unit_suffix)] + '.value'
        if value_column not in df.columns:
            continue

        # Work on the frame that was passed in, not on a notebook global.
        values = df[value_column]
        out_of_range = (values < 0) | (values > 100)
        df.loc[out_of_range, flag_column] = 1

    faultyPercentages = int((df[flag_column] == 1).sum())
    print('Flagged faulty percentages: ', faultyPercentages)
    return df
        

        

def flag_implausible_negative_values(df,listOfVariableNames):
    '''
        Aim: Mark rows that hold a negative value in a variable which is
             expected to be non-negative

        Input: (Geo)DataFrame (modified in place) and the list of column
               names to check

        Output: The same (Geo)DataFrame with an added column
                'implausible_neg_value' that is 1 where any listed variable
                is negative and 0 otherwise
    '''
    flag_column = 'implausible_neg_value'
    df[flag_column] = 0
    for column_name in listOfVariableNames:
        is_negative = df[column_name].lt(0)
        df.loc[is_negative, flag_column] = 1
    flagged_total = df[flag_column].eq(1).sum()
    print('Flagged implausible negative values: ', flagged_total)
    return df


    
    
    
def flag_outlier_in_sample(df, listOfVariableNames, dropOutlierColumn=False):
    '''
        Aim: Find outliers with regard to the distribution of the whole sample

        Input: (Geo)DataFrame (modified in place), the list of column names
               to check, and a switch that removes the per-variable flag
               columns again once they have been counted

        Output: The same (Geo)DataFrame with an added column
                'outlier_in_sample' that is 1 when a value of any listed
                variable lies more than 1.5 * IQR below the first or above
                the third quartile of that variable. Unless
                dropOutlierColumn is set, one 'outlier_in_sample_<variable>'
                column per variable is kept as well.
    '''
    combined_flag = 'outlier_in_sample'
    df[combined_flag] = 0
    for variable in listOfVariableNames:
        per_variable_flag = combined_flag + '_' + variable
        df[per_variable_flag] = 0

        # Fences of the whole sample: quartiles -/+ 1.5 * interquartile range
        series = df[variable]
        first_quartile = series.quantile(0.25)
        third_quartile = series.quantile(0.75)
        spread = third_quartile - first_quartile
        lower_fence = first_quartile - 1.5 * spread
        upper_fence = third_quartile + 1.5 * spread

        is_outlier = series.lt(lower_fence) | series.gt(upper_fence)
        df.loc[is_outlier, per_variable_flag] = 1
        df.loc[is_outlier, combined_flag] = 1
        print(per_variable_flag, df[per_variable_flag].eq(1).sum())

        # Equality (not truthiness) is kept on purpose: only True/1 drop the column
        if dropOutlierColumn == True:
            df.drop(columns=[per_variable_flag], inplace=True)

    flagged_total = df[combined_flag].eq(1).sum()
    print('Flagged outlier in sample: ', flagged_total)
    return df





def flag_outlier_in_track(df, listOfVariableNames, dropLimits=True, dropOutlierColumn=False):
    '''
        Aim: Find outliers with regard to the distribution within each track

        Input: (Geo)DataFrame (modified in place) with a 'track.id' column,
               the list of column names to check, a switch that removes the
               per-track limit columns again, and a switch that removes the
               per-variable flag columns again

        Output: The same (Geo)DataFrame with an added column
                'outlier_in_track_all' that is 1 when a value of any listed
                variable lies more than 1.5 * IQR below the first or above
                the third quartile of that variable within its own track.
                Depending on the switches, the columns
                'track_lowerLimit_<variable>', 'track_upperLimit_<variable>'
                and 'outlier_in_track_<variable>' are kept as well.
    '''

    def _iqr_fences(track_values):
        # (lower, upper) limits of one track: quartiles -/+ 1.5 * IQR
        first_quartile = track_values.quantile(0.25)
        third_quartile = track_values.quantile(0.75)
        spread = third_quartile - first_quartile
        return first_quartile - 1.5 * spread, third_quartile + 1.5 * spread

    combined_flag = 'outlier_in_track_all'
    df[combined_flag] = 0
    for variable in listOfVariableNames:
        lower_column = 'track_lowerLimit_' + variable
        upper_column = 'track_upperLimit_' + variable
        per_variable_flag = 'outlier_in_track_' + variable

        # Broadcast each track's limits back onto the rows of that track
        values_by_track = df.groupby(['track.id'])[variable]
        df[lower_column] = values_by_track.transform(lambda values: _iqr_fences(values)[0])
        df[upper_column] = values_by_track.transform(lambda values: _iqr_fences(values)[1])

        is_outlier = df[variable].gt(df[upper_column]) | df[variable].lt(df[lower_column])
        df[per_variable_flag] = 0
        df.loc[is_outlier, combined_flag] = 1
        df.loc[is_outlier, per_variable_flag] = 1
        print(per_variable_flag, df[per_variable_flag].eq(1).sum())

        # Equality (not truthiness) is kept on purpose: only True/1 drop columns
        if dropLimits == True:
            df.drop(columns=[upper_column, lower_column], inplace=True)

        if dropOutlierColumn == True:
            df.drop(columns=[per_variable_flag], inplace=True)

    flagged_rows = df[combined_flag].eq(1).sum()
    print('Rows which contain outliers in tracks  (there may be multiple outlier in a single row) : ',flagged_rows)
    return df
    
    
    


In [14]:
# Bounding box of the area of interest, given as [min_x, min_y, max_x, max_y]
bbox_corners = [
    7.554130554199218,
    51.95590322041212,
    7.590351104736328,
    51.97874790276371,
]
bbox = BboxSelector(bbox_corners)

# Issue the query for tracks inside the bounding box
df_tracks = track_api.get_tracks(bbox=bbox, num_results=30)

# Variables for which a negative value is considered implausible
listNonNegative = [
    'Speed.value',
    'CO2.value',
    'Rpm.value',
    'Consumption (GPS-based).value',
    'Consumption.value',
    'CO2 Emission (GPS-based).value',
]

# Run the flagging steps one after another; each one adds its flag column(s)
# to df_tracks and prints a short summary
separator = '---------------------------'
flag_faulty_percentages(df_tracks)
print(separator)
flag_implausible_negative_values(df_tracks, listNonNegative)
print(separator)
flag_outlier_in_sample(df_tracks, listNonNegative, dropOutlierColumn=True)
print(separator)
flag_outlier_in_track(df_tracks, listNonNegative, dropOutlierColumn=True)

Flagged faulty percentages:  5
---------------------------
Flagged implausible negative values:  3
---------------------------
outlier_in_sample_Speed.value 0
outlier_in_sample_CO2.value 8
outlier_in_sample_Rpm.value 0
outlier_in_sample_Consumption (GPS-based).value 30
outlier_in_sample_Consumption.value 8
outlier_in_sample_CO2 Emission (GPS-based).value 30
Flagged outlier in sample:  38
---------------------------
outlier_in_track_Speed.value 40
outlier_in_track_CO2.value 128
outlier_in_track_Rpm.value 9
outlier_in_track_Consumption (GPS-based).value 31
outlier_in_track_Consumption.value 128
outlier_in_track_CO2 Emission (GPS-based).value 31
Rows which contain outliers in tracks  (there may be multiple outlier in a single row) :  193


Unnamed: 0,id,time,geometry,Engine Load.value,Engine Load.unit,Calculated MAF.value,Calculated MAF.unit,Speed.value,Speed.unit,CO2.value,CO2.unit,Intake Pressure.value,Intake Pressure.unit,Rpm.value,Rpm.unit,Intake Temperature.value,Intake Temperature.unit,Consumption (GPS-based).value,Consumption (GPS-based).unit,GPS Altitude.value,GPS Altitude.unit,Throttle Position.value,Throttle Position.unit,GPS Bearing.value,GPS Bearing.unit,Consumption.value,Consumption.unit,GPS Accuracy.value,GPS Accuracy.unit,CO2 Emission (GPS-based).value,CO2 Emission (GPS-based).unit,GPS Speed.value,GPS Speed.unit,track.id,track.length,track.begin,track.end,sensor.type,sensor.engineDisplacement,sensor.model,sensor.id,sensor.fuelType,sensor.constructionYear,sensor.manufacturer,track.appVersion,track.touVersion,GPS HDOP.value,GPS HDOP.unit,GPS PDOP.value,GPS PDOP.unit,GPS VDOP.value,GPS VDOP.unit,MAF.value,MAF.unit,faulty_percentages,implausible_neg_value,outlier_in_sample,outlier_in_track_all
0,5f0ef89c00375c5a2641ef86,2020-07-15T12:37:03,POINT (7.57939 51.96766),30.459892,%,3.113889,g/s,15.978930,km/h,2.405470,kg/h,29.667201,kPa,748.952252,u/min,26.000000,c,0.936199,l/h,115.671012,m,13.000000,%,136.590329,deg,1.023604,l/h,6.000000,%,2.200068,kg/h,17.502660,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,,,0,0,0,0
1,5f0ef89c00375c5a2641ef88,2020-07-15T12:37:09,POINT (7.57955 51.96757),49.230105,%,9.778811,g/s,11.134565,km/h,7.554102,kg/h,47.232322,kPa,1475.604745,u/min,25.652838,c,0.937457,l/h,115.280639,m,16.642229,%,134.479803,deg,3.214511,l/h,6.000000,%,2.203025,kg/h,10.324164,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,,,0,0,0,0
2,5f0ef89c00375c5a2641ef89,2020-07-15T12:37:14,POINT (7.57988 51.96740),78.649652,%,25.066406,g/s,33.976330,km/h,19.363722,kg/h,74.040426,kPa,2397.395931,u/min,23.728013,c,5.102906,l/h,114.613231,m,23.862069,%,123.313954,deg,8.239881,l/h,6.193485,%,11.991830,kg/h,30.967132,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,,,0,0,0,1
3,5f0ef89c00375c5a2641ef8a,2020-07-15T12:37:19,POINT (7.58049 51.96715),31.200400,%,7.419664,g/s,39.000000,km/h,5.731668,kg/h,30.075758,kPa,1748.565672,u/min,24.000000,c,2.280491,l/h,113.835218,m,15.351261,%,125.035780,deg,2.439007,l/h,6.000000,%,5.359154,kg/h,36.898346,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,,,0,0,0,0
4,5f0ef89c00375c5a2641ef8b,2020-07-15T12:37:24,POINT (7.58107 51.96682),29.735773,%,3.106679,g/s,34.321667,km/h,2.399900,kg/h,29.000000,kPa,761.854074,u/min,25.000000,c,0.946257,l/h,113.502384,m,13.000000,%,133.482068,deg,1.021234,l/h,7.489919,%,2.223704,kg/h,34.447545,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,,,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
425,5a4563e644ea85087e0f85f7,2017-12-28T16:47:33,POINT (7.58696 51.96556),64.771711,%,11.119284,g/s,6.095302,km/h,8.589612,kg/h,60.770588,kPa,1218.314924,u/min,6.000000,c,,,106.999997,m,17.503760,%,256.799980,deg,3.655154,l/h,4.383523,%,,,5.399999,km/h,5a4563e644ea85087e0f844b,54.297804,2017-12-28T16:11:58Z,2017-12-28T16:47:53Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,0.900000,precision,1.200000,precision,0.800000,precision,,,0,0,0,0
426,5a4563e644ea85087e0f85f8,2017-12-28T16:47:38,POINT (7.58695 51.96545),31.188777,%,3.397863,g/s,4.893895,km/h,2.624839,kg/h,29.907821,kPa,756.479745,u/min,6.000000,c,,,107.000003,m,13.000000,%,177.399999,deg,1.116953,l/h,5.000000,%,,,6.300000,km/h,5a4563e644ea85087e0f844b,54.297804,2017-12-28T16:11:58Z,2017-12-28T16:47:53Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,0.700000,precision,1.000000,precision,0.700000,precision,,,0,0,0,0
427,5a4563e644ea85087e0f85f9,2017-12-28T16:47:43,POINT (7.58691 51.96535),30.945741,%,3.400678,g/s,13.188079,km/h,2.627013,kg/h,29.615074,kPa,764.590528,u/min,6.000000,c,,,111.431636,m,13.000000,%,182.143171,deg,1.117878,l/h,4.863265,%,,,12.600000,km/h,5a4563e644ea85087e0f844b,54.297804,2017-12-28T16:11:58Z,2017-12-28T16:47:53Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,0.735595,precision,1.035595,precision,0.735595,precision,,,0,0,0,0
428,5a4563e644ea85087e0f85fa,2017-12-28T16:47:48,POINT (7.58693 51.96523),34.319810,%,4.227173,g/s,7.000000,km/h,3.265478,kg/h,28.999999,kPa,970.573035,u/min,6.000000,c,,,111.000000,m,14.000000,%,174.454697,deg,1.389565,l/h,23.265000,%,,,9.000000,km/h,5a4563e644ea85087e0f844b,54.297804,2017-12-28T16:11:58Z,2017-12-28T16:47:53Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,0.900000,precision,1.200000,precision,0.800000,precision,,,0,0,0,0


In [8]:
# Show only the rows that were flagged as an outlier within their track
df_tracks.loc[df_tracks['outlier_in_track_all'].eq(1)]

Unnamed: 0,id,time,geometry,Engine Load.value,Engine Load.unit,Calculated MAF.value,Calculated MAF.unit,Speed.value,Speed.unit,CO2.value,CO2.unit,Intake Pressure.value,Intake Pressure.unit,Rpm.value,Rpm.unit,Intake Temperature.value,Intake Temperature.unit,Consumption (GPS-based).value,Consumption (GPS-based).unit,GPS Altitude.value,GPS Altitude.unit,Throttle Position.value,Throttle Position.unit,GPS Bearing.value,GPS Bearing.unit,Consumption.value,Consumption.unit,GPS Accuracy.value,GPS Accuracy.unit,CO2 Emission (GPS-based).value,CO2 Emission (GPS-based).unit,GPS Speed.value,GPS Speed.unit,track.id,track.length,track.begin,track.end,sensor.type,sensor.engineDisplacement,sensor.model,sensor.id,sensor.fuelType,sensor.constructionYear,sensor.manufacturer,track.appVersion,track.touVersion,GPS HDOP.value,GPS HDOP.unit,GPS PDOP.value,GPS PDOP.unit,GPS VDOP.value,GPS VDOP.unit,faulty_percentages,implausible_neg_value,outlier_in_sample,outlier_in_track_all
2,5f0ef89c00375c5a2641ef89,2020-07-15T12:37:14,POINT (7.57988 51.96740),78.649652,%,25.066406,g/s,33.976330,km/h,19.363722,kg/h,74.040426,kPa,2397.395931,u/min,23.728013,c,5.102906,l/h,114.613231,m,23.862069,%,123.313954,deg,8.239881,l/h,6.193485,%,11.991830,kg/h,30.967132,km/h,5f0ef89c00375c5a2641ef84,0.665466,2020-07-15T12:37:03Z,2020-07-15T12:38:25Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,0,0,1,1
2,5f0efab600375c5a2642243e,2020-07-15T12:35:32,POINT (7.57595 51.97376),30.499422,%,3.154383,g/s,25.000000,km/h,2.436751,kg/h,29.608347,kPa,757.944002,u/min,25.112260,c,1.365977,l/h,110.464199,m,13.000000,%,149.827598,deg,1.036915,l/h,6.000000,%,3.210046,kg/h,26.798382,km/h,5f0efab600375c5a26422439,0.145549,2020-07-15T12:35:22Z,2020-07-15T12:35:37Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,0,0,0,1
3,5f0efab600375c5a2642243f,2020-07-15T12:35:37,POINT (7.57606 51.97343),58.107638,%,18.381914,g/s,31.943396,km/h,14.199972,kg/h,57.333334,kPa,2270.091696,u/min,23.689230,c,3.297252,l/h,110.918701,m,19.689291,%,186.912093,deg,6.042541,l/h,6.000000,%,7.748543,kg/h,28.774716,km/h,5f0efab600375c5a26422439,0.145549,2020-07-15T12:35:22Z,2020-07-15T12:35:37Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,0,0,0,1
0,5f0ef72f00375c5a2641e5b2,2020-07-15T12:31:51,POINT (7.58860 51.97448),61.407266,%,19.953360,g/s,48.618813,km/h,15.413909,kg/h,64.324409,kPa,2198.642054,u/min,23.999999,c,0.872994,l/h,106.158831,m,19.037705,%,307.832079,deg,6.559110,l/h,8.000000,%,2.051535,kg/h,47.506832,km/h,5f0ef72f00375c5a2641e5b0,1.048481,2020-07-15T12:31:51Z,2020-07-15T12:33:12Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,0,0,0,1
12,5f0ef72f00375c5a2641e5bf,2020-07-15T12:32:52,POINT (7.57891 51.97965),30.980392,%,3.265590,g/s,46.984327,km/h,2.522659,kg/h,30.502355,kPa,761.380457,u/min,25.000000,c,1.089048,l/h,113.295202,m,13.000000,%,318.063799,deg,1.073472,l/h,6.000000,%,2.559263,kg/h,46.874625,km/h,5f0ef72f00375c5a2641e5b0,1.048481,2020-07-15T12:31:51Z,2020-07-15T12:33:12Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,,,,,,,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180,5de9d7a63bdb691868e77eec,2019-11-28T06:57:55,POINT (7.60311 51.93492),87.931569,%,21.818858,g/s,50.034247,km/h,16.855001,kg/h,85.090713,kPa,1744.061529,u/min,12.000000,c,,,109.208207,m,24.468468,%,97.424745,deg,7.172341,l/h,5.437067,%,,,47.970981,km/h,5de9d7a63bdb691868e77e35,10.244009,2019-11-28T06:42:39Z,2019-11-28T07:05:07Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,1.081526,precision,1.681526,precision,1.340763,precision,0,0,1,1
181,5de9d7a63bdb691868e77eed,2019-11-28T06:58:00,POINT (7.60424 51.93493),84.891518,%,24.889829,g/s,59.999998,km/h,19.227316,kg/h,81.357746,kPa,2080.822040,u/min,12.000000,c,,,111.802028,m,25.532164,%,84.975911,deg,8.181837,l/h,6.000000,%,,,58.182189,km/h,5de9d7a63bdb691868e77e35,10.244009,2019-11-28T06:42:39Z,2019-11-28T07:05:07Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,1.918182,precision,2.854545,precision,2.081818,precision,0,0,1,1
183,5de9d7a63bdb691868e77eef,2019-11-28T06:58:10,POINT (7.60686 51.93507),70.340619,%,23.437779,g/s,68.920511,km/h,18.105612,kg/h,66.984328,kPa,2371.534627,u/min,11.000000,c,,,116.383026,m,21.150442,%,91.854281,deg,7.704516,l/h,3.000000,%,,,66.733609,km/h,5de9d7a63bdb691868e77e35,10.244009,2019-11-28T06:42:39Z,2019-11-28T07:05:07Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,1.000000,precision,1.600000,precision,1.244207,precision,0,0,1,1
238,5de9d7a63bdb691868e77f26,2019-11-28T07:02:50,POINT (7.64961 51.94137),89.044375,%,32.442970,g/s,34.296209,km/h,25.062094,kg/h,90.116881,kPa,2465.822605,u/min,14.000000,c,,,101.389206,m,35.256156,%,137.487164,deg,10.664721,l/h,3.631000,%,,,30.914301,km/h,5de9d7a63bdb691868e77e35,10.244009,2019-11-28T06:42:39Z,2019-11-28T07:05:07Z,car,1699,A 170,559e22c2e4b07207d8977998,gasoline,2004,Mercedes Benz,,,0.922445,precision,1.422445,precision,1.161222,precision,0,0,1,1


In [None]:
# df_tracks.index

In [None]:
# Count missing values in the combined per-track outlier flag. The column
# written by flag_outlier_in_track is 'outlier_in_track_all'; a column named
# 'outlier_in_track' is never created, so the lookup raised a KeyError.
df_tracks['outlier_in_track_all'].isna().sum()

In [None]:
# Set the implausible negative measurements to NaN. The previous selector
# addressed the placeholder columns 'col2'/'col3', which are not columns of
# df_tracks. Only the values that are actually negative are replaced, so valid
# measurements in a flagged row are kept.
# NOTE(review): assumes the columns to clean are those in listNonNegative — confirm.
flagged_rows = df_tracks['implausible_neg_value'] == 1
for variable in listNonNegative:
    df_tracks.loc[flagged_rows & (df_tracks[variable] < 0), variable] = np.nan