In [1]:
# Import required libraries
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig
import plotly.express as px

# Show every column when a DataFrame is displayed: max_columns=None removes
# pandas' default limit on the number of columns rendered.
pd.set_option('display.max_columns', None)

# Build the envirocar client objects used by the cells below: an ECConfig
# created with its defaults, handed to a DownloadClient, which in turn is
# passed to TrackAPI as its api_client.
# NOTE(review): presumably the default config is sufficient for querying
# public tracks without credentials — confirm against the envirocar docs.
config = ECConfig()
track_api = TrackAPI(api_client=DownloadClient(config=config))




# ------------------- data preprocessing functions ----------------


# Percentages correction
def percentages_correction(df):
    '''
        Aim:
            Set faulty percentages (values below 0 or above 100) to NaN.

        Input:
            (Geo)DataFrame with paired '<name>.unit' / '<name>.value' columns.
            A value column is treated as a percentage when its unit column
            contains '%' in at least one row, so an empty frame or a missing
            unit in the first row no longer breaks the detection.

        Output:
            The same dataframe (modified in place and also returned) in which
            every out-of-range percentage has been replaced by NaN. A short
            report is printed for each percentage column that was inspected.
    '''
    # Keep the helper column available for callers that expect it, but do not
    # wipe flags an earlier call to flag_faulty_percentages() may have stored.
    if "faulty_percentages" not in df.columns:
        df["faulty_percentages"] = np.nan

    unit_columns = df.filter(like='.unit').columns
    value_columns = df.filter(like='.value').columns

    for col in unit_columns:
        if not df[col].eq("%").any():
            continue

        # Strip only the trailing '.unit' so names that themselves contain a
        # dot still map to the right '.value' column.
        name = col.rsplit(".", 1)[0] + '.value'
        if name not in value_columns:
            continue

        out_of_range = (df[name] < 0) | (df[name] > 100)
        corrected = int(out_of_range.sum())
        if corrected:
            # Series.mask replaces the flagged entries with NaN and assigns a
            # whole new column; the former chained assignment
            # (df[name][cond] = nan) may write to a temporary copy instead.
            df[name] = df[name].mask(out_of_range)
            print('Percentages corrected : ', name, ' Count corrected:', corrected)
        else:
            print('Percentages ok : ', name)

    return df
    
    
    

def flag_faulty_percentages(df):
    '''
        Aim:
            Inspect if there are faulty percentages (percentages below 0 or above 100)

        Input:
            (Geo)DataFrame with paired '<name>.unit' / '<name>.value' columns.
            A value column is treated as a percentage when its unit column
            contains '%' in at least one row.

        Output:
            The same dataframe (modified in place and returned) with an added
            integer column 'faulty_percentages' that is 1 for every row in
            which at least one percentage value is out of range, else 0.
            The number of flagged rows is printed.
    '''
    # Work exclusively on the frame that was passed in; the function must not
    # depend on a notebook-level variable being defined.
    df["faulty_percentages"] = 0

    percentage_columns = []
    for col in df.filter(like='.unit').columns:
        if not df[col].eq('%').any():
            continue
        # Strip only the trailing '.unit' so dotted names are mapped correctly.
        name = col.rsplit(".", 1)[0] + '.value'
        # A unit column without its value counterpart has nothing to check.
        if name in df.columns:
            percentage_columns.append(name)

    for variable in percentage_columns:
        out_of_range = (df[variable] < 0) | (df[variable] > 100)
        df.loc[out_of_range, 'faulty_percentages'] = 1

    faultyPercentages = (df['faulty_percentages'].values == 1).sum()
    print('Flagged faulty percentages: ', faultyPercentages)
    return df
        

        

def flag_implausible_negative_values(df,listOfVariableNames):
    '''
        Aim: Mark rows in which a variable that should never be negative is below zero

        Input: (Geo)DataFrame and the list of column names to check

        Output: The same dataframe with an added column 'implausible_neg_value'
                holding 1 where any listed column is negative, otherwise 0
    '''
    flag_column = 'implausible_neg_value'
    df[flag_column] = 0

    for column_name in listOfVariableNames:
        below_zero = df[column_name] < 0
        df.loc[below_zero, flag_column] = 1

    flagged_rows = df[flag_column].eq(1).sum()
    print('Flagged implausible negative values: ', flagged_rows)
    return df


    
    
    
def flag_outlier_in_sample(df, listOfVariableNames, dropOutlierColumn=False):
    '''
        Aim: Flag values lying outside the 1.5 * IQR fences computed over the whole sample

        Input: (Geo)DataFrame, the list of column names to check, and a switch
               that removes the per-variable flag columns again

        Output: The same dataframe with an added column 'outlier_in_sample'
                (1 if any listed variable is an outlier in that row) and, unless
                dropped, one 'outlier_in_sample_<variable>' column per variable
    '''
    combined_flag = 'outlier_in_sample'
    df[combined_flag] = 0

    for column_name in listOfVariableNames:
        per_variable_flag = 'outlier_in_sample_' + column_name
        df[per_variable_flag] = 0

        # Tukey fences from the quartiles of the complete column.
        first_quartile = df[column_name].quantile(0.25)
        third_quartile = df[column_name].quantile(0.75)
        fence_distance = 1.5 * (third_quartile - first_quartile)
        too_low = df[column_name] < first_quartile - fence_distance
        too_high = df[column_name] > third_quartile + fence_distance

        for target in (per_variable_flag, combined_flag):
            df.loc[too_low, target] = 1
            df.loc[too_high, target] = 1

        if dropOutlierColumn == True:
            df.drop(columns=[per_variable_flag], inplace=True)

    flagged_rows = df[combined_flag].eq(1).sum()
    print('Flagged outlier in sample: ', flagged_rows)
    return df





def flag_outlier_in_track(df, listOfVariableNames, dropLimits=True, dropOutlierColumn=False,
                          trackIdColumn='track.id', iqrFactor=1.5):
    '''
        Aim: Flag outliers with regard to the distribution of each individual track

        Input:
            df: (Geo)DataFrame containing the measurements of one or more tracks
            listOfVariableNames: names of the numeric columns to check
            dropLimits: if true, remove the helper columns
                'track_lowerLimit_<variable>' / 'track_upperLimit_<variable>'
                again after flagging
            dropOutlierColumn: if true, remove the per-variable flag columns
                'outlier_in_track_<variable>' and keep only the combined flag
            trackIdColumn: column identifying the track a row belongs to
            iqrFactor: multiple of the interquartile range that defines the
                lower and upper fences (Q1 - factor * IQR, Q3 + factor * IQR)

        Output:
            The same dataframe (modified in place and returned) with an added
            column 'outlier_in_track_all' that is 1 when at least one listed
            variable lies outside the fences of its own track, plus the
            optional per-variable flag and limit columns. The number of
            flagged values per variable and the number of flagged rows are
            printed.
    '''

    def _fences(values):
        # Lower and upper fence of one track's values for one variable.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = iqrFactor * (q3 - q1)
        return q1 - spread, q3 + spread

    def low_limit(values):
        return _fences(values)[0]

    def upper_limit(values):
        return _fences(values)[1]

    df['outlier_in_track_all'] = 0
    for variable in listOfVariableNames:
        lowName = 'track_lowerLimit_' + variable
        upName = 'track_upperLimit_' + variable
        variableName = 'outlier_in_track_' + variable

        # transform() broadcasts each track's fence back onto its own rows,
        # so the limits line up row by row with the measured values.
        per_track = df.groupby(trackIdColumn)[variable]
        df[lowName] = per_track.transform(low_limit)
        df[upName] = per_track.transform(upper_limit)

        # Strict comparisons: a value equal to a fence is not an outlier.
        is_outlier = (df[upName] < df[variable]) | (df[lowName] > df[variable])
        df.loc[is_outlier, 'outlier_in_track_all'] = 1
        df[variableName] = 0
        df.loc[is_outlier, variableName] = 1
        print(variableName, (df[variableName].values == 1).sum())

        if dropLimits:
            df.drop([upName, lowName], axis=1, inplace=True)

        if dropOutlierColumn:
            df.drop([variableName], axis=1, inplace=True)

    outlier = (df['outlier_in_track_all'].values == 1).sum()
    print('Rows which contain outliers in tracks  (there may be multiple outlier in a single row) : ',outlier)
    return df


In [4]:
# Bounding box used to select tracks, given as [min_x, min_y, max_x, max_y].
# NOTE(review): the values look like WGS84 longitude/latitude in degrees — confirm.
bbox = BboxSelector([
    7.554130554199218, # min_x
    51.95590322041212, # min_y
    7.590351104736328, # max_x
    51.97874790276371  # max_y
])

# Query the envirocar API for tracks inside the bounding box.
# NOTE(review): num_results=10 presumably caps the number of tracks, not rows — confirm.
df_tracks = track_api.get_tracks(bbox=bbox, num_results=10) 
# Bare expression in the middle of a cell: it is evaluated but not displayed.
df_tracks.index
# Keep a copy of the index labels as a regular column. The index output
# displayed for this cell shows repeated labels (0..9 ... 8..17), so the
# index alone does not identify a row uniquely.
df_tracks['index'] = df_tracks.index


# Variables for which negative values are considered implausible; the same
# list is used below as the set of variables checked for outliers.
listNonNegative=['Speed.value', 'CO2.value','Rpm.value',
                 'Consumption (GPS-based).value',
                 'Consumption.value',
                 'CO2 Emission (GPS-based).value']


# The first three preprocessing steps are currently disabled; only the
# per-track outlier flagging runs, so 'faulty_percentages',
# 'implausible_neg_value' and 'outlier_in_sample' are NOT created here.
#df_faultyPerc = flag_faulty_percentages(df_tracks)
#df_negValues =  flag_implausible_negative_values(df_faultyPerc,listNonNegative)
#df_outlier = flag_outlier_in_sample(df_negValues, listNonNegative,dropOutlierColumn=True)
# flag_outlier_in_track adds its columns to the frame it receives and returns
# that same frame, so df_outlier and df_tracks refer to the same object.
# dropLimits=False keeps the per-track lower/upper limit columns.
df_outlier= flag_outlier_in_track(df_tracks, listNonNegative, dropLimits=False)
# Last expression of the cell: displayed as the cell output.
df_outlier.index


outlier_in_track_Speed.value 3
outlier_in_track_CO2.value 19
outlier_in_track_Rpm.value 6
outlier_in_track_Consumption (GPS-based).value 14
outlier_in_track_Consumption.value 19
outlier_in_track_CO2 Emission (GPS-based).value 14
Rows which contain outliers in tracks  (there may be multiple outlier in a single row) :  34


Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
             8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
           dtype='int64', length=245)

In [None]:
#df_outlier[df_outlier['outlier_in_track_all'] == 1]

In [6]:
# df_tracks.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
            ...
             8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
           dtype='int64', length=245)

In [None]:
# The combined flag column written by flag_outlier_in_track() is named
# 'outlier_in_track_all'; a column called 'outlier_in_track' is never created
# and would raise a KeyError. Count rows whose combined flag is missing.
df_tracks['outlier_in_track_all'].isna().sum()

In [None]:
# The flag column only exists after flag_implausible_negative_values() has
# run; create it here if the flagging step was skipped so the cell also works
# on a fresh kernel.
if 'implausible_neg_value' not in df_tracks.columns:
    df_tracks = flag_implausible_negative_values(df_tracks, listNonNegative)

# Blank the checked variables in every flagged row. The former column names
# 'col2' / 'col3' were placeholders that do not exist in the track data.
# NOTE(review): this clears all listed variables of a flagged row, not only
# the negative one — confirm this is the intended granularity.
df_tracks.loc[df_tracks['implausible_neg_value'] == 1, listNonNegative] = np.nan