In [1]:
import pandas as pd
# make pandas display all columns of a table instead of truncating them
pd.set_option('display.max_columns', None)
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats
import numpy as np
import math

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

# set path to import local envirocar module
import sys
import os
#sys.path.append('..')


from envirocar import TrackAPI, DownloadClient, BboxSelector, ECConfig, Inspection
# Set up the API access in three steps: a config object (optional, created
# without arguments here), a download client using it, and the track API on top.
config = ECConfig()
download_client = DownloadClient(config=config)
track_api = TrackAPI(api_client=download_client)
#inspect=Inspection()
#from .EDA.inspection import Inspection

def get_coordinates(df):
    '''
        Aim:
            add the coordinates of every geometry as plain columns

        Keyword Arguments:
            df {Geodataframe} -- point input; the values of its 'geometry'
                                 column must expose `.x` and `.y`

        Output: the same dataframe (modified in place) with the added
                columns 'lat' (taken from `.y`) and 'lng' (taken from `.x`)
    '''
    df['lat'] = df['geometry'].apply(lambda coord: coord.y)
    df['lng'] = df['geometry'].apply(lambda coord: coord.x)
    # Return the frame, as flag_outlier_in_sample does, so the result can be used directly.
    return df
        

def get_units(df):
    '''
        Aim: 
            get an overview of the variables and corresponding units
        
        Keyword Arguments: 
            df {Geodataframe} -- point input
        
        Output: list of the column names containing '.unit'. As a side effect
                every such name is printed together with the value found in
                the first row (None if the dataframe has no rows).
    '''
    unitList = list(df.filter(like='.unit').columns)
    has_rows = len(df) > 0
    for unit in unitList:
        # An empty frame has no first row; report None instead of raising IndexError.
        first_value = df[unit].iloc[0] if has_rows else None
        print(unit, first_value)
    return unitList
            
            

ModuleNotFoundError: No module named 'envirocar.EDA'

In [None]:
# Bounding box of the query, given as [min_x, min_y, max_x, max_y]
bbox = BboxSelector([
    7.554130554199218,
    51.95590322041212,
    7.590351104736328,
    51.97874790276371,
])

# Columns that are compared when looking for duplicated rows
duplicate_subset = [
    'geometry', 'Engine Load.value', 'Calculated MAF.value',
    'Speed.value', 'CO2.value', 'Intake Pressure.value', 'Rpm.value',
    'Intake Temperature.value', 'Consumption (GPS-based).value',
    'GPS Altitude.value', 'Throttle Position.value', 'GPS Bearing.value',
    'Consumption.value', 'GPS Accuracy.value',
    'CO2 Emission (GPS-based).value', 'GPS Speed.value',
    'track.length', 'track.begin', 'track.end', 'sensor.type',
    'sensor.engineDisplacement', 'sensor.model', 'sensor.id',
    'sensor.fuelType', 'sensor.constructionYear', 'sensor.manufacturer',
    'track.appVersion', 'track.touVersion', 'GPS HDOP.value',
    'GPS PDOP.value', 'GPS VDOP.value',
]

# Issue the query, then keep only the last row of every group of duplicates
track_df = track_api.get_tracks(bbox=bbox, num_results=40)
track_df = track_df.drop_duplicates(subset=duplicate_subset, keep='last')

In [None]:
track_df.head()

Get subset of numerical data

In [None]:
# Keep only the float64 columns and preview the first rows
track_df_numeric = track_df.select_dtypes(include='float64')
track_df_numeric.head()

In [None]:
# Store the row index as a regular column named 'index' and preview the result
track_df = track_df.assign(index=track_df.index)
track_df.head()

#### Inspect categorical variables
As we want to create dummy variables for the categorical variables, we will first inspect each categorical
variable.

In [None]:
# Categorical column: plot the count per category as a bar chart (a histogram bins numeric values)
track_df['sensor.manufacturer'].value_counts().plot.bar(title='sensor.manufacturer');

In [None]:
# Categorical column: plot the count per category as a bar chart (a histogram bins numeric values)
track_df['sensor.fuelType'].value_counts().plot.bar(title='sensor.fuelType');

In [None]:
# Categorical column: plot the count per category as a bar chart (a histogram bins numeric values)
track_df['sensor.model'].value_counts().plot.bar(title='sensor.model');

#### Simple Preprocessing for only numerical variables

In [None]:
# take only numerical variables
track_df_numeric = track_df.select_dtypes(['float64']).copy()
track_df_numeric['index']=track_df_numeric.index
track_df_numeric

### Preprocessing pipeline for complete dataframe

We create our preprocessing pipeline and apply it on the dataframe.
Here we do a simple median imputation and apply feature scaling in our pipeline.

In [None]:
# Numerical preprocessing: fill missing values with the column median,
# then pass every column through StandardScaler
pipeline_numerical = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# fit_transform hands back a numpy array, not a dataframe
tracksPiped = pipeline_numerical.fit_transform(track_df_numeric)

In [None]:
# Wrap the transformed array in a dataframe again, reusing the column names
# and the values of the 'index' column of the numeric input
tracksPiped = pd.DataFrame(
    data=tracksPiped,
    index=track_df_numeric['index'],
    columns=track_df_numeric.columns,
)
tracksPiped.head()

### Preprocessing pipeline columnwise
Here we apply processes on specific columns in a dataframe

Create lists of columns per data type, or for any group of columns you would like to process together

In [None]:
# Column names handled by the numerical pipeline and by the one-hot encoder, respectively
attributes_numerical = track_df_numeric.columns.tolist()
attributes_categorical = [
    'sensor.fuelType',
    'sensor.manufacturer',
    'sensor.model',
]

Create pipeline columnwise.
Here we 'reuse' the pipeline from above for the numerical variables. However, on the categorical variables
we apply the OneHotEncoder

In [None]:
# Route each group of columns to its own transformer:
# the numerical pipeline for the numeric columns, one-hot encoding for the categorical ones
numerical_step = ("num", pipeline_numerical, attributes_numerical)
categorical_step = ("cat", OneHotEncoder(), attributes_categorical)
pipeline_full = ColumnTransformer(transformers=[numerical_step, categorical_step])

Apply pipeline on dataframe
This will return an n-d numpy array

In [None]:
processedColumnwise = pipeline_full.fit_transform(track_df)

#### DF creation of numpy array
To create a dataframe from the array we need a list of appropriate names for the columns. Therefore we first create lists from the names of the categories.

In [None]:
# Take the category names from the fitted encoder instead of Series.unique():
# OneHotEncoder emits its output columns in the order of `categories_`,
# whereas unique() returns the values in order of appearance, which can attach
# the wrong names to the dummy columns.
encoder_categories = pipeline_full.named_transformers_['cat'].categories_
# attributes_categorical holds fuel type, manufacturer and model in exactly this order
fuelTypeList, manuList, modelList = [
    [attribute + '_' + str(category) for category in categories]
    for attribute, categories in zip(attributes_categorical, encoder_categories)
]

Create complete column list

In [None]:
columns1=attributes_numerical+fuelTypeList+manuList+modelList

Create a dataframe from the array and check visually whether the data and columns are correct

In [None]:
# ColumnTransformer may hand back a scipy sparse matrix when the one-hot part makes the
# combined output sparse enough; convert it so pandas receives a regular 2-D array.
if hasattr(processedColumnwise, 'toarray'):
    processedColumnwise = processedColumnwise.toarray()
processedColumnwise = pd.DataFrame(processedColumnwise, columns=columns1, index=track_df.index)
processedColumnwise.head()

#### Create custom transformer

In [None]:
def flag_outlier_in_sample(df, listOfVariableNames, dropOutlierColumn=False, setOutlierToNan=False):
    '''
        Aim: Flag values lying more than 1.5 * IQR outside the quartiles of their variable

        Input: Geodataframe (modified in place) and the names of the variables to check

        Output: The same Geodataframe with the added column 'outlier_in_sample', which is '1'
                for every row in which at least one checked variable is an outlier, and one
                column 'outlier_in_sample_<variable>' per variable (removed again when
                dropOutlierColumn is True). With setOutlierToNan=True the outlying values
                themselves are replaced by NaN. The number of flagged values is printed.
    '''
    df['outlier_in_sample'] = 0
    for variable in listOfVariableNames:
        flag_column = 'outlier_in_sample_' + variable
        df[flag_column] = 0

        # fences at 1.5 * IQR below the first and above the third quartile
        lower_quartile, upper_quartile = (df[variable].quantile(q) for q in (0.25, 0.75))
        spread = upper_quartile - lower_quartile
        lower_fence = lower_quartile - 1.5 * spread
        upper_fence = upper_quartile + 1.5 * spread

        # one mask for both sides, used for the per-variable and the overall flag
        is_outlier = (df[variable] < lower_fence) | (df[variable] > upper_fence)
        df.loc[is_outlier, flag_column] = 1
        df.loc[is_outlier, 'outlier_in_sample'] = 1
        print(flag_column, (df[flag_column].values == 1).sum())

        if setOutlierToNan == True:
            df.loc[is_outlier, variable] = np.nan

        if dropOutlierColumn == True:
            df.drop([flag_column], axis=1, inplace=True)

    flagged_rows = (df['outlier_in_sample'].values == 1).sum()
    print('Flagged outlier in sample: ', flagged_rows)
    return df


def get_coordinates(df):
    '''
        Aim:
            add the coordinates of every geometry as plain columns

        NOTE: get_coordinates is already defined in the first cell of this notebook;
              this later definition replaces it once this cell has run.

        Keyword Arguments:
            df {Geodataframe} -- point input; the values of its 'geometry'
                                 column must expose `.x` and `.y`

        Output: the same dataframe (modified in place) with the added
                columns 'lat' (taken from `.y`) and 'lng' (taken from `.x`)
    '''
    df['lat'] = df['geometry'].apply(lambda coord: coord.y)
    df['lng'] = df['geometry'].apply(lambda coord: coord.x)
    # Return the frame, as flag_outlier_in_sample does, so the result can be used directly.
    return df
        

In [None]:
class GetCoordinates(BaseEstimator, TransformerMixin):
    '''
        Aim:
            transformer that appends the coordinates of the geometries as extra columns

        Keyword Arguments:
            get_coordinates {bool} -- if True (default) latitude and longitude are appended
                                      to the input, otherwise the input is only converted
                                      to an array

        Input of transform: dataframe with a 'geometry' column whose values expose `.x` and `.y`

        Output of transform: numpy array holding the columns of X, followed by
                             the latitude (`.y`) and the longitude (`.x`) when
                             get_coordinates is True
    '''
    # The constructor must be spelled __init__; with `_init_` the keyword argument
    # was rejected and self.get_coordinates was never set.
    def __init__(self, get_coordinates=True):
        self.get_coordinates = get_coordinates

    def fit(self, X, y=None):
        # nothing is learned from the data
        return self

    def transform(self, X):
        if not self.get_coordinates:
            return np.c_[X]
        geometries = X['geometry']
        # latitude is the y coordinate, longitude the x coordinate
        lat = geometries.apply(lambda coord: coord.y)
        lng = geometries.apply(lambda coord: coord.x)
        return np.c_[X, lat, lng]
        
    #coord_add=GetCoordinates(get_coordinates=False)

In [None]:
attributes_geom=['geometry']

In [None]:
# ColumnTransformer expects transformer *instances*; passing the class itself
# (GetCoordinates without parentheses) cannot be fitted.
pipelinie=ColumnTransformer([
    ('geom', GetCoordinates(), attributes_geom),
])

In [None]:
processedGeo = pipelinie.fit_transform(track_df)