In [10]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import math
from IPython.display import Markdown,display
import datetime
import time
import re
import os

In [37]:
#Takes a dataframe and returns it (the input dataframe itself is modified)
#Adds a column containing the time difference between each line and the previous one
def delay_col(df):
    """Add a ``delay`` column with the time elapsed since the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``recorded_at`` column whose consecutive values can be
        subtracted from one another.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place: ``delay[i]`` is
        ``recorded_at[i] - recorded_at[i-1]`` and the first row is missing
        (NaT/NaN), since it has no predecessor.
    """
    # Series.diff() computes every row-to-row difference in one pass. It is
    # positional, so it works with any index and with empty frames, and it
    # avoids element-wise chained assignment (`df['delay'][i] = ...`), which
    # does not write through when pandas Copy-on-Write is active.
    df['delay'] = df['recorded_at'].diff()
    return df


#Takes a dataframe and a time interval and returns a list of dataframes
#Divides the dataframe by ride (rides are separated by gaps longer than the given interval)
def divide_asset_by_time(df,interval):
    """Split a recording into rides wherever the time gap exceeds ``interval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows in recording order, with a ``recorded_at`` column.
    interval : same type as a difference of two ``recorded_at`` values
        (e.g. ``pandas.Timedelta``). A new ride starts at every row whose gap
        to the previous row is strictly greater than this.

    Returns
    -------
    list of pandas.DataFrame
        Consecutive positional slices of ``df`` (original index kept) that
        together cover every row. An empty frame yields an empty list; a
        single-row frame yields one single-row ride.
    """
    row_count = len(df)
    if row_count == 0:
        return []

    # diff() leaves the first row missing and a missing gap never compares
    # greater than the interval, so position 0 can never be a ride start.
    gap_too_long = (df['recorded_at'].diff() > interval).values
    ride_starts = np.flatnonzero(gap_too_long).tolist()

    # Consecutive boundaries delimit the rides; slicing is positional so the
    # result does not depend on the frame's index labels.
    bounds = [0] + ride_starts + [row_count]
    return [df.iloc[lo:hi] for lo, hi in zip(bounds[:-1], bounds[1:])]

#This function takes a dataframe and a string and returns a dataframe
#Fills forward the column in the dataframe
def fill(df,column):
    """Forward-fill the missing values of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; it is modified in place.
    column : str
        Name of the column whose gaps are filled with the last preceding
        non-missing value. Leading missing values stay missing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # .ffill() is the dedicated forward-fill method; fillna(method='ffill')
    # is deprecated in recent pandas releases.
    df[column] = df[column].ffill()
    return df
        
#This function takes a dataframe and returns another
#Input dataframe is a separate ride
#Adds a column for the distance travelled between each line and the previous one
def calcul_dist(df):
    """Add a ``distance`` column: speed times the time since the previous row.

    Parameters
    ----------
    df : pandas.DataFrame
        One ride, with ``recorded_at`` (timestamps) and ``MDI_OBD_SPEED``
        columns. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy with a fresh 0..n-1 index (``reset_index()`` keeps the old
        index as an extra column) and a ``distance`` column where
        ``distance[i] = seconds(recorded_at[i] - recorded_at[i-1])
        * MDI_OBD_SPEED[i] / 3600``. The first row has no predecessor and is
        NaN.
        NOTE(review): dividing by 3600 presumably converts a per-hour speed
        and a duration in seconds into a distance in the speed's length
        unit - confirm the unit of MDI_OBD_SPEED.
    """
    df = df.reset_index()
    # Whole-column arithmetic replaces the per-row chained assignment (which
    # does not write through under pandas Copy-on-Write) and the np.empty
    # pre-allocation that left arbitrary garbage in the first row.
    elapsed_seconds = pd.to_timedelta(df['recorded_at'].diff()).dt.total_seconds()
    df['distance'] = elapsed_seconds * df['MDI_OBD_SPEED'] / 3600
    return df

#This function takes a csv file name, a time interval and a target directory
#Takes an asset dataframe and:
#Adds a column for time delay between lines/recordings
#Then divides the asset dataframe into separate rides depending on a given interval
#Forward fills MDI_OBD_SPEED
#Adds a column for fuel consumption between lines/recordings for each ride
#Adds a column for distance traveled between lines/recordings for each ride
#Generates csv files for each ride
def _load_asset_readings(file_name, directory, n):
    """Read the asset CSV and coerce the sensor columns to numeric dtypes.

    Column 1 of the file is parsed as dates, a lone space counts as a missing
    value, and sensor readings that cannot be parsed as numbers become NaN.
    """
    df_asset = pd.read_csv(os.path.join(directory, file_name),
                           parse_dates=[1], nrows=n, na_values=' ')
    for column in ('GPS_SPEED', 'MDI_OBD_SPEED', 'MDI_OBD_RPM',
                   'MDI_OBD_ENGINE_LOAD', 'MDI_OBD_FUEL', 'ODO_FULL_METER',
                   'MDI_DASHBOARD_MILEAGE'):
        df_asset[column] = pd.to_numeric(df_asset[column], errors='coerce')
    return df_asset


def _readings_after_first_row(series):
    """Return a float copy of ``series`` with its first row set to NaN.

    The fill/difference columns only consider readings from the second row
    of a ride onwards, so the first row is masked before filling.
    NOTE(review): ignoring the first row's reading is inherited behaviour -
    confirm it is intended.
    """
    masked = series.astype('float64')
    masked.iloc[:1] = np.nan
    return masked


def _window_average_speeds(stamps, meter_filled, windows):
    """Average speed over roughly the last ``w`` seconds, for each ``w``.

    For every row ``i`` the earlier rows are scanned backwards. A row is
    taken for the first window (in the given order) whose target age it
    approaches more closely than any row seen so far; the scan stops at the
    first row that improves none of the windows. The speed is then
    ``3.6 * (meter[i] - meter[j]) / seconds(t[i] - t[j])``.
    NOTE(review): because a row is credited to one window only, a row that
    would also be the best match for a later window is not used for it -
    this mirrors the previous if/elif chain; confirm it is intended.

    Returns a dict mapping each window to a float array (NaN where no row
    was selected).
    """
    row_count = len(stamps)
    averages = {window: np.full(row_count, np.nan) for window in windows}
    if not windows:
        return averages
    for i in range(1, row_count):
        # A candidate must miss the target age by less than the target itself.
        best_miss = {window: window for window in windows}
        best_row = {}
        for j in range(i - 1, -1, -1):
            elapsed = (stamps[i] - stamps[j]).total_seconds()
            for window in windows:
                miss = abs(elapsed - window)
                if miss < best_miss[window]:
                    best_miss[window] = miss
                    best_row[window] = j
                    break
            else:
                # This row improved no window: stop scanning further back.
                break
        for window, j in best_row.items():
            elapsed = (stamps[i] - stamps[j]).total_seconds()
            averages[window][i] = 3.6 * (meter_filled[i] - meter_filled[j]) / elapsed
    return averages


def _enrich_ride(ride, overall_mean_column, windows):
    """Return a re-indexed copy of one ride with the derived columns added.

    Columns are appended in this order: ``distance_meter``, ``speed_filled``,
    ``meter_filled``, one ``avrg_speed_<w>`` per window, the running mean
    speed (named ``overall_mean_column``), ``distance_speed``,
    ``distance_cum``, ``delay`` and ``fuel``. The first row of every derived
    column is NaN.
    """
    ride = ride.reset_index(drop=True)
    row_count = len(ride)
    stamps = list(ride['recorded_at'])

    # Seconds elapsed since the previous recording.
    delay_values = np.full(row_count, np.nan)
    for i in range(1, row_count):
        delay_values[i] = (stamps[i] - stamps[i - 1]).total_seconds()
    delay = pd.Series(delay_values, index=ride.index)

    speed_seen = _readings_after_first_row(ride['MDI_OBD_SPEED'])
    meter_seen = _readings_after_first_row(ride['ODO_FULL_METER'])
    fuel_seen = _readings_after_first_row(ride['MDI_OBD_FUEL'])
    speed_filled = speed_seen.ffill()
    meter_filled = meter_seen.ffill()

    # Distance covered since the previous row at the last known speed.
    # NOTE(review): 10/36 == 1/3.6, presumably km/h * s -> metres; confirm.
    distance_speed = 10 * delay * speed_filled / 36

    # Odometer change since the previous valid odometer reading.
    ride['distance_meter'] = meter_seen - meter_filled.shift(1)
    ride['speed_filled'] = speed_filled
    ride['meter_filled'] = meter_filled
    window_speeds = _window_average_speeds(stamps, meter_filled.values, windows)
    for window in windows:
        ride['avrg_speed_{}'.format(window)] = window_speeds[window]
    # Mean of all valid raw speeds strictly before the row, computed in one
    # pass; may differ from a per-row mean in the last floating-point digits.
    ride[overall_mean_column] = ride['MDI_OBD_SPEED'].expanding().mean().shift(1)
    ride['distance_speed'] = distance_speed
    # cumsum() skips missing steps, so rows recorded before the first known
    # speed no longer turn the whole cumulative distance into NaN.
    ride['distance_cum'] = distance_speed.cumsum()
    ride['delay'] = delay
    # Fuel counter change since the previous valid fuel reading.
    ride['fuel'] = fuel_seen - fuel_seen.ffill().shift(1)
    return ride


def _write_rides(file_name, interval, directory, n, overall_mean_column, windows):
    """Load an asset file, split it into rides and write one CSV per ride."""
    df_asset = _load_asset_readings(file_name, directory, n)
    # splitext keeps multi-dot names intact and tolerates a missing extension.
    stem, extension = os.path.splitext(file_name)
    new_dir = os.path.join(directory, stem + "_" + str(n))
    # exist_ok lets the notebook be re-run; existing ride files with the same
    # name are overwritten, other files in the folder are left alone.
    os.makedirs(new_dir, exist_ok=True)
    for idx, ride in enumerate(divide_asset_by_time(df_asset, interval)):
        enriched = _enrich_ride(ride, overall_mean_column, windows)
        ride_path = os.path.join(new_dir, stem + "_" + str(idx) + extension)
        enriched.to_csv(ride_path, na_rep='nan', index=False)


def generate_rides_with_avspeed(file_name,interval,directory,n=None):
    """Split an asset CSV into rides and write each ride with derived columns.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``directory``. Its second column must be
        the ``recorded_at`` timestamp.
    interval : pandas.Timedelta
        Passed to ``divide_asset_by_time`` to decide where a ride ends.
    directory : str
        Folder containing the input file; the output folder
        ``<stem>_<n>`` is created inside it.
    n : int, optional
        Number of rows to read (``None`` reads the whole file).

    Each ride is written to ``<stem>_<ride number><extension>`` with the
    columns ``distance_meter``, ``speed_filled``, ``meter_filled``,
    ``avrg_speed_5``, ``avrg_speed_10``, ``avrg_speed_30``,
    ``avrg_speed_all``, ``distance_speed``, ``distance_cum``, ``delay`` and
    ``fuel`` appended. Returns ``None``.
    """
    _write_rides(file_name, interval, directory, n, 'avrg_speed_all', (5, 10, 30))
    return None


def generate_rides_new_infos(file_name,interval,directory,n=None):
    """Split an asset CSV into rides and write each ride with derived columns.

    Same parameters and output layout as ``generate_rides_with_avspeed`` but
    without the windowed averages: the appended columns are
    ``distance_meter``, ``speed_filled``, ``meter_filled``, ``avrg_speed``
    (running mean of the valid raw speeds before each row),
    ``distance_speed``, ``distance_cum``, ``delay`` and ``fuel``.
    Returns ``None``.
    """
    _write_rides(file_name, interval, directory, n, 'avrg_speed', ())
    return None

In [None]:
# Time one full run: asset file 487 is split into rides wherever two
# consecutive recordings are more than 5 minutes apart.
start = time.time()
generate_rides_new_infos(
    file_name="487.csv",
    interval=pd.Timedelta(minutes=5),
    directory="data/data_asset_choosed",
)
# The printed value is the elapsed wall-clock time of the call, in seconds.
print('compilation time : {}'.format(time.time() - start))
