In [1]:
import glob

import pandas as pd

# Let wide frames render up to 500 columns before pandas truncates them.
pd.set_option('display.max_columns', 500)

# Render floats without scientific notation so large values read as plain numbers.
# Note: '{:.0f}' also hides every decimal place of every float shown in the notebook.
pd.set_option('display.float_format', '{:.0f}'.format)

In [2]:
# Load every parquet export found in ./data and remember, per row, which file it came from
# (the file path is parsed in the preprocessing step further down).
# sorted(): glob returns matches in arbitrary order, so sort to make the row order reproducible.
file_list = sorted(glob.glob("./data/*.parquet"))
dfs = []

for file in file_list:
    df = pd.read_parquet(path=file)
    df['file'] = file
    dfs.append(df)

# ignore_index=True gives the combined frame one unique 0..n-1 index instead of
# repeating each file's own row labels.
vehicles_data = pd.concat(dfs, ignore_index=True)

# Logger operations table, read from a comma-separated file.
logger_operations = pd.read_csv(filepath_or_buffer="./data/tires_vehicle_logger_operations.csv", sep=",")

## Basic exploration

### Tires Vehicle Logger Operations

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
logger_operations.info()
# First and last five rows.
display(logger_operations.head(5))
display(logger_operations.tail(5))

In [None]:
# Number of distinct licence plates, tire ids and logger numbers, printed one per line.
for column_name in ('vehicle_licence_plate', 'tireid', 'loggerno'):
    print(logger_operations[column_name].nunique())

In [None]:
# Keep only the operations recorded for loggers 8750061 and 8750076.
# An alternative selection is by licence plate ('CU33706', 'DB30900') via
# logger_operations['vehicle_licence_plate'].isin([...]).
is_selected_logger = logger_operations['loggerno'].isin([8750061, 8750076])
operational_data = logger_operations.loc[is_selected_logger, :]

# Non-null counts of the remaining columns per logger / plate / tire / wheel position combination.
grouping_keys = ['loggerno', 'vehicle_licence_plate', 'tireid', 'wheelpos']
operational_data.groupby(by=grouping_keys).count()

### Vehicles data

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would only add a stray "None" to the cell output.
vehicles_data.info()
# First and last five rows.
display(vehicles_data.head(5))
display(vehicles_data.tail(5))

In [None]:
# Summary statistics of the vehicle data.
# Note: the notebook-wide float format '{:.0f}' set in the first cell means these
# statistics are displayed rounded to whole numbers.
vehicles_data.describe()

## Preprocessing vehicle data

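Split the `file` column (the path of the source parquet file) on underscores to extract the logger number and the creation month; the remaining parts of the file name are discarded.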

In [3]:
# Each row carries the path of the parquet file it was read from. Splitting that path on '_'
# is expected to yield exactly four parts: vin, loggerno, created_at and a trailing suffix.
vehicles_data[['vin', 'loggerno', 'created_at', 'suffix']] = vehicles_data['file'].str.split('_', expand=True)
vehicles_data = vehicles_data.drop(columns=['file', 'suffix'])
# Raw string for the regex: '\.' inside a plain string literal is an invalid escape sequence
# and triggers a warning on recent Python versions.
vehicles_data['vin'] = vehicles_data['vin'].str.replace(pat=r'^\./data/', repl='', regex=True)
# The logger number has leading zeros in the file name; strip them and convert to integer.
vehicles_data['loggerno'] = vehicles_data['loggerno'].str.lstrip('0').astype(int)
# The third file-name part is parsed as year + month ('%Y%m').
vehicles_data['created_at'] = pd.to_datetime(vehicles_data['created_at'], format='%Y%m')

# drop unnecessary columns
# ts_sec is removed only when it is an exact duplicate of ts.
# NOTE(review): if it is ever kept, the later unpivot step would treat it as a value column - confirm.
if vehicles_data['ts'].equals(vehicles_data['ts_sec']):
    vehicles_data = vehicles_data.drop(columns=['ts_sec'])
vehicles_data = vehicles_data.drop(columns=['gps_long', 'gps_lat', 'alt', 'steeringwheel_angle', 'highres', 'vin'])

vehicles_data.head(3)

Unnamed: 0,temp_outside,tachometer_km,wheelspeed_rr,wheelspeed_rl,wheelspeed_fr,wheelspeed_fl,speed,tiretemperature_rr,tiretemperature_rl,tiretemperature_fr,tiretemperature_fl,tirepressure_rr,tirepressure_rl,tirepressure_fr,tirepressure_fl,ts,ts_int,loggerno,created_at
0,,,0,0,0,0,0,3,3,127,127,2,2,2,2,2022-02-01 13:52:01,1643723521000000,8750061,2022-02-01
1,2.0,31111.0,0,0,0,0,0,127,127,127,127,6,6,6,6,2022-02-01 13:52:03,1643723523000000,8750061,2022-02-01
2,2.0,31111.0,0,0,0,0,0,127,127,127,127,6,6,6,6,2022-02-01 13:52:04,1643723524000000,8750061,2022-02-01


In [None]:
# Column dtypes and non-null counts after preprocessing.
vehicles_data.info()

In [None]:
# Distinct logger numbers parsed from the file names.
print(vehicles_data.loggerno.unique())

### Unpivoting the vehicle data

In [4]:
def translate_wheel_position(wheel_position):
    """Convert a two-character wheel code such as 'fl' into the form '1L'.

    The first character selects the leading digit: 'f' gives '1', anything
    else gives '2'. The second character is appended and the result is
    upper-cased (e.g. 'fr' -> '1R', 'rl' -> '2L').
    """
    axle_digit = '1' if wheel_position[0] == 'f' else '2'
    return (axle_digit + wheel_position[1]).upper()

In [5]:
# Unpivot: every column that is not listed in id_vars becomes one
# (variable, value) row per original record.
df_task01 = vehicles_data.melt(
    id_vars=['loggerno', 'created_at', 'temp_outside', 'tachometer_km', 'speed', 'ts', 'ts_int'],
    var_name='variable',
    value_name='value',
)

# 'variable' holds names like 'wheelspeed_rr'; split it into the measured
# parameter and the wheel code, removing the combined column in the same step.
df_task01[['parameter', 'wheelpos']] = df_task01.pop('variable').str.split('_', expand=True)

# Translate the wheel codes ('rr', 'fl', ...) into the '2R' / '1L' notation.
df_task01['wheelpos'] = df_task01['wheelpos'].apply(translate_wheel_position)

# Reorder so that the measured value is the last column.
df_task01 = df_task01[[*df_task01.columns.drop('value'), 'value']]

df_task01.head(20)

Unnamed: 0,loggerno,created_at,temp_outside,tachometer_km,speed,ts,ts_int,parameter,wheelpos,value
0,8750061,2022-02-01,,,0,2022-02-01 13:52:01,1643723521000000,wheelspeed,2R,0
1,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:03,1643723523000000,wheelspeed,2R,0
2,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:04,1643723524000000,wheelspeed,2R,0
3,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:05,1643723525000000,wheelspeed,2R,0
4,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:06,1643723526000000,wheelspeed,2R,0
5,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:07,1643723527000000,wheelspeed,2R,0
6,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:08,1643723528000000,wheelspeed,2R,0
7,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:09,1643723529000000,wheelspeed,2R,0
8,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:10,1643723530000000,wheelspeed,2R,0
9,8750061,2022-02-01,2.0,31111.0,0,2022-02-01 13:52:11,1643723531000000,wheelspeed,2R,0


In [7]:
# Shape of the unpivoted frame once exact duplicate rows are removed.
# Presumably the result seen in an earlier run: (13544760, 10)
df_task01.drop_duplicates().shape

## Joining the two datasets

In [None]:
# Inner-join the logger operations with the vehicle measurements on the logger number
# and write the first 10000 joined rows to a CSV sample file.
(
    logger_operations
    .merge(right=vehicles_data, how='inner', on='loggerno')
    .head(10000)
    .to_csv('./data/sample.csv')
)