In [1]:
import os

import pandas as pd
from datetime import datetime

import matplotlib.pyplot as plt

from glob import glob
from tqdm import tqdm_notebook as tqdm

import pickle
import json

from io import StringIO, BytesIO

import boto3
from minio import Minio
from minio.error import ResponseError, NoSuchKey


In [2]:
# Length of each extracted window, in minutes (fed to pd.Timedelta(minutes=...) below).
WINDOW_SIZE_MIN = 1

In [3]:
# Bucket the "filtered" CSV objects are listed and read from; the extracted
# event pickles are uploaded to this same bucket.
INPUT_BUCKET_NAME = "odometryclassification"

In [4]:
# Key prefix of the uploaded event pickles (nominal windows go under "<prefix>/nominal/").
OUTPUTDIRNAME = "events1min"

In [5]:
# Load the notebook settings; the "minio_config" section is used below to build
# both the Minio and the boto3 clients.
with open("config.json", "r") as config_file:
    config = json.load(config_file)

In [6]:
# The config stores a full URL (it is handed unchanged to boto3 as endpoint_url),
# whereas Minio() takes the host part and a separate `secure` flag. Split the
# scheme off instead of only stripping a literal "http://", so that an
# "https://..." endpoint yields a valid host and a TLS connection.
_minio_cfg = config["minio_config"]
_scheme, _sep, _netloc = _minio_cfg["endpoint_url"].partition("://")
if not _sep:
    # No scheme in the configured value: keep the previous plain-HTTP behaviour.
    _scheme, _netloc = "http", _minio_cfg["endpoint_url"]

minioClient = Minio(_netloc.rstrip("/"),
                    access_key=_minio_cfg["aws_access_key_id"],
                    secret_key=_minio_cfg["aws_secret_access_key"],
                    secure=(_scheme.lower() == "https"))

In [7]:
def plot_profiles(df, metadata=None):
    """Draw the two axle speed signals and the train speed on one figure.

    Parameters
    ----------
    df : object indexable by column name
        Must provide "axle1RawSpeed", "axle2RawSpeed" and "trainSpeed".
    metadata : mapping, optional
        When given, its 'VehicleID', 'Relative Error (%)' and "Comment"
        entries are shown in the figure title.
    """
    plt.figure(figsize=(15, 5))

    if metadata is not None:
        vehicle_id = metadata['VehicleID']
        relative_error = metadata['Relative Error (%)']
        comment = metadata["Comment"]
        plt.title(f"Speed profiles - VehicleID {vehicle_id}, Relative Error = {relative_error}, reason: {comment}")

    # One line per signal, labelled with its column name.
    for signal in ("axle1RawSpeed", "axle2RawSpeed", "trainSpeed"):
        plt.plot(df[signal], label=signal)

    plt.legend()
    plt.show()

In [8]:
def save_event(df, metadata, output_name):
    """Pickle one event to a local file, creating its directory if needed.

    Parameters
    ----------
    df : object
        Event data, stored under the "data" key.
    metadata : object
        Event description, stored under the "metadata" key.
    output_name : str
        Path of the pickle file to write.
    """
    dirname = os.path.dirname(output_name)
    # Create the directory that will hold the file (not its parent). A bare file
    # name has an empty dirname, for which makedirs would raise.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(output_name, "wb") as f:
        pickle.dump({"data": df, "metadata": metadata}, f)

In [9]:
def save_event_to_minio(df, metadata, outname, bucket_name, minioclient):
    """Pickle one event and upload it as an object; empty frames are skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Event data, stored under the "data" key. Nothing is uploaded when it
        has zero rows.
    metadata : object
        Event description, stored under the "metadata" key.
    outname : str
        Object key to write.
    bucket_name : str
        Destination bucket; it is created when it does not exist yet.
    minioclient : object
        Client exposing bucket_exists, make_bucket and put_object.
    """
    if df.shape[0] == 0:
        # Nothing to store: return before touching the bucket.
        return

    pickle_byte_obj = pickle.dumps({"data": df, "metadata": metadata})

    if not minioclient.bucket_exists(bucket_name):
        minioclient.make_bucket(bucket_name)
    minioclient.put_object(bucket_name,
                           outname,
                           data=BytesIO(pickle_byte_obj),
                           length=len(pickle_byte_obj))

In [10]:
# Labelled events; the cells below read the "VehicleID" and "Timestamp" columns of each row.
windows_df = pd.read_csv("window_events.csv")

In [11]:
def get_filename_from_event_info(event):
    """Build the object key of the daily CSV that contains a labelled event.

    `event` must provide "VehicleID" (converted with int()) and "Timestamp"
    (parsed with pandas); the key has the form
    "filtered/<vehicle id>_<YYYY-MM-DD>.csv".
    """
    vehicle_id = int(event["VehicleID"])
    event_time = pd.to_datetime(event["Timestamp"])
    day = event_time.strftime("%Y-%m-%d")
    return f"filtered/{vehicle_id}_{day}.csv"

In [12]:
# One object key per labelled event, in row order (a key appears several times
# when several events fall in the same daily file).
_labelled_rows = (row for _, row in windows_df.iterrows())
windows_files = list(map(get_filename_from_event_info, _labelled_rows))

In [13]:
# boto3 client pointed at the same endpoint as the Minio client; it is used here
# for its paginated object listing.
s3_settings = config["minio_config"]
client = boto3.client(
    "s3",
    endpoint_url=s3_settings["endpoint_url"],
    aws_access_key_id=s3_settings["aws_access_key_id"],
    aws_secret_access_key=s3_settings["aws_secret_access_key"],
    region_name=s3_settings["region_name"],
)

paginator = client.get_paginator("list_objects_v2")

# Collect every key starting with "filtered"; a page without "Contents"
# contributes nothing.
all_files = [
    entry["Key"]
    for listing_page in paginator.paginate(Bucket=INPUT_BUCKET_NAME, Prefix="filtered")
    for entry in listing_page.get("Contents", [])
]
            

In [14]:
def get_dataframe_from_minio(filename):
    """Download one CSV object from INPUT_BUCKET_NAME and return it as a DataFrame.

    The frame is indexed by the parsed "TimeStamp" column, sorted by that
    index, and its "axle1RawSpeed" / "axle2RawSpeed" columns are divided by 10.

    Parameters
    ----------
    filename : str
        Object key inside INPUT_BUCKET_NAME.

    Raises
    ------
    FileNotFoundError
        If the object does not exist. Previously the missing key was swallowed
        and the function failed on an unbound local variable.
    """
    try:
        response = minioClient.get_object(INPUT_BUCKET_NAME, filename)
    except NoSuchKey as err:
        raise FileNotFoundError(
            f"No object {filename!r} in bucket {INPUT_BUCKET_NAME!r}"
        ) from err

    # Always hand the HTTP connection back, even if reading fails.
    try:
        payload = response.read()
    finally:
        response.close()
        response.release_conn()

    df = pd.read_csv(BytesIO(payload))
    df["TimeStamp"] = pd.to_datetime(df["TimeStamp"])
    df = df.set_index("TimeStamp").sort_index()

    df["axle1RawSpeed"] /= 10
    df["axle2RawSpeed"] /= 10

    return df

In [15]:
# Cut every listed CSV into WINDOW_SIZE_MIN-minute windows and upload them:
#   * files named by a labelled event -> one window per event, starting at the
#     event timestamp, stored under OUTPUTDIRNAME/;
#   * any other file dated on or before NOMINAL_CUTOFF_DATE -> consecutive windows
#     stored under OUTPUTDIRNAME/nominal/, except windows in which more than
#     MAX_ZERO_SPEED_RATIO of the trainSpeed samples are 0.
NOMINAL_CUTOFF_DATE = pd.to_datetime("2020-03-31")
MAX_ZERO_SPEED_RATIO = 0.4
TS_FORMAT = "%Y-%m-%d--%H-%M-%S%f"
window = pd.Timedelta(minutes=WINDOW_SIZE_MIN)

# Loop invariants: a set for O(1) membership tests and the calendar day of every
# labelled event (plain date objects, compared with a date below).
labelled_files = set(windows_files)
event_days = pd.to_datetime(windows_df["Timestamp"]).dt.date

for file_name in tqdm(all_files):

    # Keys look like "filtered/<vid>_<YYYY-MM-DD>.csv".
    name_parts = file_name.split("/")[-1].split("_")
    vid = int(name_parts[0])
    dt = name_parts[1]
    if dt.endswith(".csv"):
        # Remove the suffix itself; str.strip(".csv") strips a set of characters.
        dt = dt[:-len(".csv")]
    file_day = pd.to_datetime(dt)

    is_labelled = file_name in labelled_files
    if not is_labelled and file_day > NOMINAL_CUTOFF_DATE:
        # Unlabelled files after the cutoff are not used: skip before downloading.
        continue

    df = get_dataframe_from_minio(file_name)

    # treating files listed in white paper (labelled)
    if is_labelled:
        # Compare date with date: a date == Timestamp comparison is never equal
        # on recent pandas versions, which left `events` empty.
        events = windows_df[(windows_df["VehicleID"] == vid) & (event_days == file_day.date())]

        for _, event in events.iterrows():
            begin = pd.to_datetime(event["Timestamp"])
            end = begin + window
            # Slice into a new name so later events of the same file still see
            # the whole day instead of the previous event's window.
            df_event = df.loc[begin:end]

            if df_event.shape[0] == 0:
                continue

            ts = begin.strftime(TS_FORMAT)
            save_event_to_minio(df_event, event, f"{OUTPUTDIRNAME}/{vid}_{ts}.pkl", INPUT_BUCKET_NAME, minioClient)

    # treating files for nominal
    else:
        last_ts = df.index.max()
        begin = df.index.min()
        end = begin + window

        # Windows are stepped by `window`; a trailing window reaching or passing
        # the last sample is not emitted.
        while end < last_ts:
            df_event = df.loc[begin:end].copy()

            # Share of samples whose trainSpeed is exactly 0 (NaN for an empty
            # window, which compares False and is therefore not skipped here).
            zero_ratio = (df_event["trainSpeed"] == 0).mean()
            if not zero_ratio > MAX_ZERO_SPEED_RATIO:
                ts = end.strftime(TS_FORMAT)
                save_event_to_minio(df_event, None, f"{OUTPUTDIRNAME}/nominal/{vid}_{ts}.pkl", INPUT_BUCKET_NAME, minioClient)

            begin, end = end, end + window
                

HBox(children=(IntProgress(value=0, max=729), HTML(value='')))


