# Corvus dataset (DNF)
Time-series data requires mini-batching.

In [None]:
import warnings
from time import time
from dateutil.relativedelta import relativedelta
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from accure_io import S3Interface
from accure_io.s3_battery_data_reader import DataNotFoundError, S3BatteryDataReader
from accure_io._meta_data import MetaData
from accure_io.s3 import list_bucket
from accure_analytics.gaps.find_gaps import find_gaps
from accure_analytics.cycle_counting.rainflow import calculate_rainflow

In [None]:
# %matplotlib qt
# General plotting defaults, applied through rcParams to the figures created
# after this cell has run.
# plt.rcParams["figure.figsize"] = (16/2.54, 16/2.54)
plt.rcParams["figure.figsize"] = (8, 6)
plt.rcParams["axes.grid"] = True
plt.rcParams["font.size"] = 11
from cycler import cycler

# Colour cycle for line plots. Every entry is written as "#RRGGBB": the raw
# `cycler` object stores the strings unchanged, so each one has to be a colour
# spec Matplotlib can resolve by itself when a line is drawn.
colors = cycler(
    "color",
    [
        "#00549F",  # 100% blue
        "#73BDFF",  # 40% blue
        "#000000",  # black
        # NOTE(review): labelled "100% red" but the value is identical to
        # 100% blue above (looks like a copy/paste slip) - confirm the
        # intended red hex code and replace it here.
        "#00549F",  # 100% red
        "#B97E00",  # 100% yellow
        "#2C9CFF",  # 60% blue
        "#C8C8C8",  # 20% gray
        "#F95265",  # 60% red
        "#F6A800",  # 60% yellow
        "#FDC5CC",  # 20% red
        "#FFDE95",  # 20% yellow
    ],
)
plt.rcParams["font.family"] = "Arial"
plt.rc("axes", facecolor="w", axisbelow=True, grid=True, prop_cycle=colors)
# plt.rc("grid", color="k", linestyle="solid", alpha =0.5)
# Hide the right and top axes spines.
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

In [None]:
import sys

# --- Runtime options --------------------------------------------------------
warnings.filterwarnings("ignore")  # suppress every warning for this session
sys.setrecursionlimit(10000)
pd.set_option("display.max_columns", None)  # no limit on displayed columns

# --- Data access ------------------------------------------------------------
level = "pack"
customer = "corvus"
dc = S3Interface.get_latest_data_context(customer=customer)
s3i = S3Interface(dc)

# Reader used for all sensor-data queries in the cells below.
# An optional `time_filter=("2015-10-01", "2015-11-19")` argument was used
# here previously and is currently disabled.
s3_reader = S3BatteryDataReader(
    tenant=customer,
    data_version="latest",
    force_onboarding_v1=True,
    environment="production",
)

# --- Storage locations ------------------------------------------------------
# Prefix under which the per-id SoH result files are read.
health_path = "s3://accure-project-data/corvus/product=soh_ocv/data_version=1/group=results/"
# Previously stored list of result file names (one "filename" column is used below).
ids = pd.read_parquet(
    f"s3://accure-sandbox-data/kyung/{customer}/id-list/s3_id_list.parquet"
)
version = "230103"
save_path = f"s3://accure-sandbox-data/kyung/{customer}/id-list/aging_accure_ids_{version}.parquet"

### Find IDs

In [None]:
from accure_io.s3 import list_bucket

# List the objects stored under the SoH results prefix and show the listing.
files = list_bucket(
    health_path,
    full_key=True,
)
files

In [None]:
# Store the "filename" column of the listing, without its first row, as the
# id list that the setup cell reads back into `ids`.
# NOTE(review): presumably the first listing entry is not a result file - confirm.
(
    files[["filename"]]
    .iloc[1:]
    .to_parquet(f"s3://accure-sandbox-data/kyung/{customer}/id-list/s3_id_list.parquet")
)

In [None]:
# Draw the `soh_c_smoothed` column of every listed result file (the first
# listing entry is skipped) onto the current axes.
for filename in files["filename"][1:]:
    result_frame = pd.read_parquet(f"{health_path}{filename}")
    soh = result_frame["soh_c_smoothed"]
    soh.plot()

In [None]:
# Show the rows whose voltage reading is zero or negative.
# NOTE(review): `ts_data` is only assigned in commented-out code in the
# visible cells - it must already exist in the kernel for this cell to run.
ts_data.loc[ts_data["voltage"] <= 0]

In [None]:
# Display the stored result file names.
ids.loc[:, "filename"]

In [None]:
# Read the sensor data of one hard-coded id at the configured level, for inspection.
s3_reader.read_sensor_data(
    level=level,
    accure_id="10416438974066452",
    drop_duplicates=True,
)

In [None]:
# 80th percentile of the voltage readings that exceed 600.
# NOTE(review): relies on `ts_data` already being defined in the kernel.
ts_data["voltage"][ts_data["voltage"] > 600].quantile(q=0.8)

## Data Preprocessing

In [None]:
# Build one training table per id: for every SoH-rate sample, read the sensor
# data of a window of calendar months, reduce it to summary indicators, and
# store the result as a parquet file under `data_path`.
data_path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/training-set/version={version}/"
period = 3  # number of rows spanned by each SoH difference
window = 2  # length, in months, of the sensor-data window per sample
volt_threshold = 1000  # voltages above this value are counted in "volt_over"


def _drop_invalid_rows(data):
    """Return *data* without duplicated index entries and without unusable rows.

    A row is unusable when its voltage is missing or not positive, or when its
    current is missing. Each comparison is parenthesised on purpose: `|` binds
    tighter than `<=`, so an unparenthesised `a | b <= 0 | c` is evaluated as
    `(a | b) <= (0 | c)` and does not yield the intended mask.
    """
    data = data[~data.index.duplicated()]
    invalid = (
        data["voltage"].isna()
        | (data["voltage"] <= 0)
        | data["current"].isna()
    )
    return data[~invalid]


def _window_features(data, season_month, volt_threshold):
    """Reduce one window of sensor data to a dict of scalar indicators.

    Args:
        data: Sensor data frame providing the columns "temperature",
            "temperature_child_min", "temperature_child_max", "voltage",
            "voltage_cell_min", "voltage_cell_max" and "current".
        season_month: Calendar month (1-12) fed into the cosine season feature.
        volt_threshold: Voltage above which a sample counts towards "volt_over".

    Returns:
        Dict mapping output column name to value, in output column order.
    """
    temperature = data["temperature"]
    voltage = data["voltage"]
    current = data["current"]
    cell_temp_min = data["temperature_child_min"]
    cell_temp_max = data["temperature_child_max"]

    # Positive current feeds the "*_chg"/charge columns, negative current the
    # "*_dsc"/discharge columns; the masks are computed once and reused.
    charging = current > 0
    discharging = current < 0
    power = voltage * current

    return {
        "season": np.cos((season_month - 2) * (np.pi / 6)),
        "temp_mean": temperature.mean(),
        "temp_98q": temperature.quantile(0.98),
        "temp_2q": temperature.quantile(0.02),
        "temp_cell_min": cell_temp_min.median(),
        "temp_cell_max": cell_temp_max.median(),
        "temp_spread": (cell_temp_max - cell_temp_min).mean(),
        "volt_mean": voltage.mean(),
        "volt_98q": voltage.quantile(0.98),
        "volt_2q": voltage.quantile(0.02),
        "volt_over": voltage[voltage > volt_threshold].count(),
        "cell_drift": (data["voltage_cell_max"] - data["voltage_cell_min"]).mean(),
        "curr_mean": current.mean(),
        "curr_mean_chg": current[charging].mean(),
        "curr_mean_dsc": current[discharging].mean(),
        "curr_use_chg": current[current > 0.5].mean(),
        "curr_use_dsc": current[current < -0.5].mean(),
        "curr_98q": current.quantile(0.98),
        "curr_2q": current.quantile(0.02),
        "power_charge_mean": power[charging].mean(),
        "power_discharge_mean": power[discharging].mean(),
    }


# The first three entries of the id list are skipped.
for index, file in enumerate(ids["filename"][3:]):
    # The id is the text between the first "=" and the first "." of the file name.
    accure_id = file[file.index("=") + 1 : file.index(".")]
    print(accure_id)

    fcc = pd.read_parquet(f"{health_path}{file}")
    time_start = fcc["time"].iloc[0]
    time_end = fcc["time"].iloc[-1]  # not used below; kept for inspection
    soh = fcc["soh_c_smoothed"]

    # Average SoH change per row over the last `period` rows.
    dsoh = soh.diff(periods=period).dropna() / period

    df = pd.DataFrame()
    df["dsoh"] = dsoh.tolist()
    df["age"] = dsoh.index
    # Same number of values as `dsoh` when only the leading `period` rows were
    # dropped; each one is taken one position before the matching `dsoh` row.
    df["soh"] = soh[period - 1 : -1].values

    for i in dsoh.index:
        # Month range of the FCC point: the index value is used as a month
        # offset from the first timestamp; the window starts on the first day
        # of that month and covers `window` months.
        date = time_start + relativedelta(months=i - 1 - window)
        start = f"{date.year}-{date.month}-01"
        end_date = pd.to_datetime(start) + relativedelta(months=window)
        end = f"{end_date.year}-{end_date.month}-01"

        # Sensor data is read per window instead of for the whole lifetime.
        data = s3_reader.read_sensor_data(
            level=level,
            accure_id=accure_id,
            drop_duplicates=True,
            time_filter=(start, end),
        )
        data = _drop_invalid_rows(data)

        # Write the indicators into the row(s) belonging to this sample.
        row_mask = df["age"] == i
        features = _window_features(data, end_date.month, volt_threshold)
        for column, value in features.items():
            df.loc[row_mask, column] = value

    df["accure_id"] = accure_id
    df.to_parquet(f"{data_path}id={accure_id}.parquet")
    print(f"{index+1} Processed and saved id: {accure_id}")
df