## Collect aggregated datapoints for all ids

In [None]:
import warnings
import math
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from dateutil.relativedelta import relativedelta

from accure_io import PostgresInterface, S3Interface, SnowflakeInterface
from accure_io.s3_battery_data_reader import DataNotFoundError, S3BatteryDataReader
from accure_io._meta_data import MetaData
import accure_plot
from accure_io.s3 import list_bucket

from accure_analytics.gaps.find_gaps import find_gaps
from accure_analytics.cycle_counting.rainflow import calculate_rainflow

In [None]:
# Notebook-wide setup: interpreter/plot settings, run constants, and the
# data-access handles that the cells below share.
import sys

warnings.filterwarnings("ignore")

# NOTE(review): this imported name is overwritten by the string assignment to
# `data_version` further down and is not read in between - confirm the import
# is still needed.
from accure_analytics import data_version

sys.setrecursionlimit(10000)
plt.rcParams["figure.figsize"] = (15, 6)

# Run constants.
customer = "senec"
level = "pack"
version = "221018"
data_version = "2"
health_path = "s3://accure-production-artifacts/senec/product=reivolution/data_version=2/run_context=submit-20221004/artifact_type=result/group=FCC_monthly/"

# Data-access handles, created in the same order as before.
pi = PostgresInterface()
dc = S3Interface.get_latest_data_context(customer=customer)
s3i = S3Interface(dc)

battery_reader = S3BatteryDataReader(tenant=customer, data_version="latest")
sfi = SnowflakeInterface(customer=customer)

# Ids used for the training set.
ids_uri = f"s3://accure-sandbox-data/kyung/{customer}/id-list/aging_accure_ids_{version}.parquet"
ids = pd.read_parquet(ids_uri)

### Test set

In [None]:
# Pick the first `required_ids` batteries from the S3 listing that have meta data
# and whose first/last timestamps are at least two years (365*2 days) apart.
save_path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version=1/test-set/"
files = pd.read_parquet(f"s3://accure-sandbox-data/kyung/{customer}/s3_id_list.parquet")
accure_ids = files["filename"]
required_ids = 200
test_ids = []
for index, accure_id in enumerate(accure_ids):
    # The id is the text between the first '=' and the first '.' of the filename.
    id_begin = accure_id.index('=') + 1
    id_end = accure_id.index('.')
    candidate_id = accure_id[id_begin:id_end]
    try:
        meta = battery_reader.read_meta_data(level=level, accure_id=candidate_id)
    except DataNotFoundError:
        # No meta data for this id: skip it.
        continue
    days_in_operation = (meta.last_timestamp - meta.first_timestamp).days
    if days_in_operation >= 365*2:
        test_ids.append(candidate_id)
        print(index, f"Found {len(test_ids)} of {required_ids}")
        if len(test_ids) == required_ids:
            break



In [None]:
# Persist the selected ids so the same test set can be rebuilt later.
pd.DataFrame({'accure_id': test_ids}).to_parquet(f"s3://accure-sandbox-data/kyung/{customer}/test_id_list_200.parquet")

# For every test id: load the time series, compute (and store) the rainflow
# result, then write one row of aggregated indicators per FCC point to
# `save_path`.
for index, battery_id in enumerate(test_ids):
    df = pd.DataFrame()
    meta = battery_reader.read_meta_data(level=level, accure_id=battery_id)
    time_start = meta.first_timestamp
    time_end = meta.last_timestamp
    ts_data = s3i.get_timeseries_s3(level=level, accure_id=battery_id, time_start=time_start, time_end=time_end)
    ts_data = ts_data[~ts_data.index.duplicated()]
    # Drop rows with missing or zero voltage, or missing current. The comparison
    # must be parenthesised: `|` binds tighter than `==`, so without the
    # parentheses the expression is evaluated as `(a | voltage) == (0 | c)`.
    invalid = (
        ts_data["voltage"].isna()
        | (ts_data["voltage"] == 0)
        | ts_data["current"].isna()
    )
    ts_data = ts_data[~invalid]
    nom_cap = meta.configurations.iloc[-1]["customer_datasheet"]['agg_capacity_design']
    fcc_df = pd.read_parquet(f'{health_path}FCC_accure_id={battery_id}.parquet')
    soh = fcc_df['FCC_POINTS'] / nom_cap
    soc = ts_data['state_of_charge']
    current = ts_data['current']  # positive = charge
    # rainflow
    measurement_gaps = find_gaps(time_index=current.index)
    rainflow = calculate_rainflow(soc=soc, current=current, gaps=measurement_gaps, idle_current_threshold_a=0)
    rainflow.to_parquet(f"s3://accure-sandbox-data/kyung/{customer}/rainflow/id={battery_id}.parquet")
    # Central difference: the value at label k is (soh[k] - soh[k-2]) / 2, which
    # is paired below with the SOH value positioned between those two points.
    cd = soh.diff(periods=2).dropna() / 2
    df["dsoh"] = cd.to_list()
    df["age"] = cd.index
    # Positional slice: drop the first and the last SOH point.
    df["soh"] = soh.iloc[1:-1].to_list()
    for i in cd.index:
        # Calendar month of this FCC point: label i maps to the month that lies
        # i-1 months after the first timestamp.
        date = time_start + relativedelta(months=i - 1)
        start = f"{date.year}-{date.month}-01"
        end = pd.to_datetime(start) + relativedelta(months=1)
        end = f"{end.year}-{end.month}-01"
        data = ts_data[(ts_data.index >= start) & (ts_data.index < end)]
        rf = rainflow[(rainflow["time_start"] >= start) & (rainflow["time_end"] < end)]
        row = df["age"] == i
        df.loc[row, "month"] = date.month
        # indicators
        df.loc[row, "temp_median"] = data["temperature2"].median()
        df.loc[row, "volt_median"] = data["voltage"].median()
        df.loc[row, "curr_median"] = data["current"].median()
        df.loc[row, "temp_spread"] = data["temperature2"].max() - data["temperature2"].min()
        df.loc[row, "dod"] = rf["dod"].sum()
        df.loc[row, "dod/h"] = (rf["dod"] / rf["duration_h"]).sum()
        # Range of the discharge counter plus range of the charge counter
        # within the month.
        df.loc[row, "energy_total"] = (
            data["discharge_energy"].max() - data["discharge_energy"].min()
            + data["charge_energy"].max() - data["charge_energy"].min()
        )
    df["accure_id"] = battery_id
    df.to_parquet(f"{save_path}id={battery_id}.parquet")
    print(f"{index} Processed and saved id: {battery_id}")


### Training set

In [None]:
# For every training id: load the time series, reuse the stored rainflow result
# when one exists (otherwise compute and store it), then write one row of
# aggregated indicators per FCC point to `save_path`.
save_path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version=1/training-set/"
for index, battery_id in enumerate(ids["accure_id"]):
    df = pd.DataFrame()
    meta = battery_reader.read_meta_data(level=level, accure_id=battery_id)
    time_start = meta.first_timestamp
    time_end = meta.last_timestamp
    ts_data = s3i.get_timeseries_s3(level=level, accure_id=battery_id, time_start=time_start, time_end=time_end)
    ts_data = ts_data[~ts_data.index.duplicated()]
    # Drop rows with missing or zero voltage, or missing current. The comparison
    # must be parenthesised: `|` binds tighter than `==`, so without the
    # parentheses the expression is evaluated as `(a | voltage) == (0 | c)`.
    invalid = (
        ts_data["voltage"].isna()
        | (ts_data["voltage"] == 0)
        | ts_data["current"].isna()
    )
    ts_data = ts_data[~invalid]
    nom_cap = meta.configurations.iloc[-1]["customer_datasheet"]['agg_capacity_design']
    fcc_df = pd.read_parquet(f'{health_path}FCC_accure_id={battery_id}.parquet')
    soh = fcc_df['FCC_POINTS'] / nom_cap
    soc = ts_data['state_of_charge']
    current = ts_data['current']  # positive = charge
    # rainflow
    rainflow_uri = f"s3://accure-sandbox-data/kyung/{customer}/rainflow/id={battery_id}.parquet"
    try:
        rainflow = pd.read_parquet(rainflow_uri)
    except FileNotFoundError:
        measurement_gaps = find_gaps(time_index=current.index)
        rainflow = calculate_rainflow(soc=soc, current=current, gaps=measurement_gaps, idle_current_threshold_a=0)
        rainflow.to_parquet(rainflow_uri)
    # Central difference: the value at label k is (soh[k] - soh[k-2]) / 2, which
    # is paired below with the SOH value positioned between those two points.
    cd = soh.diff(periods=2).dropna() / 2
    df["dsoh"] = cd.to_list()
    df["age"] = cd.index
    # Positional slice: drop the first and the last SOH point.
    df["soh"] = soh.iloc[1:-1].to_list()  # don't include in model
    for i in cd.index:
        # Calendar month of this FCC point: label i maps to the month that lies
        # i-1 months after the first timestamp.
        date = time_start + relativedelta(months=i - 1)
        start = f"{date.year}-{date.month}-01"
        end = pd.to_datetime(start) + relativedelta(months=1)
        end = f"{end.year}-{end.month}-01"
        data = ts_data[(ts_data.index >= start) & (ts_data.index < end)]
        rf = rainflow[(rainflow["time_start"] >= start) & (rainflow["time_end"] < end)]
        row = df["age"] == i
        df.loc[row, "month"] = date.month
        # indicators
        df.loc[row, "temp_median"] = data["temperature2"].median()  # upper,lower quantile, mean
        df.loc[row, "temp_spread"] = data["temperature2"].max() - data["temperature2"].min()
        df.loc[row, "volt_median"] = data["voltage"].median()
        df.loc[row, "curr_median"] = data["current"].median()
        df.loc[row, "dod"] = rf["dod"].sum()
        df.loc[row, "dod/h"] = (rf["dod"] / rf["duration_h"]).sum()
        # Range of the discharge counter plus range of the charge counter
        # within the month.
        df.loc[row, "energy_total"] = (
            data["discharge_energy"].max() - data["discharge_energy"].min()
            + data["charge_energy"].max() - data["charge_energy"].min()
        )
    df["accure_id"] = battery_id
    df.to_parquet(f"{save_path}id={battery_id}.parquet")
    print(f"{index} Processed and saved id: {battery_id}")


## Statistical plots

In [None]:
# Load the processed per-id parquet files of the training and the test set.
# The frames are collected first and concatenated once; concatenating inside
# the loop copies the growing frame on every iteration.
path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version={data_version}/training-set/"
files = list_bucket(path)
train_frames = [
    pd.read_parquet(f"{path}{file}")
    for file in files["filename"]
    if file.endswith(".parquet")
]
# An empty listing yields an empty frame (pd.concat rejects an empty list).
df = pd.concat(train_frames, ignore_index=True) if train_frames else pd.DataFrame()
df.dropna(inplace=True)

path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version={data_version}/test-set/"
files = list_bucket(path)
test_frames = [
    pd.read_parquet(f"{path}{file}")
    for file in files["filename"]
    if file.endswith(".parquet")
]
df_test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()
df_test

In [None]:
# Split the training frame into the regression target (`dsoh`) and the
# candidate features (everything except the id, the target and `soh`).
features = df.drop(columns=["accure_id", "dsoh", "soh"])
target = df["dsoh"]
target

In [None]:
# Distribution of the target, then box plots grouped by calendar month and age.
target.plot(kind='box')
# NOTE(review): 'temp_mean' is not written by the version=1 extraction cells in
# this notebook (they write 'temp_median'); presumably it exists in the
# version=2 data loaded above - confirm.
for feature_name in ('temp_mean', 'temp_spread'):
    features.boxplot(by='month', column=[feature_name])
df.boxplot(by='month', column=['dsoh'])
df.boxplot(by='age', column=['dsoh'])

In [None]:
# One scatter plot of `dsoh` against each candidate feature, in a fixed order.
print(features.columns)
scatter_features = (
    'temp_mean', 'temp_95th', 'temp_5th', 'temp_spread',
    'volt_mean', 'volt_95th', 'volt_5th',
    'curr_mean', 'curr_95th', 'curr_5th',
    'power', 'dod', 'dod/h',
)
for feature_name in scatter_features:
    df.plot.scatter(y='dsoh', x=feature_name)
df.plot.scatter(y='dsoh', x='energy_total')

## Linear Regression

In [None]:
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sklearn.metrics as metrics
from sklearn import preprocessing
from accure_analytics.utils.error_metrics import mean_squared_error as rms

In [None]:
# Load the data for the regression experiments.
# NOTE(review): unlike the paths used elsewhere in this notebook, these have no
# `version=` component - confirm this is the intended location.
path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/training-set/"
df = pd.read_parquet(path, engine='fastparquet')

path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/test-set/"
files = list_bucket(path)
# Collect the per-id frames and concatenate once; concatenating inside the loop
# copies the growing frame on every iteration.
test_frames = [
    pd.read_parquet(f"{path}{file}")
    for file in files["filename"]
    if file.endswith(".parquet")
]
# An empty listing yields an empty frame (pd.concat rejects an empty list).
df_test = pd.concat(test_frames, ignore_index=True) if test_frames else pd.DataFrame()

# Rows with any missing value are removed from both sets.
df_test.dropna(inplace=True)
df.dropna(inplace=True)

### Split by id on 'good data'

In [None]:
# Hold out the first `n_test` ids of the training frame, fit a linear model on
# the remaining ids and report its error on the held-out ids.
n_test = 10
held_out_ids = df["accure_id"].unique()[0:n_test]
is_held_out = df["accure_id"].isin(held_out_ids)
test = df[is_held_out]
train = df[~is_held_out]
y_train = train["dsoh"]
x_train = train.drop(["accure_id", "dsoh", "soh"], axis=1)
x_test = test.drop(["accure_id", "dsoh", "soh"], axis=1)
y_test = test["dsoh"]
# The `normalize` argument was removed from LinearRegression in scikit-learn
# 1.2. Ordinary least squares is invariant to feature scaling, so dropping it
# leaves the predictions unchanged up to numerical precision.
reg = LinearRegression()
reg.fit(x_train, y_train)
pred = reg.predict(x_test)
print("Mean absolute error: %.6f %%" % (metrics.mean_absolute_error(y_test, pred)*100))
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
print("RMSE: ",math.sqrt(metrics.mean_squared_error(y_test, pred))*100,"%")
# Coefficients divided by the mean of their feature on the held-out rows,
# aligned by column name instead of positional integer lookup.
relative_coef = pd.Series(reg.coef_, index=x_test.columns) / x_test.mean()
for feature_name, value in relative_coef.items():
    print("Feature: ", feature_name, "=", value)
plt.bar(x_test.columns, relative_coef)

In [None]:
from accure_analytics.utils.error_metrics import mean_squared_error as rms

# Per-id evaluation of `reg`: predict the monthly SOH change, accumulate it from
# the first actual SOH value, and compare the resulting curve with the actual one.
# NOTE(review): `rms` is an alias of the project's `mean_squared_error`; the
# variable names assume it returns a root-mean-square value - confirm.
rows = []
plt.figure(figsize=(10,30))
for index, battery_id in enumerate(test['accure_id'].unique()):
    per_id = test[test["accure_id"] == battery_id]
    actual = per_id['soh'].reset_index(drop=True)
    soh_start = actual.iloc[0]
    # `reg` was fitted without the `soh` column, so it has to be dropped here as
    # well; otherwise the feature set does not match the fitted model.
    x = per_id.drop(["accure_id", "dsoh", "soh"], axis=1)
    y = reg.predict(x)
    # Running sum of the predicted changes, starting from the first actual SOH.
    pred_soh = [soh_start]
    for step in y[1:]:
        pred_soh.append(pred_soh[-1] + step)
    rmse_d = rms(per_id['dsoh'], y)*100
    rmse_s = rms(actual, pred_soh)*100
    rows.append({"accure_id": battery_id, "RMSE_target": rmse_d, "RMSE_soh": rmse_s})
    plt.subplot(n_test,1,index+1)
    plt.plot(np.array(pred_soh)*100,label='prediction')
    plt.plot(actual*100,label='actual')
    plt.ylabel("SOH")
    plt.xlabel("Age (Month)")
    plt.legend()
# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
result = pd.DataFrame(rows, columns=['accure_id','RMSE_target','RMSE_soh'])
display(result)
# Average only the error columns; the id column is not a metric.
result[['RMSE_target','RMSE_soh']].mean()

### Randomized train/test split within the 50 'good data' set

In [None]:
# Random 70/30 row split of the training frame, followed by a linear fit.
# NOTE(review): `soh` stays in the feature matrix here, while the extraction
# cell marks it as "don't include in model" - confirm this is intended.
y = df["dsoh"]
X = df.drop(columns=["accure_id", "dsoh"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=12, test_size=0.3)
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# model evaluation
mse = metrics.mean_squared_error(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
r2 = metrics.r2_score(y_test, predictions)
print('Mean Squared Error : ', mse)
print('Mean Absolute Error : ', mae)
print("R2 score: %.4f" % r2)
print("RMSE: %.6f %%" % (np.sqrt(mse)*100))

### Train: 50 'good data', Test: 200 'normal data'

In [None]:
# Fit on the full training frame and score on the separately loaded test frame.
# NOTE(review): `soh` is part of the features in this experiment - confirm.
train = df
test = df_test.dropna()
x_train = train.drop(columns=["accure_id", "dsoh"])
y_train = train["dsoh"]
x_test = test.drop(columns=["accure_id", "dsoh"])
y_test = test["dsoh"]
model = LinearRegression().fit(x_train, y_train)
pred = model.predict(x_test)
# Scores are reported on the test frame only.
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
print("RMSE: ",rms(y_test, pred)*100,'%')

In [None]:
# Per-id evaluation of `model`: predict the monthly SOH change, accumulate it
# from the first actual SOH value, and compare with the actual SOH curve.
# NOTE(review): `rms` is an alias of the project's `mean_squared_error`; the
# variable names assume it returns a root-mean-square value - confirm.
rows = []
for battery_id in test['accure_id'].unique():
    per_id = test[test["accure_id"] == battery_id]
    actual = per_id['soh'].reset_index(drop=True)
    soh_start = actual.iloc[0]
    x = per_id.drop(["accure_id", "dsoh"], axis=1)
    y = model.predict(x)
    # Running sum of the predicted changes, starting from the first actual SOH.
    pred_soh = [soh_start]
    for step in y[1:]:
        pred_soh.append(pred_soh[-1] + step)
    rmse_d = rms(per_id['dsoh'], y)*100
    rmse_s = rms(actual, pred_soh)*100
    rows.append({"accure_id": battery_id, "RMSE_target": rmse_d, "RMSE_soh": rmse_s})
# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
result = pd.DataFrame(rows, columns=["accure_id","RMSE_target","RMSE_soh"])
display(result)
# Average only the error columns; the id column is not a metric.
result[["RMSE_target","RMSE_soh"]].mean()

In [None]:
# Plot predicted vs. actual capacity for every second test id, `n_test` ids in
# total, and collect the per-id errors.
# NOTE(review): `rms` is an alias of the project's `mean_squared_error`; the
# variable names assume it returns a root-mean-square value - confirm.
n_test = 10
rows = []
plt.figure(figsize=(10,30))
for index, battery_id in enumerate(test['accure_id'].unique()[0:2*n_test:2]):
    per_id = test[test["accure_id"] == battery_id]
    actual = per_id['soh'].reset_index(drop=True)
    soh_start = actual.iloc[0]
    x = per_id.drop(["accure_id", "dsoh"], axis=1)
    y = model.predict(x)
    # Running sum of the predicted changes, starting from the first actual SOH.
    pred_soh = [soh_start]
    for step in y[1:]:
        pred_soh.append(pred_soh[-1] + step)
    rmse_d = rms(per_id['dsoh'], y)*100
    rmse_s = rms(actual, pred_soh)*100
    rows.append({"accure_id": battery_id, "RMSE_target": rmse_d, "RMSE_soh": rmse_s})
    # Scale the SOH curves by the design capacity from the latest configuration.
    meta = battery_reader.read_meta_data(level=level, accure_id=battery_id)
    nom_cap = meta.configurations.iloc[-1]["customer_datasheet"]['agg_capacity_design']
    plt.subplot(n_test,1,index+1)
    plt.plot(np.array(pred_soh)*nom_cap,label='prediction')
    plt.plot(actual*nom_cap,label='actual')
    plt.ylabel("Capacity Forecast")
    plt.xlabel("Age (Month)")
    plt.legend()
# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
result = pd.DataFrame(rows, columns=['accure_id','RMSE_target','RMSE_soh'])
display(result)
# Average only the error columns; the id column is not a metric.
result[['RMSE_target','RMSE_soh']].mean()

### Train and test on the 200 'bad' data

In [None]:
# Fit and evaluate on the same frame (`df_test`).
# NOTE(review): the scores below are computed on the data the model was fitted
# on - confirm this is intended.
test = df_test.dropna()
# Fit on the NaN-free rows as well: LinearRegression.fit rejects NaN, and this
# keeps the cell independent of an earlier in-place dropna on `df_test`.
train = df_test.dropna()
y_train = train["dsoh"]
x_train = train.drop(["accure_id","dsoh"],axis=1)
x_test = test.drop(["accure_id","dsoh"],axis=1)
y_test = test["dsoh"]
model = LinearRegression()
model.fit(x_train,y_train)
pred = model.predict(x_test)
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
print("RMSE: ",rms(y_test, pred)*100,'%')
# Per-id errors of the predicted change and of the accumulated SOH curve.
# NOTE(review): `rms` is an alias of the project's `mean_squared_error`; the
# variable names assume it returns a root-mean-square value - confirm.
rows = []
for battery_id in test['accure_id'].unique():
    per_id = test[test["accure_id"] == battery_id]
    actual = per_id['soh'].reset_index(drop=True)
    soh_start = actual.iloc[0]
    x = per_id.drop(["accure_id", "dsoh"], axis=1)
    y = model.predict(x)
    # Running sum of the predicted changes, starting from the first actual SOH.
    pred_soh = [soh_start]
    for step in y[1:]:
        pred_soh.append(pred_soh[-1] + step)
    rmse_d = rms(per_id['dsoh'], y)*100
    rmse_s = rms(actual, pred_soh)*100
    rows.append({"accure_id": battery_id, "RMSE_target": rmse_d, "RMSE_soh": rmse_s})
# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
result = pd.DataFrame(rows, columns=["accure_id","RMSE_target","RMSE_soh"])
display(result)
# Average only the error columns; the id column is not a metric.
result[["RMSE_target","RMSE_soh"]].mean()

In [None]:
# Plot predicted vs. actual SOH (in percent) for every second test id, `n_test`
# ids in total, with the SOH error in each subplot title.
# NOTE(review): `rms` is an alias of the project's `mean_squared_error`; the
# variable names assume it returns a root-mean-square value - confirm.
n_test = 15
rows = []
plt.figure(figsize=(10,50))
for index, battery_id in enumerate(test['accure_id'].unique()[0:n_test*2:2]):
    per_id = test[test["accure_id"] == battery_id]
    actual = per_id['soh'].reset_index(drop=True)
    soh_start = actual.iloc[0]
    x = per_id.drop(["accure_id", "dsoh"], axis=1)
    y = model.predict(x)
    # Running sum of the predicted changes, starting from the first actual SOH.
    pred_soh = [soh_start]
    for step in y[1:]:
        pred_soh.append(pred_soh[-1] + step)
    rmse_d = rms(per_id['dsoh'], y)*100
    rmse_s = rms(actual, pred_soh)*100
    rows.append({"accure_id": battery_id, "RMSE_target": rmse_d, "RMSE_soh": rmse_s})
    plt.subplot(n_test,1,index+1)
    plt.plot(np.array(pred_soh)*100,label='prediction')
    plt.plot(actual*100,label='actual')
    plt.title("RMS_soh = %.2f" %rmse_s)
    plt.legend()
# Build the frame in one step: DataFrame.append was removed in pandas 2.0.
result = pd.DataFrame(rows, columns=['accure_id','RMSE_target','RMSE_soh'])