# Regression Models

In [None]:
import warnings
import math
from time import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from dateutil.relativedelta import relativedelta
import joblib
from sklearn.model_selection import cross_validate

from accure_io import PostgresInterface, S3Interface
from accure_io.s3_battery_data_reader import DataNotFoundError, S3BatteryDataReader
from accure_io._meta_data import MetaData
from accure_io.s3 import list_bucket

from accure_analytics.gaps.find_gaps import find_gaps
from accure_analytics.cycle_counting.rainflow import calculate_rainflow
warnings.filterwarnings("ignore")

In [None]:
%matplotlib qt
"""General parameters applied as default parameters to the majority of the following functions"""
# plt.rcParams["figure.figsize"] = (16/2.54, 16/2.54)
plt.rcParams["figure.figsize"] = (8, 6)
plt.rcParams["axes.grid"] = True
plt.rcParams["font.size"] = 11
from cycler import cycler
colors = cycler(
    "color",[
        "#000000", # black
        "#00549F", # 100% blue
        "#73BDFF", # 40% blue
        "#990516", # 100% red
        "#B97E00", # 100% yellow
        "#C8C8C8", # 20% gray
        "#FFDE95", # 20% yellow
        "#FDC5CC", # 20% red
        "#2C9CFF", # 60% blue
        "#F95265", # 60% red
        "#F6A800", # 60% yellow
    ],
)
plt.rc("axes", facecolor="w", axisbelow=True, grid=True, prop_cycle=colors)
plt.rcParams["font.family"] = "Arial"
# plt.rc("grid", color="k", linestyle="solid", alpha =0.5)
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.spines.top'] = False

In [None]:
import sys
# Raise the interpreter's recursion limit for the rest of the session.
sys.setrecursionlimit(10000)
# plt.style.use("seaborn-muted")
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Data level and tenant passed to every reader call below.
level = "pack"
customer = "senec"
# Interfaces used to read metadata and time series for one battery at a time.
dc = S3Interface.get_latest_data_context(customer=customer)
s3i = S3Interface(dc)
battery_reader = S3BatteryDataReader(tenant=customer, data_version="latest")

# Folder holding the per-id FCC parquet files read in the data-processing cell.
health_path = "s3://accure-production-artifacts/senec/product=reivolution/data_version=2/run_context=submit-20221204/artifact_type=result/group=FCC_monthly/"

# Version tag selecting both the id list and the model-data folders.
version = "221205"
ids = pd.read_parquet(f"s3://accure-sandbox-data/kyung/{customer}/id-list/aging_accure_ids_{version}.parquet")

## Data Processing

In [None]:
# Build one feature table per battery and store it as parquet under data_path.
# Each row pairs an SOH change (dsoh) with operating statistics aggregated over
# a `window`-month slice of the time series.
data_path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version={version}/training-set/"
period = 3  # number of SOH points spanned by each difference
window = 2  # length in months of the time-series slice aggregated per row
# NOTE: the first 25 ids are skipped; the printed counter is relative to that slice.
for index, accure_id in enumerate(ids["accure_id"][25:]):
    df = pd.DataFrame()
    meta = battery_reader.read_meta_data(level=level, accure_id=accure_id)
    time_start = meta.first_timestamp
    time_end = meta.last_timestamp
    ts_data = s3i.get_timeseries_s3(level=level, accure_id=accure_id, time_start=time_start, time_end=time_end)
    ts_data = ts_data[~ts_data.index.duplicated()]
    # Each comparison needs its own parentheses: `|` binds tighter than `==`,
    # so the unparenthesised form compared two OR-ed expressions instead of
    # flagging missing/zero voltage and missing current.
    invalid = (
        ts_data["voltage"].isna()
        | (ts_data["voltage"] == 0)
        | ts_data["current"].isna()
    )
    ts_data = ts_data[~invalid]
    nom_cap = meta.configurations.iloc[-1]["customer_datasheet"]['agg_capacity_design']
    # FCC points expressed as a percentage of the design capacity.
    soh = pd.read_parquet(f'{health_path}FCC_accure_id={accure_id}.parquet')['FCC_POINTS']/nom_cap*100
    soc = ts_data['state_of_charge']
    current = ts_data['current'] # positive = charge
    # Rainflow cycles are always recomputed and the stored copy overwritten.
    measurement_gaps = find_gaps(time_index=current.index)
    rainflow = calculate_rainflow(soc=soc,current=current,gaps=measurement_gaps, idle_current_threshold_a=0)
    rainflow.to_parquet(f"s3://accure-sandbox-data/kyung/{customer}/rainflow/id={accure_id}.parquet")
    # Target: average SOH change per point over `period` points.
    dsoh = (soh.diff(periods=period).dropna()/period)
    df["dsoh"] = dsoh.to_list()
    df["age"] = dsoh.index
    df["soh"] = soh[period-1:-1].to_list()
    for i in dsoh.index:
        # month range of FCC point
        date = time_start + relativedelta(months=i-1-window)
        start = f"{date.year}-{date.month}-01"
        end_date = pd.to_datetime(start)+relativedelta(months=window)
        end = f"{end_date.year}-{end_date.month}-01"
        data = ts_data[(ts_data.index>=start) & (ts_data.index<end)]
        rf = rainflow[(rainflow["time_start"]>=start)&(rainflow["time_end"]<end)]
        temp = data['temperature1']
        volt = data[data["voltage"]>5]["voltage"]
        charging = data[data["current"]>0]
        discharging = data[data["current"]<0]
        # Insertion order of this dict defines the column order of the output.
        features = {
            # cosine encoding of the month in which the window ends
            "season": np.cos((end_date.month-2)*(np.pi/6)),
            "temp_mean": temp.mean(),
            "temp_98q": temp.quantile(0.98),
            "temp_2q": temp.quantile(0.02),
            "temp_spread": temp.max()-temp.min(),
            "volt_mean": volt.mean(),
            "volt_98q": volt.quantile(0.98),
            "volt_2q": volt.quantile(0.02),
            "curr_mean": data["current"].mean(),
            "curr_mean_chg": charging["current"].mean(),
            "curr_mean_dsc": discharging["current"].mean(),
            "curr_use_chg": data[data["current"]>0.5]["current"].mean(),
            "curr_use_dsc": data[data["current"]<-0.5]["current"].mean(),
            "curr_98q": data["current"].quantile(0.98),
            "curr_2q": data["current"].quantile(0.02),
            "power_charge_mean": (charging["voltage"]*charging["current"]).mean(),
            "power_discharge_mean": (discharging["voltage"]*discharging["current"]).mean(),
            "dod": rf["dod"].sum(),
            "dod/h": (rf["dod"]/rf["duration_h"]).sum(),
            "energy_total": (data["discharge_energy"].max()-data["discharge_energy"].min()+data["charge_energy"].max()-data["charge_energy"].min()),
        }
        # "age" is never modified in this loop, so the row mask is built once.
        row = df["age"]==i
        for column, value in features.items():
            df.loc[row, column] = value
    df["accure_id"] = accure_id
    df.to_parquet(f"{data_path}id={accure_id}.parquet")
    print(f"{index+1} Processed and saved id: {accure_id}")


## Load data

In [None]:
# Gather every per-id parquet file of the training set into one DataFrame.
path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version={version}/training-set/"
files = list_bucket(path)
# Read all files first and concatenate once; concatenating inside the loop
# copies the accumulated frame again for every file.
frames = [
    pd.read_parquet(f"{path}{file}")
    for file in files["filename"]
    if file.endswith(".parquet")
]
# pd.concat rejects an empty list, so fall back to an empty frame.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
df.dropna(inplace=True)
# Local cache that the "Saved data" cell reloads.
df.to_parquet(f"/Users/kenny/accure_local/senec/training_{version}.parquet")

In [None]:
# Gather every per-id parquet file of the test set into one DataFrame.
path = f"s3://accure-sandbox-data/kyung/{customer}/model-data/version={version}/test-set/"
files = list_bucket(path)
# Read all files first and concatenate once; concatenating inside the loop
# copies the accumulated frame again for every file.
frames = [
    pd.read_parquet(f"{path}{file}")
    for file in files["filename"]
    if file.endswith(".parquet")
]
# pd.concat rejects an empty list, so fall back to an empty frame.
df_test = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
# Show where values are missing before the incomplete rows are dropped.
display(df_test.isna())
df_test.dropna(inplace=True)
# df.to_parquet("/Users/kenny/accure_local/senec/test.parquet")

### Saved data

In [None]:
# Reload the training set from the local parquet file written by the "Load data" cell.
df = pd.read_parquet(f"/Users/kenny/accure_local/senec/training_{version}.parquet")
# df_test = pd.read_parquet("/Users/kenny/accure_local/senec/test.parquet")

In [None]:
# Summary statistics of every column except the id, one column per row.
feature_stats = df.drop(columns=["accure_id"]).describe()
feature_stats.T

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sklearn.metrics as metrics
from accure_analytics.utils.error_metrics import mean_squared_error as rms

In [None]:
# Hold out a verification set by id, then split the remaining rows randomly.
from sklearn.preprocessing import StandardScaler

# Identifier and target columns that must not be used as features.
# "month" is tolerated when missing (errors="ignore"): the data-processing
# cell currently has that column commented out, and a plain drop would raise
# KeyError on such data.
non_feature_cols = ["accure_id","dsoh","soh","month"]

# 10 % of the ids are held out for verification.
verif_id = ids["accure_id"].sample(frac=0.1,random_state=20)
is_verif = df["accure_id"].isin(verif_id)
verif = df[is_verif]
x_verif = verif.drop(columns=non_feature_cols, errors="ignore")
y_verif = verif["dsoh"].values

train_set = df[~is_verif]
x = train_set.drop(columns=non_feature_cols, errors="ignore")
y = train_set["dsoh"].values

# Row-shuffled copy used by the cross-validation cells.
shuffle_set = train_set.sample(frac=1, random_state=12)
x_shuffle = shuffle_set.drop(columns=non_feature_cols, errors="ignore")
y_shuffle =  shuffle_set["dsoh"].values

# NOTE(review): the scaler is fitted on all of x before the split below, so
# the test rows contribute to the scaling statistics.
sc = StandardScaler()
x_scaled = sc.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled, y, test_size=0.2, random_state=12)


In [None]:
# split by ids: every row of a battery lands on the same side of the split
# NOTE: this overwrites x_train/x_test/y_train/y_test with unscaled features.
test_id = train_set["accure_id"].drop_duplicates().sample(frac=0.25,random_state=200)
in_test = train_set["accure_id"].isin(test_id)
test = train_set[in_test]
train = train_set[~in_test]
# errors="ignore" keeps the drop working when "month" is absent; the
# data-processing cell currently has that column commented out.
drop_cols = ["accure_id","dsoh","soh","month"]
x_train = train.drop(columns=drop_cols, errors="ignore")
x_test = test.drop(columns=drop_cols, errors="ignore")
y_train = train["dsoh"].values
y_test = test["dsoh"].values

## Visualization

In [None]:
# Pairwise scatter matrix of target and features, KDE curves on the diagonal.
pairplot_data = df.drop(columns=["accure_id","soh"])
sns.pairplot(pairplot_data, diag_kind='kde')

In [None]:
# Scatter each input feature against the target on a 30 % sample of the rows.
data = df.drop(columns=["accure_id","soh"]).sample(frac=0.3,random_state=200)
y = data['dsoh']
feat = data.drop(columns=["dsoh"])
fig, axs = plt.subplots(7,3,figsize=(7,20),constrained_layout=True)
fig.supylabel("∆SOH")
fig.suptitle("Dataset Alpha Scatter Plot")
# axs.flat walks the 7x3 grid row by row, matching feature index 3*row+column.
for position, panel in enumerate(axs.flat):
    col = feat.columns[position]
    panel.scatter(feat[col],y,c="#00549F",s=0.5,label=col)
    panel.set_xlabel(col)
plt.savefig("/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/figures/corr_senec.svg")

## Linear Regression

In [None]:
import joblib
from sklearn.linear_model import LinearRegression
# Ordinary least squares on the current train/test split.
# The `normalize` argument is gone from scikit-learn 1.2 onwards; it is
# dropped here, matching the LinearRegression() used in the "Combined" cell.
lr = LinearRegression()
lr.fit(x_train,y_train)
pred = lr.predict(x_test)
# model evaluation
print("RMSE: %.6f %%" % (np.sqrt(mean_squared_error(y_test, pred))))
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
print("LR score: %.4f" % lr.score(x_test, y_test))
# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(lr,"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/lr.sav")

In [None]:
# K-fold evaluation of plain linear regression on the training rows.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

# `kf` was used here without being defined anywhere in this notebook.
# Four shuffled folds with seed 12 mirror the cv=4 / random_state=12 used by
# the other cross-validation cells.
kf = KFold(n_splits=4, shuffle=True, random_state=12)
# errors="ignore": "month" may be absent (its creation is commented out in
# the data-processing cell).
non_feature_cols = ["accure_id","dsoh","soh","month"]
fold_rmse = []
# NOTE(review): x_train/x_test/y_train/y_test and lr are rebound on every
# fold, so after this cell they hold the values of the LAST fold.
for i, (train_index, test_index) in enumerate(kf.split(train_set)):
    fold_train = train_set.iloc[train_index]
    fold_test = train_set.iloc[test_index]
    x_train = fold_train.drop(columns=non_feature_cols, errors="ignore")
    y_train = fold_train["dsoh"].values
    x_test = fold_test.drop(columns=non_feature_cols, errors="ignore")
    y_test = fold_test["dsoh"].values
    print(f"Fold {i}:")
    # `normalize` no longer exists in scikit-learn >= 1.2.
    lr = LinearRegression()
    lr.fit(x_train,y_train)
    pred = lr.predict(x_test)
    # model evaluation
    rmse = (np.sqrt(mean_squared_error(y_test, pred)))
    fold_rmse.append(rmse)
    print("RMSE: %.6f %%" % rmse)
    print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
    print("LR score: %.4f" % lr.score(x_test, y_test))
# Per-fold RMSE values followed by their mean.
avg = np.array(fold_rmse)
print(avg)
print(avg.mean())


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions for every shuffled training row (4 folds).
# `normalize` no longer exists in scikit-learn >= 1.2.
lr = LinearRegression()
pred = cross_val_predict(lr,x_shuffle,y_shuffle,cv=4)
# NOTE(review): cross_val_predict is documented to fit clones, so `lr` itself
# may still be unfitted here; confirm before reading lr.coef_ afterwards.

# model evaluation
# pred follows the row order of x_shuffle, so both metrics must be computed
# against y_shuffle (the RMSE previously used the differently ordered `y`).
print("RMSE: %.6f" % (np.sqrt(mean_squared_error(y_shuffle, pred))))
print("R2 score: %.4f" % metrics.r2_score(y_shuffle, pred))

In [None]:
# Print each fitted coefficient and flag those beyond +/-0.05, then plot the
# coefficient magnitudes per feature.
for idx, weight in enumerate(lr.coef_):
    print("Feature: ",x.columns[idx],"=",weight)
    if weight < -0.05:
        print("High negative correlation")
    elif weight > 0.05:
        print("High positive correlation")

coef_magnitude = np.abs(lr.coef_)
plt.figure(figsize=(8,5),tight_layout=True)
plt.xticks(rotation=45, ha="right")
plt.bar(x.columns,coef_magnitude)
plt.ylabel("Coefficient Magnitude")
plt.title("Input Feature Correlations from Dataset Alpha")
plt.savefig("/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/figures/coef_senec.svg")

In [None]:
# Roll predicted SOH changes forward from the first measured SOH for a few
# batteries and compare the resulting curve with the measured one.
# NOTE(review): `model` is not assigned by any earlier cell visible in this
# notebook; bind it to a fitted estimator before running this cell.
plt.figure(figsize=(10,30))
n_test = 8
skip = 5
# Every `skip`-th id, n_test ids in total.
test_id = df["accure_id"].unique()[0:n_test*skip:skip]
test = df[df["accure_id"].isin(test_id)]
# Rows are collected in a list: DataFrame.append no longer exists in pandas 2.
rows = []
for index,id in enumerate(test_id):
    data = test[test["accure_id"]==id]
    actual = data['soh'].tolist()
    soh_start = actual[0]
    x = data.drop(["accure_id","dsoh","soh"],axis=1)
    y = model.predict(x)
    # pred_soh[0] = soh_start and pred_soh[k] = pred_soh[k-1] + y[k];
    # a cumulative sum performs the same additions in the same order.
    pred_soh = np.cumsum(np.concatenate(([soh_start], np.ravel(y)[1:])))
    rmse_d = rms(data['dsoh'],y)
    rmse_s = rms(np.array(actual),pred_soh)
    rows.append(
        {"accure_id":id,"RMSE_target":rmse_d,"RMSE_soh":rmse_s,"Final_diff":(np.abs(actual[-1]-pred_soh[-1]))})
    # print('RMSE_d = %.2f %%, RMSE_s = %.2f %%' % (rmse_d,rmse_s))
    plt.subplot(n_test,1,index+1)
    plt.plot(pred_soh,label='prediction')
    plt.plot(actual,label='actual')
    plt.ylabel("SOH")
    plt.xlabel("Age (Month)")
    plt.legend()
result = pd.DataFrame(rows)
display(result)
# numeric_only keeps the summary working when accure_id is not numeric.
display(result.mean(numeric_only=True))

### Ridge Regularization - Linear least squares with l2 (Tikhonov) regularization

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression on the current train/test split; alpha=0 sets the L2
# penalty weight to zero.
# The `normalize` argument is gone from scikit-learn 1.2 onwards; it is
# dropped here, matching the Ridge(alpha=0) used in the "Combined" cell.
ridge = Ridge(alpha=0)
ridge.fit(x_train,y_train)
pred = ridge.predict(x_test)
# model evaluation
print("RMSE: %.6f %%" % (rms(y_test, pred)))
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(ridge,f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/{customer}/ridge.sav")

### Lasso - L1 prior as regularizer

In [None]:
from sklearn.linear_model import Lasso

# Lasso on the current train/test split; alpha=0 sets the L1 penalty weight
# to zero. fit() returns the estimator, so construction and fitting chain.
lasso = Lasso(alpha=0).fit(x_train, y_train)
pred = lasso.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    lasso,
    f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/{customer}/lasso.sav",
)

### ElasticNet - combined L1 and L2 priors as regularizer

In [None]:
from sklearn.linear_model import ElasticNet

# ElasticNet on the current train/test split; alpha=0 sets the overall
# penalty weight to zero. fit() returns the estimator itself.
en = ElasticNet(alpha=0).fit(x_train, y_train)
pred = en.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    en,
    f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/{customer}/en.sav",
)

## Polynomial Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Degree-2 polynomial expansion (no bias column) feeding a linear regression.
# The pipeline applies the expansion itself in both fit and predict.
poly_feat = PolynomialFeatures(degree=2,include_bias=False)
poly = make_pipeline(poly_feat,LinearRegression()).fit(x_train, y_train)
pred = poly.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)

# Persist the fitted pipeline; the "Test set" section reloads it.
joblib.dump(
    poly,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/poly.sav",
)

## Gaussian Process Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import kernels
# Gaussian process regression with a rational-quadratic kernel.
gpr = GaussianProcessRegressor(kernel=kernels.RationalQuadratic())
# model = GaussianProcessRegressor(kernel=0.5*kernels.RBF())
gpr.fit(x_train,y_train)
# return_std additionally yields the predictive standard deviation per sample.
pred,std = gpr.predict(x_test,return_std=True)
# model evaluation
print("RMSE: %.6f %%" % (rms(y_test, pred)))
print("R2 score: %.4f" % metrics.r2_score(y_test, pred))
print("GPR score: %.4f" % gpr.score(x_test,y_test))
# The printed value is min(pred); the label used to read "Max prediction".
print("Min prediction: ", min(pred))
# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(gpr,"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/gpr_rq.sav")

In [None]:
# Wall-clock time in seconds of one GPR prediction (with standard deviation) on x_test.
t0=time()
pred,std = gpr.predict(x_test,return_std=True)
t1=time()
print(t1-t0)

In [None]:
# Grid search over GPR kernels with 2-fold cross-validation.
# NOTE(review): every candidate kernel below is commented out, so the
# 'kernel' list is empty; GridSearchCV presumably rejects an empty parameter
# list - enable at least one kernel before running.
from sklearn.model_selection import GridSearchCV
param_grid = {'kernel': [
            # kernels.RBF(),kernels.RBF(length_scale=4),
            # kernels.RationalQuadratic(),
            # kernels.ExpSineSquared(),
            # kernels.Product(kernels.ConstantKernel(0.5),kernels.RBF()),
            # 0.5*kernels.ExpSineSquared()
            
            ]}  
grid = GridSearchCV(GaussianProcessRegressor(), param_grid, cv=2, verbose=3) 
grid.fit(x_train, y_train)
print(grid.best_params_) 

## Support Vector Regression

In [None]:
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# RBF support vector regression with hand-picked epsilon and C.
svr = SVR(kernel='rbf', epsilon=0.03, C=5).fit(x_train, y_train)
pred = svr.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = svr.score(x_train,y_train)
print("SVR score: %.4f" % train_score)
# joblib.dump(svr,"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/svr.sav")

In [None]:
from sklearn.svm import SVR

# RBF support vector regression with library-default hyper-parameters.
svr = SVR(kernel='rbf').fit(x_train, y_train)
pred = svr.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# print("SVR score: %.4f" % svr.score(x_train,y_train))

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    svr,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/svr.sav",
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over SVR kernel and epsilon with 2-fold cross-validation.
param_grid = dict(
    kernel=['rbf','poly','linear'],
    epsilon=[0.1,0.5,1],
)
grid = GridSearchCV(estimator=SVR(), param_grid=param_grid, cv=2, verbose=2)
grid.fit(x_train, y_train)
best_params = grid.best_params_
print(best_params)

## Principal Component Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Standardise, project onto 20 principal components, then fit a linear model.
pcr_steps = (StandardScaler(), PCA(n_components=20), LinearRegression())
pcr = make_pipeline(*pcr_steps).fit(x_train, y_train)
pred = pcr.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = pcr.score(x_train,y_train)
print("PCR score: %.4f" % train_score)

# Persist the fitted pipeline; the "Test set" section reloads it.
joblib.dump(
    pcr,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/pcr.sav",
)

## Partial Least Squares Regression

In [None]:
from sklearn.cross_decomposition import PLSRegression

# Partial least squares regression with 20 components.
pls = PLSRegression(n_components=20).fit(x_train, y_train)
# The prediction is flattened to one dimension before scoring.
pred = pls.predict(x_test).flatten()

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = pls.score(x_train,y_train)
print("PLSR score: %.4f" % train_score)

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    pls,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/pls.sav",
)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

# One-component PCR and PLS fitted on a fresh random split of x / y, shown
# side by side against their single projected component.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=1)

pcr = make_pipeline(StandardScaler(), PCA(n_components=1), LinearRegression())
pcr.fit(X_train, Y_train)
pca = pcr.named_steps["pca"]  # retrieve the PCA step of the pipeline

pls = PLSRegression(n_components=1)
pls.fit(X_train, Y_train)

fig, axes = plt.subplots(1, 2, figsize=(10, 3))
# (axis, projection, predictor, x label, title) for each panel.
panels = [
    (axes[0], pca.transform, pcr.predict,
     "Projected data onto first PCA component", "PCR / PCA"),
    (axes[1], pls.transform, pls.predict,
     "Projected data onto first PLS component", "PLS"),
]
for axis, project, predict, x_label, panel_title in panels:
    axis.scatter(project(X_test), Y_test, alpha=0.3, label="ground truth")
    axis.scatter(project(X_test), predict(X_test), alpha=0.3, label="pred")
    axis.set(xlabel=x_label, ylabel="y", title=panel_title)
    axis.legend()
plt.tight_layout()
plt.show()

## Neural Network

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Multi-layer perceptron with two hidden layers of 50 units each.
# nn = MLPRegressor(hidden_layer_sizes=100,activation='relu',alpha=0.5,learning_rate='adaptive',verbose=True)
nn = MLPRegressor(
    hidden_layer_sizes=(50,50),
    activation='relu',
    alpha=0.1,
    learning_rate='constant',
).fit(x_train, y_train)
pred = nn.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = nn.score(x_train,y_train)
print("Model score: %.4f" % train_score)

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    nn,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/nn_2l.sav",
)

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Multi-layer perceptron with four hidden layers of 100 units, fixed seed.
nn = MLPRegressor(
    hidden_layer_sizes=(100,100,100,100),
    activation='relu',
    alpha=0.1,
    learning_rate='constant',
    random_state=200,
).fit(x_train, y_train)
pred = nn.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = nn.score(x_train,y_train)
print("Model score: %.4f" % train_score)
# joblib.dump(nn,"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/nn_3l.sav")

In [None]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron with hidden layers (200, 200, 100, 100), adaptive
# learning-rate schedule and a fixed seed; stored as nn_3l.sav.
nn = MLPRegressor(
    hidden_layer_sizes=(200,200,100,100),
    activation='relu',
    alpha=0.01,
    learning_rate='adaptive',
    random_state=200,
    warm_start=False,
).fit(x_train, y_train)
pred = nn.predict(x_test)

# model evaluation
rmse_value = rms(y_test, pred)
print("RMSE: %.6f %%" % rmse_value)
r2_value = metrics.r2_score(y_test, pred)
print("R2 score: %.4f" % r2_value)
# This score is computed on the training data, not on the test split.
train_score = nn.score(x_train,y_train)
print("Model score: %.4f" % train_score)

# Persist the fitted model; the "Test set" section reloads it.
joblib.dump(
    nn,
    "/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/senec/nn_3l.sav",
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over MLP architecture, L2 penalty and activation with
# 2-fold cross-validation.
param_grid = dict(
    hidden_layer_sizes=[(100,),(250,200,200),(200,200,100),(100,50,100)],
    alpha=[0.07,0.1,0.15],
    activation=['tanh','relu'],
)
grid = GridSearchCV(estimator=MLPRegressor(), param_grid=param_grid, cv=2, verbose=2)
grid.fit(x_train, y_train)
best_params = grid.best_params_
print(best_params)

# Combined

In [None]:
# collect performance metrics
# Fit every model on the current split, time fit+predict, score it on the
# test split and add 4-fold cross-validation statistics on the shuffled
# training rows. One table row per model.
from time import time
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process import kernels
from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.neural_network import MLPRegressor

cv = 4
cross = pd.DataFrame()

# The estimators keep their notebook-level names so later cells can use them.
lr = LinearRegression()
ridge = Ridge(alpha=0)
lasso = Lasso(alpha=0)
en = ElasticNet(alpha=0)
poly_feat = PolynomialFeatures(degree=2,include_bias=False)
poly = make_pipeline(poly_feat,LinearRegression())
gpr = GaussianProcessRegressor(kernel=kernels.RationalQuadratic())
svr = SVR(kernel='rbf',epsilon=0.03,C=5)
pcr = make_pipeline(PCA(n_components=20), LinearRegression())
pls = PLSRegression(n_components=6)
nn3 = MLPRegressor(hidden_layer_sizes=(200,200,100,100),activation='relu',alpha=0.01,learning_rate='adaptive',random_state=200)

# (row label, estimator) in the order the rows appear in the table.
candidates = [
    ('Linear', lr),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('ElasticNet', en),
    ('Poly', poly),
    ('GPR', gpr),
    ('SVR', svr),
    ('PCA', pcr),
    ('PLS', pls),
    ('NN', nn3),
]

# Rows are collected in a list: DataFrame.append no longer exists in pandas 2.
rows = []
for label, estimator in candidates:
    # Every model is timed individually; the 'Poly' row used to report the
    # stale timing left over from the ElasticNet run.
    t0 = time()
    estimator.fit(x_train,y_train)
    # ravel flattens the prediction for PLS and leaves 1-D predictions as is.
    pred = np.ravel(estimator.predict(x_test))
    t1 = time()
    cv_res = pd.DataFrame(cross_validate(estimator,x_shuffle,y_shuffle,cv=cv))
    rows.append({
        'model':label,
        'runtime':(t1-t0),
        'RMSE':rms(y_test,pred),
        'R2':metrics.r2_score(y_test,pred),
        # mean per-fold fit time plus mean per-fold scoring time
        'CV_runtime':cv_res["fit_time"].mean()+cv_res["score_time"].mean(),
        # mean and sample standard deviation of the fold scores (the values
        # describe() reports as 'mean' and 'std')
        'CV_score':cv_res["test_score"].mean(),
        'CV_std':cv_res["test_score"].std(),
    })
result = pd.DataFrame(rows)

display(result)

In [None]:
# Store the metrics side by side with their column-wise ranks.
ranked_result = pd.concat([result, result.rank(axis=0)], axis=1)
ranked_result.to_csv(f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/results_{customer}.csv")

# Test set

In [None]:
# load models
# Restore the estimators previously written with joblib.dump for this customer.
path = f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/sklearn models/{customer}"
lr = joblib.load(f"{path}/lr.sav")
ridge = joblib.load(f"{path}/ridge.sav")
lasso = joblib.load(f"{path}/lasso.sav")
en = joblib.load(f"{path}/en.sav")
poly = joblib.load(f"{path}/poly.sav")
pls = joblib.load(f"{path}/pls.sav")
pcr = joblib.load(f"{path}/pcr.sav")
gpr = joblib.load(f"{path}/gpr_rq.sav")
svr = joblib.load(f"{path}/svr.sav")
# NOTE(review): no cell visible in this notebook writes nn_1l.sav - confirm the file exists.
nn1 = joblib.load(f"{path}/nn_1l.sav")
nn2 = joblib.load(f"{path}/nn_2l.sav")
nn3 = joblib.load(f"{path}/nn_3l.sav")

In [None]:
# all test plots
# For every verification battery: take the first `period` measured SOH values
# as given, roll the predicted SOH changes forward from there for each model,
# plot the curves and collect error metrics.
models = [lr,poly,gpr,svr,nn3]
test_id = verif_id
# Rows are collected in a list: DataFrame.append no longer exists in pandas 2.
rows = []
for index,battery_id in enumerate(test_id):
    fig,ax = plt.subplots(figsize=(8,8))
    data = verif[verif["accure_id"]==battery_id]
    actual = data['soh'].values
    # period = data.shape[0]-18
    period = 12
    # errors="ignore": "month" may be absent (its creation is commented out
    # in the data-processing cell). `features` replaces the former name
    # `input`, which shadowed the builtin.
    features = data.iloc[period:].drop(
        columns=["accure_id","dsoh","soh","month"], errors="ignore").values
    features = sc.transform(features)
    ax.plot(actual,label='Actual')
    for model in models:
        y  = model.predict(features).flatten()
        # Each predicted value is the previous one plus the predicted change,
        # seeded with the last measured value; a cumulative sum performs the
        # same additions in the same order without re-allocating per step.
        rollout = np.cumsum(np.concatenate([actual[period-1:period], y]))[1:]
        pred_soh = np.concatenate([actual[0:period], rollout])
        rmse_d = rms(data['dsoh'][period:],y)
        rmse_s = rms(actual,pred_soh)
        rows.append(
            {"model":model.__class__.__name__,"accure_id":battery_id,"RMSE_target":rmse_d,"RMSE_soh":rmse_s,"Final_diff":(np.abs(actual[-1]-pred_soh[-1]))})
        ax.plot(np.arange(period-1,len(pred_soh)),pred_soh[period-1:],'^-',label=str(model.__class__.__name__))
    plt.ylabel("SOH (%)")
    plt.xlabel("Age (Month)")
    plt.title(f"Index {index} ID:{battery_id}")
    # plt.legend(["Actual","Linear","Polynomial","GPR","SVR","MLPR"])
    plt.legend()
# Average per model class; numeric_only keeps this working when accure_id is
# not numeric.
result = pd.DataFrame(rows).groupby('model').mean(numeric_only=True)
display(result)
pd.concat([result,result.rank(axis=0)],axis=1).to_csv(f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/test_{customer}.csv")

In [None]:
# pretty plots
# Same rollout as the "all test plots" cell, for six hand-picked verification
# batteries arranged on a num_y x num_x grid.
models = [lr,poly,gpr,svr,nn3]
test_id = verif_id.iloc[[19,13,10,8,4,2]]
num_y = 3
num_x = 2
fig,ax = plt.subplots(num_y,num_x,figsize=(8,5*num_y),constrained_layout=True)
fig.suptitle("Dataset Alpha Test Set")
# Rows are collected in a list: DataFrame.append no longer exists in pandas 2.
rows = []
for i in range(num_y):
    for j in range(num_x):
        battery_id = test_id.iloc[i*num_x+j]
        data = verif[verif["accure_id"]==battery_id]
        actual = data['soh'].values
        # First `period` measured values are taken as given.
        period = 12
        # errors="ignore": "month" may be absent (its creation is commented
        # out in the data-processing cell). `features` replaces the former
        # name `input`, which shadowed the builtin.
        features = data.iloc[period:].drop(
            columns=["accure_id","dsoh","soh","month"], errors="ignore").values
        features = sc.transform(features)
        ax[i,j].plot(actual)
        for model in models:
            y  = model.predict(features).flatten()
            # Previous value plus predicted change, seeded with the last
            # measured value; the cumulative sum adds in the same order.
            rollout = np.cumsum(np.concatenate([actual[period-1:period], y]))[1:]
            pred_soh = np.concatenate([actual[0:period], rollout])
            rmse_d = rms(data['dsoh'][period:],y)
            rmse_s = rms(actual,pred_soh)
            rows.append(
                {"model":model.__class__.__name__,"accure_id":battery_id,"RMSE_target":rmse_d,"RMSE_soh":rmse_s,
                "Final_diff":(np.abs(actual[-1]-pred_soh[-1]))})
            ax[i,j].plot(np.arange(period-1,len(pred_soh)),pred_soh[period-1:],'^-')
        ax[i,j].set_ylabel("SOH (%)")
        ax[i,j].set_xlabel("Age (Month)")
        ax[i,j].set_title(f"Test ID:{battery_id}")
        # Legend entries follow the plotting order: measured curve, then `models`.
        ax[i,j].legend(["Actual","Linear","Polynomial","GPR","SVR","MLPR"])
result = pd.DataFrame(rows)
# numeric_only keeps the average working when accure_id is not numeric.
display(result.groupby('model').mean(numeric_only=True))
# result.groupby('model').mean().to_csv(f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/test_senec.csv")

In [None]:
# Score the loaded models on the current test split and add 4-fold
# cross-validation statistics on the shuffled training rows.
from sklearn.model_selection import cross_validate
cv = 4

# (row label, estimator) in the order the rows appear in the table.
candidates = [
    ('Linear', lr),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('ElasticNet', en),
    ('Polynomial (deg=2)', poly),
    ('GPR', gpr),
    ('SVR', svr),
    ('PCA', pcr),
    ('PLS', pls),
    ('Neural Network (3 Layer)', nn3),
]

# Rows are collected in a list: DataFrame.append no longer exists in pandas 2.
rows = []
for label, estimator in candidates:
    # ravel flattens the prediction for PLS and leaves 1-D predictions as is.
    pred = np.ravel(estimator.predict(x_test))
    # Each model is cross-validated itself; the GPR row used to cross-validate
    # `lr` and therefore repeated the linear model's CV statistics.
    cv_res = pd.DataFrame(cross_validate(estimator,x_shuffle,y_shuffle,cv=cv))
    rows.append({
        'model':label,
        'RMSE':rms(y_test,pred),
        'R2':metrics.r2_score(y_test,pred),
        # mean per-fold fit time plus mean per-fold scoring time
        'CV_runtime':cv_res["fit_time"].mean()+cv_res["score_time"].mean(),
        # mean and sample standard deviation of the fold scores (the values
        # describe() reports as 'mean' and 'std')
        'CV_score':cv_res["test_score"].mean(),
        'CV_std':cv_res["test_score"].std(),
    })
result = pd.DataFrame(rows)

result

In [None]:
# Store the metrics side by side with their column-wise ranks.
ranked_result = pd.concat([result, result.rank(axis=0)], axis=1)
ranked_result.to_csv(f"/Users/kenny/Library/CloudStorage/OneDrive-ACCUREBatteryIntelligenceGmbH/Thesis/data/results_{customer}.csv")