# Machine Learning Pipeline for cycle life prediction

last edit: 28.03.2022

This Notebook reads in data from processed files, generates model features, and makes pipelines for different machine learning models for predicting cycle life. 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, random

from scipy.stats import skew, kurtosis

In [2]:
summary = []
c10 = []
c100 = []
Qd100_10 = []

dir_summary = r"C:\Users\ife12216\OneDrive - Institutt for Energiteknikk\Documents\Masteroppgave\ML_github\data\interim\summary"
dir_c10 = r"C:\Users\ife12216\OneDrive - Institutt for Energiteknikk\Documents\Masteroppgave\ML_github\data\interim\cycles_interpolated\cycle10_discharge"
dir_c100 = r"C:\Users\ife12216\OneDrive - Institutt for Energiteknikk\Documents\Masteroppgave\ML_github\data\interim\cycles_interpolated\cycle100_discharge"
dir_Qd100_10 = r"C:\Users\ife12216\OneDrive - Institutt for Energiteknikk\Documents\Masteroppgave\ML_github\data\interim\cycles_interpolated\DeltaQ100_10_discharge"

for file in range(len(os.listdir(dir_summary))):
    f1 = os.path.join(dir_summary, os.listdir(dir_summary)[file])
    summary.append(pd.read_csv(f1))
    
    f2 = os.path.join(dir_c10, os.listdir(dir_c10)[file])
    c10.append(pd.read_csv(f2))
    
    f3 = os.path.join(dir_c100, os.listdir(dir_c100)[file])
    c100.append(pd.read_csv(f3))
    
    f4 = os.path.join(dir_Qd100_10, os.listdir(dir_Qd100_10)[file])
    Qd100_10.append(pd.read_csv(f4))
    
print(len(summary), len(c10), len(c100), len(Qd100_10))

137 137 137 137


In [23]:
times = []
for i in range(len(summary)):
    if(summary[i].index[-1] > 250):
        times.append(summary[i].index[-1])
print(len(times))

135


# Feature Generation

The features can be split into three categories:

Features based on $\Delta Q_{100-10}(V)$
- f1: min$(\Delta Q_{100-10}(V))$
- f2: mean$(\Delta Q_{100-10}(V))$
- f3: var$(\Delta Q_{100-10}(V))$
- f4: skewness$(\Delta Q_{100-10}(V))$
- f5: kurtosis$(\Delta Q_{100-10}(V))$

Features based on the discharge capacity fade curves
- f6: Slope of the linear fit to the capacity fade curve, cycles 2 to 100
- f7: Intercept of the linear fit to capacity fade curve, cycles 2 to 100
- f8: Slope of the linear fit to the capacity fade curve, cycles 91 to 100
- f9: Intercept of the linear fit to capacity fade curve, cycles 91 to 100
- f10: Discharge capacity, cycle 2
- f11: Difference between max discharge capacity and cycle 2
- f12: Discharge capacity, cycle 100

Other features
- f13: Average charge time, first 5 cycles
- f14: Maximum temperature, cycles 2 to 100
- f15: Minimum temperature, cycles 2 to 100
- f16: Integral of temperature over time, cycles 2 to 100
- f17: Internal resistance, cycle 2
- f18: Minimum internal resistance, cycles 2 to 100
- f19: Internal resistance, difference between cycle 100 and cycle 2



The features are extracted in the cell below.

In [27]:
p = 19 # number of features, p
n = len(summary) # number of samples, n

# Target vector
y = np.zeros((n)) # (samples)

# Design matrix
X = np.zeros((n,p)) # (samples x features)

# Q100-10 features f1-f5
for i in range(len(Qd100_10)):  
    X[i,0] = np.log10(abs(np.amin(Qd100_10[i]['discharge_capacity'].values)))
    X[i,1] = np.log10(abs(np.mean(Qd100_10[i]['discharge_capacity'].values)))
    X[i,2] = np.log10(float(np.var(Qd100_10[i]['discharge_capacity'].values)))
    X[i,3] = np.log10(abs(skew(Qd100_10[i]['discharge_capacity'].values)))
    X[i,4] = np.log10(abs(kurtosis(Qd100_10[i]['discharge_capacity'].values)))

# Per cycle features
for i in range(len(summary)):
    y[i] = np.log10(summary[i].index[-1])
    
    slope100_2 = float((summary[i]['discharge_capacity'][99] - summary[i]['discharge_capacity'][1])/98) #dQd / dcycle
    slope100_91 = float((summary[i]['discharge_capacity'][99] - summary[i]['discharge_capacity'][90])/9)
    intercept100_2 = summary[i]['discharge_capacity'][9] - slope100_2*10 # b = y-mx
    intercept100_91 = summary[i]['discharge_capacity'][9] - slope100_91*10 # b = y-mx
    diff = np.amax(summary[i]['discharge_capacity'][1:-1]) - summary[i]['discharge_capacity'][1]
    
    # Capacity fade features
    X[i,5] = slope100_2
    X[i,6] = intercept100_2
    X[i,7] = slope100_91
    X[i,8] = intercept100_91
    X[i,9] = summary[i]['discharge_capacity'][1]
    X[i,10] = diff
    X[i,11] = summary[i]['discharge_capacity'][99]
    
    # Other features
    X[i,12] = np.mean(summary[i]['charge_duration'][1:6])
    X[i,13] = np.amax(summary[i]['temperature_maximum'][1:99])
    X[i,14] = np.amin(summary[i]['temperature_minimum'][1:99])
    X[i,15] = np.mean(summary[i]['time_temperature_integrated'][1:99])
    X[i,16] = summary[i]['dc_internal_resistance'][1]
    X[i,17] = np.amin(summary[i]['dc_internal_resistance'][1:99])
    X[i,18] = summary[i]['dc_internal_resistance'][99] - summary[i]['dc_internal_resistance'][1]

print(X.shape)
print(y.shape)

(137, 19)
(137,)


### Make Pipelines

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import ElasticNetCV
from sklearn.gaussian_process.kernels import ExpSineSquared
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold

from functions import get_errors

In [56]:
# Make dictionary of hyperparameters?

# Elastic net pipeline
alphas = [0.1,0.3,0.5,0.7,0.9,0.95,0.99,1]

pipe_EN = Pipeline([('scaler', StandardScaler()), ('estimator', ElasticNetCV(cv=5, l1_ratio=alphas, max_iter=10000))])

# Random Forest pipeline
pipe_RF = Pipeline([('scaler', StandardScaler()), ('estimator', RandomForestRegressor())])

# Gaussian Process Regression
kernel = ExpSineSquared()
pipe_GPR = Pipeline([('scaler', StandardScaler()), ('estimator', GaussianProcessRegressor(kernel=kernel, alpha=8))])

# Support Vector Machine
pipe_SVR = Pipeline([('scaler', StandardScaler()), ('estimator', SVR())])

# Neural Network
pipe_MLP = Pipeline([('scaler', StandardScaler()), ('estimator', MLPRegressor(max_iter=1000))])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # Find out stratify when time

pipelines = [pipe_EN, pipe_RF, pipe_GPR, pipe_SVR, pipe_MLP]
pipeline_names = {0:'Elastic Net', 1:'Random Forest', 2:'Gaussian Process Regression', 3:'Support Vector Machine', 4:'Multi-Layer Perceptron'}

#predictions = np.zeros((len(pipelines)+1, n))

for i in range(len(pipelines)): 
    pipelines[i].fit(X_train, y_train)
    ypred_train = pipelines[i].predict(X_train)
    ypred_test = pipelines[i].predict(X_test)
    
    errors = get_errors(y_train, y_test, ypred_train, ypred_test)
    errors_table = pd.DataFrame({'RMSE': [errors[0], errors[1]], 'RMSE CYCLES': [errors[2],errors[3]],\
                       'R2 SCORE': [errors[4],errors[5]] ,'MAPE': [errors[6],errors[7]]}, index=['train', 'test'])
    
    print("")
    print(pipeline_names[i])
    display(errors_table)


Elastic Net


Unnamed: 0,RMSE,RMSE CYCLES,R2 SCORE,MAPE
train,0.003481,151.666989,0.893863,0.016308
test,0.007659,207.011026,0.752823,0.022089



Random Forest


Unnamed: 0,RMSE,RMSE CYCLES,R2 SCORE,MAPE
train,0.000856,72.770079,0.973914,0.006668
test,0.010557,182.611942,0.65929,0.025052



Gaussian Process Regression


Unnamed: 0,RMSE,RMSE CYCLES,R2 SCORE,MAPE
train,0.082453,513.334332,-1.514012,0.079652
test,0.093845,534.839041,-2.028809,0.089721



Support Vector Machine


Unnamed: 0,RMSE,RMSE CYCLES,R2 SCORE,MAPE
train,0.004119,135.239822,0.874418,0.019774
test,0.013135,203.508361,0.576086,0.027674



Multi-Layer Perceptron


Unnamed: 0,RMSE,RMSE CYCLES,R2 SCORE,MAPE
train,0.037432,432.636959,-0.141321,0.05518
test,0.950864,824311.588883,-29.688846,0.213969
