In [1]:
%matplotlib inline
!wget https://phm-datasets.s3.amazonaws.com/NASA/5.+Battery+Data+Set.zip -O battery_data.zip
!unzip -o battery_data.zip 
!rm battery_data.zip
!cd "5. Battery Data Set" 
!find "5. Battery Data Set" -name "*.zip" -execdir unzip -o {} \;

wget: option '--q' is ambiguous; possibilities: '--quiet' '--quota'
Usage: wget [OPTION]... [URL]...

Try `wget --help' for more options.
unzip:  cannot find or open battery_data.zip, battery_data.zip.zip or battery_data.zip.ZIP.
rm: cannot remove 'battery_data.zip': No such file or directory
Archive:  ./3. BatteryAgingARC_25-44.zip
  inflating: README_33_34_36.txt     
  inflating: README_38_39_40.txt     
  inflating: README_41_42_43_44.txt  
  inflating: B0025.mat               
  inflating: B0026.mat               
  inflating: B0027.mat               
  inflating: B0028.mat               
  inflating: B0029.mat               
  inflating: B0030.mat               
  inflating: B0031.mat               
  inflating: B0032.mat               
  inflating: B0033.mat               
  inflating: B0034.mat               
  inflating: B0036.mat               
  inflating: B0038.mat               
  inflating: B0039.mat               
  inflating: B0040.mat               
  inflating: B0041.

In [2]:
import datetime
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.io import loadmat
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# from sklearn.preprocessing import MinMaxScaler
# from sklearn.metrics import mean_squared_error
# from sklearn import metrics
# import matplotlib.pyplot as plt
# import seaborn as sns

In [3]:
def load_data(battery):
  mat = loadmat('5. Battery Data Set/' + battery + '.mat')
  counter = 0
  dataset = []
  capacity_data = []
  
  for i in range(len(mat[battery][0, 0]['cycle'][0])):
    row = mat[battery][0, 0]['cycle'][0, i]
    if row['type'][0] == 'discharge':
      ambient_temperature = row['ambient_temperature'][0][0]
      date_time = datetime.datetime(int(row['time'][0][0]),
                               int(row['time'][0][1]),
                               int(row['time'][0][2]),
                               int(row['time'][0][3]),
                               int(row['time'][0][4])) + datetime.timedelta(seconds=int(row['time'][0][5]))
      data = row['data']
      capacity = data[0][0]['Capacity'][0][0]

      for j in range(len(data[0][0]['Voltage_measured'][0])):
        voltage_measured = data[0][0]['Voltage_measured'][0][j]
        current_measured = data[0][0]['Current_measured'][0][j]
        temperature_measured = data[0][0]['Temperature_measured'][0][j]
        current_load = data[0][0]['Current_load'][0][j]
        voltage_load = data[0][0]['Voltage_load'][0][j]
        time = data[0][0]['Time'][0][j]
        dataset.append([counter + 1, ambient_temperature, date_time, capacity,
                        voltage_measured, current_measured,
                        temperature_measured, current_load,
                        voltage_load, time])
      
      capacity_data.append([counter + 1, ambient_temperature, date_time, capacity])
      counter = counter + 1

  return [pd.DataFrame(data=dataset,
                       columns=['cycle', 'ambient_temperature', 'datetime',
                                'capacity', 'voltage_measured',
                                'current_measured', 'temperature_measured',
                                'current_load', 'voltage_load', 'time']),
          pd.DataFrame(data=capacity_data,
                       columns=['cycle', 'ambient_temperature', 'datetime',
                                'capacity'])]

t_ds, t_c_ds = load_data('B0005')
v_ds, v_c_ds = load_data('B0006')
print(t_ds.head())
print(v_ds.head())

   cycle  ambient_temperature            datetime  capacity  voltage_measured  \
0      1                   24 2008-04-02 15:25:41  1.856487          4.191492   
1      1                   24 2008-04-02 15:25:41  1.856487          4.190749   
2      1                   24 2008-04-02 15:25:41  1.856487          3.974871   
3      1                   24 2008-04-02 15:25:41  1.856487          3.951717   
4      1                   24 2008-04-02 15:25:41  1.856487          3.934352   

   current_measured  temperature_measured  current_load  voltage_load    time  
0         -0.004902             24.330034       -0.0006         0.000   0.000  
1         -0.001478             24.325993       -0.0006         4.206  16.781  
2         -2.012528             24.389085       -1.9982         3.062  35.703  
3         -2.013979             24.544752       -1.9982         3.030  53.781  
4         -2.011144             24.731385       -1.9982         3.011  71.922  
   cycle  ambient_temperature    

In [4]:
#removing columns with constant values
def remove_constant_columns(ds):
  for col in ds.columns:
    if ds[col].unique().size == 1:
      dropped_cols = ds.drop([col], axis=1, inplace=True)
      print(dropped_cols)
  return ds

t_ds = remove_constant_columns(t_ds)
v_ds = remove_constant_columns(v_ds)

None
None


In [5]:
#dtype inspection
t_ds.drop(['datetime'], axis=1, inplace=True)
v_ds.drop(['datetime'], axis=1, inplace=True)

t_ds.dtypes
v_ds.dtypes

cycle                     int64
capacity                float64
voltage_measured        float64
current_measured        float64
temperature_measured    float64
current_load            float64
voltage_load            float64
time                    float64
dtype: object

In [6]:
t_ds.replace(' ', '_', regex=True, inplace=True)
v_ds.replace(' ', '_', regex=True, inplace=True)

In [7]:
t_X = t_ds.drop(['capacity'], axis=1)
t_X.head()

t_y = t_ds['capacity'].copy()
t_y.head()

0    1.856487
1    1.856487
2    1.856487
3    1.856487
4    1.856487
Name: capacity, dtype: float64

In [8]:
v_X = v_ds.drop(['capacity'], axis=1)
v_X.head()

v_y = v_ds['capacity'].copy()
v_y.head()

0    2.035338
1    2.035338
2    2.035338
3    2.035338
4    2.035338
Name: capacity, dtype: float64

In [9]:
# Create a linear regression object
lr = LinearRegression()

# Train the model using the training data
lr.fit(t_X, t_y)

# Make predictions on the verification data
v_y_pred = lr.predict(v_X)

# Calculate the mean squared error of the predictions
mse = mean_squared_error(v_y, v_y_pred)

print("Mean squared error: ", mse)


Mean squared error:  0.006075879005060876


In [10]:
# Assuming you have trained a linear regression model called "model"
y_pred_train = lr.predict(t_X)
y_pred_test = lr.predict(v_X)

print ("Training set score: {:.2f}".format(lr.score(t_X, t_y)))

print (y_pred_train)
print (y_pred_test)


Training set score: 0.98
[1.90246948 1.91239608 1.88171887 ... 1.24964585 1.25009823 1.25062327]
[1.90234254 1.91225044 1.88159845 ... 1.25658842 1.25692701 1.25721152]


#creds to https://www.kaggle.com/rajeevsharma993