In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# ML models - KNN and Random Forest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    mean_squared_error, r2_score
)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load and preprocess the Battery RUL data.
# Helper that reads the dataset from a CSV file and separates features from the target.
def loaddata(filepath):
    """Load a CSV file and split it into features, target and a correlation matrix.

    Reads ``filepath`` with pandas, prints an exploratory summary (shape,
    column names, ``DataFrame.info()`` report, missing-value counts and
    descriptive statistics), drops four named columns, and treats the last
    remaining column as the target.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read.

    Returns
    -------
    tuple
        ``(X, y, corr)`` where ``X`` is a DataFrame holding every remaining
        column except the last, ``y`` is the last column as a Series, and
        ``corr`` is ``DataFrame.corr()`` of the frame after the drop.
        If anything fails (missing file, missing column, ...) the error is
        printed and ``(None, None, None)`` is returned, so callers can always
        unpack three values.
    """
    columns_to_drop = [
        'Discharge Time (s)',
        'Decrement 3.6-3.4V (s)',
        'Time at 4.15V (s)',
        'Time constant current (s)',
    ]
    try:
        df = pd.read_csv(filepath)
        print("Data loaded from file Successfully.")
        print("Shape of the data:", df.shape)
        print("Columns:", list(df.columns))
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so print the header first and call it as a statement.
        print("Data Information:")
        df.info()
        # Produce a new frame rather than mutating the loaded one in place.
        df = df.drop(columns=columns_to_drop)
        # Check for null or missing values.
        print("MissingValues:\n", df.isnull().sum())
        # Summary statistics via describe().
        print("Statistics Information from File:\n", df.describe())
        corr = df.corr()
        # Feature matrix: every column except the last.
        X = df.iloc[:, :-1]
        # Target variable is the last column (RUL in the sample dataset).
        y = df.iloc[:, -1]
        return X, y, corr
    except Exception as e:
        print(f"Error loading data from file: {e}")
        # Same arity as the success path so `X, y, corr = loaddata(...)` never
        # fails with an unpacking error.
        return None, None, None

In [3]:
# Preprocess the features and target variable returned by the loaddata function above.
def processfeaturetargetdata(X, y):
    """One-hot encode and standardise the features; return the target untouched.

    Parameters
    ----------
    X : DataFrame-like
        Feature matrix; passed through ``pd.get_dummies`` before scaling.
    y : any
        Target values. Returned exactly as received.

    Returns
    -------
    tuple
        ``(X_scaled, y)`` where ``X_scaled`` is the result of
        ``StandardScaler.fit_transform`` on the encoded features.
    """
    # NOTE(review): the scaler is fitted on everything it is given; main() calls
    # this before splitdata(), so test rows influence the scaling statistics.
    encoded_features = pd.get_dummies(X)
    standardised_features = StandardScaler().fit_transform(encoded_features)
    return standardised_features, y

In [4]:
# Function to split the data into train and test
def splitdata(X, y, test_size=0.2, random_state=42):
    """Split features and target into train and test partitions.

    Delegates to ``train_test_split`` and prints the shape of both feature
    partitions.

    Parameters
    ----------
    X, y : array-like
        Features and target to split.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
    )
    features_train, features_test, target_train, target_test = partitions
    print("Training Set of Data:\n", features_train.shape)
    print("Test Set of Data:\n", features_test.shape)
    return features_train, features_test, target_train, target_test

In [5]:
# Train a machine learning model (KNN or Random Forest) on the training data.
def trainmodelML(X_train, y_train, modelType, n_neighbors=59, random_state=42):
    """Fit and return a KNN or random-forest classifier.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels passed to ``model.fit``.
    modelType : str
        ``'knn'`` for ``KNeighborsClassifier`` or ``'randomForest'`` for
        ``RandomForestClassifier``.
    n_neighbors : int, default 59
        Neighbour count for the KNN model. The default keeps the value that was
        previously hard-coded.
    random_state : int or None, default 42
        Seed for the random forest so repeated runs give the same model. Pass
        ``None`` to get an unseeded forest.

    Returns
    -------
    The fitted estimator.

    Raises
    ------
    ValueError
        If ``modelType`` is not one of the supported names. Raised before any
        model is built or fitted.
    """
    # NOTE(review): both estimators are classifiers, so every distinct target
    # value is treated as its own class — confirm this is intended for RUL.
    if modelType == 'knn':
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif modelType == 'randomForest':
        model = RandomForestClassifier(random_state=random_state)
    else:
        raise ValueError("Please check the modelType")
    model.fit(X_train, y_train)
    return model

In [6]:
# model performance metrics used to evaluate the ML model
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on the test set, print the metrics and return them.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator to evaluate.
    X_test, y_test : array-like
        Held-out features and their true target values.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'confusion_matrix'``,
        ``'Mean Squared Error (MSE)'`` and ``'Rsquared (R²)'`` mapped to the
        corresponding scikit-learn metric results.
    """
    predictions = model.predict(X_test)

    # Compute everything first, then report.
    results = {
        'accuracy': accuracy_score(y_test, predictions),
        'confusion_matrix': confusion_matrix(y_test, predictions),
        'Mean Squared Error (MSE)': mean_squared_error(y_test, predictions),
        'Rsquared (R²)': r2_score(y_test, predictions),
    }

    print("\nModel Evaluation:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])
    print("Mean Squared Error (MSE):", results['Mean Squared Error (MSE)'])
    print("Rsquared (R²):", results['Rsquared (R²)'])

    return results

def visualize_results(model, X_test, y_test):
    """Draw the test-set confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator used to predict labels for ``X_test``.
    X_test, y_test : array-like
        Held-out features and their true labels.
    """
    heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'Blues'}

    plt.figure(figsize=(8, 8))
    predicted_labels = model.predict(X_test)
    matrix = confusion_matrix(y_test, predicted_labels)
    sns.heatmap(matrix, **heatmap_options)

    # Heatmap columns are predictions, rows are the true labels.
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.title('Confusion Matrix')

    plt.tight_layout()
    plt.show()

In [None]:
# Main entry point that ties the functions above together:
# - load and preprocess the data, split it into train/test sets, train a machine learning model,
#   evaluate it with the metrics, and finally visualize the results.
def main(filepath):
    """Run the whole pipeline: load, preprocess, split, train, evaluate, plot.

    Parameters
    ----------
    filepath : str or path-like
        CSV file handed to ``loaddata``.

    Returns
    -------
    None
        Prints a message and returns early if the data could not be loaded.
    """
    loaded = loaddata(filepath)
    # Index instead of unpacking so a shorter failure tuple from loaddata is
    # reported below rather than raising an unpacking error here.
    X, y = loaded[0], loaded[1]
    corr = loaded[2] if len(loaded) > 2 else None
    if X is None or y is None:
        print(f"check the file path: Failed to access or load the data.")
        return
    X_scaled, y = processfeaturetargetdata(X, y)
    X_train, X_test, y_train, y_test = splitdata(X_scaled, y)
    model = trainmodelML(X_train, y_train, 'knn')
    evaluate_model(model, X_test, y_test)

    # Finish and show the correlation heatmap before visualize_results opens
    # its own figure; otherwise later pyplot calls act on the wrong figure.
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title('Correlation Heatmap', fontsize=16)
    plt.tight_layout()
    plt.show()

    # The confusion-matrix figure sets its own labels and calls show() itself,
    # so no further labelling is needed after this call.
    visualize_results(model, X_test, y_test)


# Entry point: run the pipeline on the sample dataset when this code is executed
# directly (not when it is imported as a module).
if __name__ == '__main__':
    main("sample_data/Battery_RUL.csv")

Data loaded from file Successfully.
Shape of the data: (15064, 9)
Columns: ['Cycle_Index', 'Discharge Time (s)', 'Decrement 3.6-3.4V (s)', 'Max. Voltage Dischar. (V)', 'Min. Voltage Charg. (V)', 'Time at 4.15V (s)', 'Time constant current (s)', 'Charging time (s)', 'RUL']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15064 entries, 0 to 15063
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Cycle_Index                15064 non-null  float64
 1   Discharge Time (s)         15064 non-null  float64
 2   Decrement 3.6-3.4V (s)     15064 non-null  float64
 3   Max. Voltage Dischar. (V)  15064 non-null  float64
 4   Min. Voltage Charg. (V)    15064 non-null  float64
 5   Time at 4.15V (s)          15064 non-null  float64
 6   Time constant current (s)  15064 non-null  float64
 7   Charging time (s)          15064 non-null  float64
 8   RUL                        15064 non-null  int64  
dty