## 1. Import Data

This step brings in important tools that help us work with data, create machine learning models, and make charts.

In [39]:
import pandas as pd
import numpy as np
import joblib
import os
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from IPython.display import display
from google.colab import drive

## 2. Load Data

Here, we load data from an Excel file, show the first 10 rows to check, and handle any issues if the file doesn’t load properly.


In [40]:
drive.mount('/content/drive')

# Load Data Function
def load_data(file_path):
    """Load data from an Excel file and display the first 10 rows."""
    try:
        data = pd.read_excel(file_path, engine='openpyxl')
        print("Data loaded successfully.")
        print("First 10 rows of the data:")
        display(data.head(10))
        return data
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

file_path = '/content/drive/MyDrive/KSU/RCapstone/feature-prediction/data/all_data.xlsx'  # Update this to the correct path
data = load_data(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data loaded successfully.
First 10 rows of the data:


Unnamed: 0,Name,Timestamp,Status,Description,Vibration Frequency,Vibration Amplitude,Bearing Temperature,Motor Temperature,Belt Load,Torque,Noise Levels,Current and Voltage,Hydraulic Pressure,Belt Thickness,Roller Condition
0,Conveyor Belt 4,2023-08-15 00:00:00,Running,,1490.82,0.04,,96.902,1.36,318.07,55.12,15.79,382.09,1.58,86.0
1,Conveyor Belt 4,2023-08-15 00:15:00,Running,,1498.37,0.04,77.076,96.975,1.07,295.5,59.68,14.34,376.48,1.5795,85.854
2,Conveyor Belt 4,2023-08-15 00:30:00,Running,,1503.22,0.06,77.307,96.755,1.21,314.38,58.2,15.03,384.2,1.57925,85.781
3,Conveyor Belt 4,2023-08-15 00:45:00,Running,,1508.11,0.04,77.474,97.661,1.29,311.84,56.16,15.43,379.79,1.579,85.708
4,Conveyor Belt 4,2023-08-15 01:00:00,Running,,1498.13,0.06,77.785,97.471,1.07,317.14,55.39,14.35,383.95,1.57875,85.635
5,Conveyor Belt 4,2023-08-15 01:15:00,Running,,1494.89,0.05,77.813,97.814,1.33,305.58,,15.65,383.02,1.5785,85.562
6,Conveyor Belt 4,2023-08-15 01:30:00,Running,,1498.2,0.06,77.622,97.464,1.18,285.96,59.25,14.91,383.54,1.57825,85.489
7,Conveyor Belt 4,2023-08-15 01:45:00,Running,,1507.17,0.06,78.487,98.139,1.24,298.84,58.93,15.21,377.37,1.578,85.416
8,Conveyor Belt 4,2023-08-15 02:00:00,Running,,1507.43,0.06,78.294,97.877,1.07,307.51,61.77,14.36,377.28,1.57775,85.343
9,Conveyor Belt 4,2023-08-15 02:15:00,Running,,1502.35,0.05,78.561,98.96,1.17,308.73,62.23,14.87,377.47,1.5775,85.27


## 3. PreProcess Data

We clean the data by converting everything into numbers, replacing missing values with zeros, and ensuring it's ready for the model.

In [46]:
def preprocess_data(data):
    # Lag features
    for lag in range(1, 3):  # Simpler lag for testing
        data[f'Vibration Frequency lag {lag}'] = data['Vibration Frequency'].shift(lag)

    # Drop NaNs
    data.dropna(inplace=True)
    return data



## 4. Define Model

This step creates a simple neural network with input, hidden, and output layers that will help the model learn and make predictions.

In [45]:
# Create minimal sample data
data = pd.DataFrame({
    'Timestamp': pd.date_range(start='2023-01-01', periods=100, freq='D'),
    'Vibration Frequency': np.random.rand(100),
    'Vibration Amplitude': np.random.rand(100),
    'Bearing Temperature': np.random.rand(100) * 100
})

data.set_index('Timestamp', inplace=True)


def build_model(input_shape):
    """Build a simple neural network model."""
    inputs = Input(shape=(input_shape,))
    x = Dense(128, activation='relu')(inputs)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(1, activation='linear')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

## 5. Train Model
The data is scaled and split into training and test sets. We train the neural network for each target column, saving models and tools for future use, and stopping early if the model starts overfitting.


In [2]:
# Train Model Function
def train_model(data, target_column):
    # Preprocess the data
    print(f"Preprocessing data for {target_column}...")
    data = preprocess_data(data)

    # Check the structure of the processed data
    print("Processed Data Columns:", data.columns)

    model_dir = '/content/drive/MyDrive/KSU/RCapstone/feature-prediction/models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Select all numeric columns for features
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    print("Numeric Columns Available for Features:", numeric_columns)

    # Ensure target_column exists in numeric columns
    if target_column not in numeric_columns:
        raise ValueError(f"Target column '{target_column}' is not numeric or does not exist.")

    # Drop the target column from features
    X = data[numeric_columns].drop(columns=[target_column])
    y = data[target_column]  # Ensure target is numeric

    # Print shapes of X and y before scaling
    print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

    # Scale features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)  # Scale only numeric features

    # Train-test split (in time series, consider last n as test)
    split_index = int(len(X_scaled) * 0.8)  # 80% for training
    X_train, X_test = X_scaled[:split_index], X_scaled[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    # Print shapes of training and testing sets
    print(f"Shape of X_train: {X_train.shape}, Shape of y_train: {y_train.shape}")

    model = build_model(X_train.shape[1])  # Input shape for the model
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    print(f"Starting training for {target_column}...")
    history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                        validation_split=0.2, callbacks=[early_stopping], verbose=1)

    print(f"Training finished for {target_column}.")

    # Save the model and feature names
    model.save(f'{model_dir}/{target_column}_model.keras')
    joblib.dump(X.columns.tolist(), f'{model_dir}/{target_column}_feature_names.pkl')

    return model, scaler, history  # Return history as well


In [50]:
import pandas as pd
import numpy as np
import joblib
import os
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from IPython.display import display
from google.colab import drive

# Load the Drive
drive.mount('/content/drive')

# Load Data Function
def load_data(file_path):
    """Load data from an Excel file and display the first 10 rows."""
    try:
        data = pd.read_excel(file_path, engine='openpyxl')
        print("Data loaded successfully.")
        print("First 10 rows of the data:")
        display(data.head(10))
        return data
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Preprocess Data Function
def preprocess_data(data):
    """Preprocess the data: handle numeric columns, create lagged features, and fill NaN values."""
    # Convert Timestamp column to datetime
    data['Timestamp'] = pd.to_datetime(data['Timestamp'], errors='coerce')

    # Check for any NaT values
    if data['Timestamp'].isna().any():
        print("Invalid datetime entries found in the Timestamp column:")
        print(data[data['Timestamp'].isna()])  # Display rows with invalid datetime values

    # Set Timestamp as index
    data.set_index('Timestamp', inplace=True)

    # Create lagged features for 'Vibration Frequency' (can create for other features as needed)
    for lag in range(1, 5):
        data[f'Vibration Frequency lag {lag}'] = data['Vibration Frequency'].shift(lag)

    # Convert all columns to numeric
    data = data.apply(pd.to_numeric, errors='coerce')
    data.fillna(0, inplace=True)

    # Drop rows with NaN values created by shifting
    data.dropna(inplace=True)

    return data

# Build Model Function
def build_model(input_shape):
    """Build a simple neural network model."""
    inputs = Input(shape=(input_shape,))
    x = Dense(128, activation='relu')(inputs)
    x = Dense(64, activation='relu')(x)
    outputs = Dense(1, activation='linear')(x)
    model = Model(inputs, outputs)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Train Model Function
def train_model(data, target_column):
    # Preprocess the data
    data = preprocess_data(data)

    # Check the structure of the processed data
    print("Processed Data Columns:", data.columns)

    model_dir = '/content/drive/MyDrive/KSU/RCapstone/feature-prediction/models'
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    # Select only numeric columns for X
    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    print("Numeric Columns Available for Features:", numeric_columns)

    # Ensure target_column exists in numeric columns
    if target_column not in numeric_columns:
        raise ValueError(f"Target column '{target_column}' is not numeric or does not exist.")

    # Drop the target column from features (do not drop Timestamp)
    X = data[numeric_columns].drop(columns=[target_column])
    y = data[target_column]  # Ensure target is numeric

    # Scale features
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)  # Scale only numeric features

    # Train-test split (in time series, consider last n as test)
    split_index = int(len(X_scaled) * 0.8)  # 80% for training
    X_train, X_test = X_scaled[:split_index], X_scaled[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]

    model = build_model(X_train.shape[1])  # Input shape for the model
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    print(f"Starting training for {target_column}...")
    history = model.fit(X_train, y_train, epochs=100, batch_size=32,
                        validation_split=0.2, callbacks=[early_stopping], verbose=1)

    print(f"Training finished for {target_column}.")

    # Save the model and feature names
    model.save(f'{model_dir}/{target_column}_model.keras')
    joblib.dump(X.columns.tolist(), f'{model_dir}/{target_column}_feature_names.pkl')

    return model, scaler, history

# Function to Plot Training History
def plot_all_histories(histories):
    """Plot the training and validation loss over epochs for each model after training."""
    for target, history in histories.items():
        plt.figure(figsize=(12, 6))
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'Training and Validation Loss for {target}')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.show()

# Predict Last Rows Function
def predict_last_rows(models, data, scalers, target_columns):
    """Predict the last 10 rows of the training data and compare predictions to actual values."""
    last_n_rows = data.tail(10)
    results = []

    for target in target_columns:
        # Load feature names
        feature_names = joblib.load(f'{model_dir}/{target}_feature_names.pkl')

        # Prepare the last row features
        X_last = last_n_rows[feature_names].values
        X_last_scaled = scalers[target].transform(X_last)

        # Make the prediction
        prediction = models[target].predict(X_last_scaled)

        # Store predictions
        results.append((target, prediction))

    # Create a DataFrame from results
    results_df = pd.DataFrame(results, columns=['Target', 'Prediction'])
    print("Predicted values for the last 10 rows:")
    display(results_df)

# Predict Next Rows Function
def predict_next_rows(models, data, scalers, target_columns, num_predictions=10):
    """Predict the next num_predictions rows based on the trained data."""

    last_row = data.tail(1).copy()  # Use the last row to predict new values
    predicted_results = []

    for target in target_columns:
        # Load feature names
        feature_names = joblib.load(f'{model_dir}/{target}_feature_names.pkl')

        # Prepare for predictions
        for _ in range(num_predictions):
            # Select the last row features
            X_last = last_row[feature_names].values
            X_last_scaled = scalers[target].transform(X_last)

            # Make the prediction
            prediction = models[target].predict(X_last_scaled)

            # Store the predicted value
            predicted_results.append((target, prediction[0][0]))

            # Update the last_row for the next prediction
            last_row = pd.DataFrame([last_row.values.flatten().tolist() + [prediction[0][0]]], columns=last_row.columns)
            last_row.index = [pd.to_datetime(last_row.index[0]) + pd.Timedelta(minutes=5)]  # Increment timestamp

    # Create a DataFrame from the predictions
    predicted_df = pd.DataFrame(predicted_results, columns=['Target', 'Prediction'])
    print(f"Predicted next {num_predictions} rows based on the trained data:")
    display(predicted_df)

# Train models for all target columns
target_columns = [
    'Vibration Frequency',
    'Vibration Amplitude',
    'Bearing Temperature',
    'Motor Temperature',
    'Belt Load',
    'Torque',
    'Noise Levels',
    'Current and Voltage',
    'Hydraulic Pressure',
    'Belt Thickness',
    'Roller Condition'
]

models = {}
scalers = {}
histories = {}  # Initialize the histories dictionary

# Track any errors during training
for target in target_columns:
    print(f"Training model for {target}...")
    try:
        model, scaler, history = train_model(data, target)
        models[target] = model
        scalers[target] = scaler
        histories[target] = history  # Store the history for each model

    except KeyError as e:
        print(f"KeyError: {e} - likely due to accessing a dropped column or index.")
    except Exception as e:
        print(f"Error while training model for {target}: {e}")

# Plot all training histories after all models have been trained
plot_all_histories(histories)

# Predicting the last 10


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Training model for Vibration Frequency...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Vibration Amplitude...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Bearing Temperature...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Motor Temperature...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Belt Load...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Torque...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Noise Levels...
KeyError: 'Timestamp' - likely due to accessing a dropped column or index.
Training model for Current and Voltage...
KeyError: 'Timestamp' - likely due to acce

# 6. Display Training History

This step shows the loss (errors) during training and validation to help us understand how well the model is learning.

In [47]:
def plot_history(history, target):
    """Plot the training and validation loss over epochs for each model."""
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title(f'Training and Validation Loss for {target}')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

for target, history in histories.items():
    plot_history(history, target)


# 7.Evaluate Model

The model predicts the last 10 rows of training data, compares predictions to actual values, and uses colors to highlight how close the predictions are to the real values.

In [7]:
# 7. Evaluate Model
def predict_last_rows(models, data, scalers, target_columns):
    """Predict the last 10 rows of the training data and compare predictions to actual values."""
    last_n_rows = data.tail(10)
    results = []
    model_dir = '/content/drive/MyDrive/KSU/RCapstone/feature-prediction/models'

    for i in range(len(last_n_rows)):
        original_row = {'Type': 'Original'}
        predicted_row = {'Type': 'Predicted'}

        for target in target_columns:
            feature_names = joblib.load(f'{model_dir}/{target}_feature_names.pkl')
            X_last = last_n_rows[feature_names].iloc[[i]]
            X_last_scaled = scalers[target].transform(X_last)
            prediction = models[target].predict(X_last_scaled)

            original_row[target] = last_n_rows.iloc[i][target]
            predicted_row[target] = prediction[0]

        results.append(original_row)
        results.append(predicted_row)

    results_df = pd.DataFrame(results)
    display(results_df)

# Use the function to evaluate the predictions
predict_last_rows(models, processed_data, scalers, target_columns)


NameError: name 'models' is not defined

## 8. Predict Next 10 rows

Based on the last row of data, the model predicts the next 10 rows for each column and displays the predicted results.

In [None]:
def predict_next_rows_from_training(models, data, scalers, target_columns, num_predictions=10):
    """Predict the next num_predictions rows based on the trained data, using previous predictions."""

    # Convert the timestamp column to datetime if it's still in string format
    if data.index.dtype == 'object':  # Check if index is still string
        data.index = pd.to_datetime(data.index)

    # Use the last row to predict new values
    last_row = data.tail(1).copy()  # Get the last row for predictions
    model_dir = '/content/drive/MyDrive/KSU/RCapstone/feature-prediction/models'
    predicted_results = []  # To store the results

    # Predict for the specified number of future time steps
    for i in range(num_predictions):
        predicted_row = {}

        for target in target_columns:
            feature_names = joblib.load(f'{model_dir}/{target}_feature_names.pkl')
            X_last = last_row[feature_names]

            # Scale the features
            X_last_scaled = scalers[target].transform(X_last)

            # Make the prediction
            prediction = models[target].predict(X_last_scaled)

            # Extract the predicted value
            predicted_value = prediction[0].item() if isinstance(prediction[0], (list, np.ndarray)) else prediction[0]
            predicted_row[target] = predicted_value

            # Update the last_row for the next prediction
            last_row[target] = predicted_value  # This will help to maintain continuity

        # Create a new timestamp for each prediction
        new_timestamp = last_row.index[0] + pd.Timedelta(minutes=5)  # Ensure index is accessed correctly
        predicted_row['timestamp'] = new_timestamp

        # Append predicted_row to results
        predicted_results.append(predicted_row)

        # Update the last_row to include the new prediction for the next iteration
        last_row = pd.DataFrame([predicted_row]).set_index('timestamp')

    # Create a DataFrame from the predicted results
    predicted_df = pd.DataFrame(predicted_results)
    predicted_df.set_index('timestamp', inplace=True)

    print(f"Predicted next {num_predictions} rows based on the trained data:")
    display(predicted_df)  # Use display to show the DataFrame

    return predicted_df

# Call the function to get predicted results for the next rows
predicted_next_df = predict_next_rows_from_training(models, processed_data, scalers, target_columns, num_predictions=10)
