In [173]:
# --- INSTALLING REQUIRED PACKAGES ---
# Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Pre-processing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Model libraries
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import xgboost as xgb

In [356]:
# --- LOAD DATA AND REMOVE UNWANTED ROWS AND COLUMNS ---
def prep_csv(file_path):
    
    data = pd.read_csv(file_path, header=None)
    
    # Name stored in line 2, removing all other header lines
    data_red_row = data.drop(index=[0]+list(range(2,11))).reset_index(drop=True) 
    
    # First column contains time stamps and is saved separately and used for indexing in DataFrame
    time_stamp = data_red_row.iloc[1:, 0]
    
    # Remove every third column starting from the third
    data_red_col = data_red_row.iloc[:, 2::3]

    # Get variable names from line 2 (index 1)
    column_names = data_red_col.iloc[0].values
    data_red_col = data_red_col.drop(data_red_col.index[0], axis=0)
    
    # Setting new indices and column names
    data_red = data_red_col.set_index(time_stamp)
    data_red.columns = column_names
    
    return data_red, time_stamp


# --- SPLIT AND PRE-PROCESS DATA ---
def split(data,testsize):
    """
    Preprocess the data by splitting into features (X) and target (y),
    and splitting into training and test sets, assuming y is the last column of "data", 
    and assigning a test size "testsize". 
    Optional feature scaling by assigning scaling = 1 (ON) or = 0 (OFF). 
    """
    
    # Assume the last column is the target variable
    X = np.array(data.iloc[:, :-1])
    y = np.array(data.iloc[:, -1])

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testsize, shuffle=False, random_state=42)

    return X_train, X_test, y_train, y_test

def scale(X_train, X_test):   
    # Feature scaling
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test

# --- TRAIN AND EVALUATE REGRESSION MODELS ---
def evaluate_models(X_train, X_test, y_train, y_test):
    """
    Train different regression models 
        - Linear regression
        - Decision Tree
        - Random Forest
        - Support Vector Machine
        - Gradient Boosting
        - XGBoost
        - Nearest Neighbor
    and evaluate their performance on the test data using metrics MAE, MSE, RMSE, R2.
    """

    # Define models
    models = {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest": RandomForestRegressor(),
        "Support Vector Machine": SVR(),
        "Gradient Boosting": GradientBoostingRegressor(),
        "XGBoost": xgb.XGBRegressor(),
        "Nearest neighbor": KNeighborsRegressor()
    }

    # Evaluate each model
    results = {}
    predictions = {}

    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Store predictions for plotting later
        predictions[name] = y_pred

        # Calculate evaluation metrics
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, y_pred)

        # Store the results
        results[name] = {
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "R2": r2
        }

    return results, predictions

# --- PLOT RESULTS ---
def plot_predictions(y_test, predictions):
    """
    Generates two plots: (1) a time series plot of the actual and predicted values for each 
    model, evaluated on test data, and (2) a true vs predicted scatter plot to evaluate under or 
    overestimated ranges. 
    """
    # TIME SERIES PLOT
    # Defining figure size and axis
    plt.figure(figsize=(12, 8))
    ax = np.linspace(0, len(y_test)-1, num=len(y_test))

    # Plot test data
    plt.plot(ax,y_test, label="Actual", color="black", linestyle="--", linewidth=2)

    # Plot predictions from each model
    for model_name, y_pred in predictions.items():
        plt.plot(ax,y_pred, label=f"{model_name} Prediction")

    # Plotting the plot with titles and lables
    plt.title("Model outputs and true values")
    plt.xlabel("Time Step")
    plt.legend()
    plt.grid(True)
    plt.show()

    # 2 TRUE VS. PREDICTED
    # Defining second figure and axises
    plt.figure(figsize=(12, 8))
    ax = np.linspace(min(y_test), max(y_test), num=len(y_test))

    # Plot the ideal predictor (y=x)
    plt.plot(ax,ax, label="Ideal predictor", color="black", linestyle="-", linewidth=2)

    # Plot the true predictions vs predictions from each model
    for model_name, y_pred in predictions.items():
        plt.scatter(y_test,y_pred, label=f"{model_name} prediction")

    # Plotting the plot with titles and lables
    plt.title("True vs. Predicted")
    plt.legend()
    plt.grid(True)
    plt.show()

# --- MAIN FUNCTION ---
def main(file_path):
    """
    Main function to load data, preprocess, train and evaluate models, and plot predictions.
    """
    # LOAD AND PREP the data
    data_red, time_stamp = prep_csv(file_path)

    # SPLIT the data
    X_train, X_test, y_train, y_test = split(data_red,0.2)

    # PRE-PROCESS the data
#    X_train, X_test = scale(X_train,X_test)

    # TRAIN models
#    results, predictions = evaluate_models(X_train, X_test, y_train, y_test)

    # EVALUATE models
#    for model_name, metrics in results.items():
#        print(f"\n{model_name} Results:")
#        for metric, value in metrics.items():
#            print(f"{metric}: {value:.4f}")

    # PLOT predictions vs actual data
#    plot_predictions(y_test, predictions)

    return X_train, y_train

In [358]:
# --- RUN SCRIPT ---
if __name__ == "__main__":
    # Provide the path to the CSV file here
    file_path = "test_data.csv"
    X_train, y_train = main(file_path)

In [365]:
X_train

array([['3.29718661', '1332.858276', '51.95389557', ..., '0.01311027',
        '39.09383774', '2.56769371'],
       ['3.34081054', '1337.174316', '51.62177658', ..., '0.01652091',
        '30.37085533', '2.55867982'],
       ['3.37642503', '1335.556396', '51.21917725', ..., '0.01356117',
        '35.04940033', '2.54900098'],
       ...,
       ['3.15371203', '1342.440308', '48.04307175', ..., '-0.00076618',
        '17.42031097', '3.01304388'],
       ['3.21875119', '1346.264526', '48.35634232', ..., '-0.00208029',
        '21.13958168', '2.94233251'],
       ['3.18850064', '1346.939697', '48.12757492', ..., '-0.00215985',
        '22.03313255', '2.9848628']], dtype=object)

In [5]:
data_red_col

Unnamed: 0,0,3
0,1,4
1,1,4
2,1,4
