## Import necessary packages

In [None]:
import numpy as np
import pandas as pd
import holoviews as hv
hv.extension('bokeh')
from IPython.display import display
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from tabpfn import TabPFNRegressor
import matplotlib.pyplot as plt

## Load dataset

In [None]:
# Parquet file holding the data used by the imputation experiments below.
path = "/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/han/dataframe_engine1_2_forimputation"
df = pd.read_parquet(path)

# The fr_eng threshold does not depend on which target column is evaluated,
# so the row filter is applied once here rather than inside the per-column
# loops; the surviving rows are then ordered by time.
min_fr_eng = 10 / 60
above_threshold = df['fr_eng'] > min_fr_eng
filtered_df = df.loc[above_threshold].sort_values(by='time')

## Mean imputer

In [None]:
from sklearn.impute import SimpleImputer

# Choose a variable as y and other variables as X
target_column_list = ['te_air_scav_rec', 'pr_baro', 'pd_air_ic__0', 'pr_exh_rec', 'pr_air_scav_ecs', 're_eng_load', 'te_seawater']
column_dfs = []      # one frame per usable target: rows where that target is observed
usable_targets = []  # targets that exist in filtered_df and have at least one value

for column in target_column_list:
    if column not in filtered_df.columns:
        print(f"  Skipping {column} in current df: column missing.")
        continue

    # Drop rows with NaN target value
    df_col = filtered_df.dropna(subset=[column])
    if df_col.empty:
        print(f"  Skipping {column} in current df: no data after dropna.")
        continue

    column_dfs.append(df_col)
    usable_targets.append(column)


if not column_dfs:
    raise ValueError("No valid dataframes to concatenate. Check filtering conditions or target columns.")

# Keep every row that has at least one observed target exactly once, in the
# (time-sorted) order of filtered_df. Stacking the per-target frames with
# pd.concat would repeat a row once per observed target, so the positional
# 80/20 split below would place copies of the same rows in both train and
# test (leakage) and the fitted means would be weighted by the duplicates.
combined_df = filtered_df.dropna(subset=usable_targets, how='all').reset_index(drop=True)
df_length = len(combined_df)

# Train-test split: the first 80% of rows train the imputer, the rest is held out
split_at = int(df_length * 0.8)
train_df = combined_df.iloc[:split_at]
test_df = combined_df.iloc[split_at:]

# Drop non-numeric columns, and columns with no observed value in train:
# SimpleImputer discards all-NaN columns by default, which would misalign
# its output with train_numeric.columns.
numeric_cols = train_df.select_dtypes(include='number').columns
numeric_cols = [col for col in numeric_cols if train_df[col].notnull().any()]

train_numeric = train_df[numeric_cols]
test_numeric = test_df[numeric_cols].copy()

# Learn the per-column means from the training rows only
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_numeric)


# Evaluate the fitted mean imputer on each target column: hide up to 20
# observed test values, impute them, and compare against the true values.
# A seeded generator keeps the sampled rows (and therefore the reported
# metrics) identical between runs.
rng = np.random.default_rng(42)

for col in target_column_list:
    print(f"\n==== Processing target column: {col} ====")
    if col not in train_numeric.columns:
        print(f"Skipping {col}: not numeric or no valid values in train")
        continue

    # Pick non-missing test values
    observed_idx = test_numeric.index[test_numeric[col].notnull()]
    if observed_idx.empty:
        print(f"Skipping {col}: no non-missing values in test")
        continue

    mask_size = min(20, len(observed_idx))
    mask_idx = rng.choice(observed_idx.to_numpy(), size=mask_size, replace=False)

    # Save true values, then mask them on a copy of just the sampled rows so
    # test_numeric itself is left intact for the remaining targets.
    true_values = test_numeric.loc[mask_idx, col].copy()
    masked_rows = test_numeric.loc[mask_idx].copy()
    masked_rows[col] = np.nan

    # Impute only the sampled rows; the fitted means do not depend on which
    # rows are passed to transform, so the rest of the test set is not needed.
    imputed = pd.DataFrame(
        imputer.transform(masked_rows),
        columns=train_numeric.columns,
        index=masked_rows.index
    )
    imputed_values = imputed[col]

    # Evaluate the imputation
    r2 = r2_score(true_values, imputed_values)
    mse = mean_squared_error(true_values, imputed_values)
    mae = mean_absolute_error(true_values, imputed_values)
    rmse = np.sqrt(mse)

    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R² Score:", r2)

    # Create figure and axes: main scatter plus marginal histograms that
    # share the corresponding axis with the scatter plot
    fig = plt.figure(figsize=(8, 10))
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.5)

    main_ax = fig.add_subplot(grid[1:4, 0:3])
    x_hist = fig.add_subplot(grid[0, 0:3], sharex=main_ax)
    y_hist = fig.add_subplot(grid[1:4, 3], sharey=main_ax)

    # Scatter plot with R²
    main_ax.scatter(true_values, imputed_values, alpha=0.6)
    main_ax.set_xlabel('Original Data')
    main_ax.set_ylabel('Predicted Data')
    main_ax.set_title(f'R² Plot (R² = {r2:.2f})')

    # Add y = x line spanning the joint range of true and imputed values
    min_val = min(true_values.min(), imputed_values.min())
    max_val = max(true_values.max(), imputed_values.max())
    main_ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='y = x')
    main_ax.legend()

    # Histograms
    x_hist.hist(true_values, bins=20, alpha=0.7)
    y_hist.hist(imputed_values, bins=20, orientation='horizontal', alpha=0.7)

    # Show the plot
    plt.show()

## Linear regression model

In [None]:
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

df = filtered_df.copy()

# Impute missing features, standardize them, then fit an ordinary linear
# regression. The same pipeline object is refit from scratch for each target.
pipe = Pipeline([
    ("imputer", IterativeImputer(max_iter=10, random_state=0)),  # iterative imputation
    ("scaler", StandardScaler()),                                # standardization
    ("regressor", LinearRegression())                            # linear regression
])

for column in target_column_list:
    print(f"\n==== Processing target column: {column} ====")
    # Skip targets absent from the data instead of failing with a KeyError
    # in dropna (same handling as the mean-imputer cell).
    if column not in df.columns:
        print(f"Skipping {column}: column missing.")
        continue

    # Drop rows where current target is missing
    df_col = df.dropna(subset=[column])
    if df_col.empty:
        print(f"Skipping {column}: no data after dropna.")
        continue

    # Feature/target split. Only numeric/boolean columns are kept as features
    # because the imputer and scaler cannot handle other dtypes.
    X = df_col.drop(columns=[column, "time"]).select_dtypes(include=['number', 'bool'])
    y = df_col[column]

    # Train/test split
    # NOTE(review): this is a shuffled split on time-ordered data; confirm
    # that a random hold-out (rather than a chronological one) is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit model
    pipe.fit(X_train, y_train)

    # Predictions
    predictions = pipe.predict(X_test)

    # Metrics
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mse)

    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R² Score:", r2)

    # Create figure and axes: main scatter plus marginal histograms that
    # share the corresponding axis with the scatter plot
    fig = plt.figure(figsize=(10, 8))
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.5)

    main_ax = fig.add_subplot(grid[1:4, 0:3])
    x_hist = fig.add_subplot(grid[0, 0:3], sharex=main_ax)
    y_hist = fig.add_subplot(grid[1:4, 3], sharey=main_ax)

    # Scatter plot with R²
    main_ax.scatter(y_test, predictions, alpha=0.6)
    main_ax.set_xlabel('Original Data')
    main_ax.set_ylabel('Predicted Data')
    main_ax.set_title(f'R² Plot (R² = {r2:.2f})')

    # Add y = x line spanning the joint range of true and predicted values
    min_val = min(y_test.min(), predictions.min())
    max_val = max(y_test.max(), predictions.max())
    main_ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='y = x')
    main_ax.legend()

    # Histograms
    x_hist.hist(y_test, bins=20, alpha=0.7)
    y_hist.hist(predictions, bins=20, orientation='horizontal', alpha=0.7)

    # Show the plot
    plt.show()

## Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

df = filtered_df.copy()

# Random Forest model
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Impute missing features, scale them, then fit the forest. The same
# pipeline object is refit from scratch for each target.
pipe = Pipeline([
    ("imputer", IterativeImputer(max_iter=10, random_state=0)),  # iterative imputation
    ("scaler", StandardScaler()),                                # scaling (not strictly needed for RF, but harmless)
    ("regressor", rf_model)                                      # random forest
])

for column in target_column_list:
    print(f"\n==== Processing target column: {column} ====")
    # Skip targets absent from the data instead of failing with a KeyError
    # in dropna (same handling as the mean-imputer cell).
    if column not in df.columns:
        print(f"Skipping {column}: column missing.")
        continue

    # Drop rows where current target is missing
    df_col = df.dropna(subset=[column])
    if df_col.empty:
        print(f"Skipping {column}: no data after dropna.")
        continue

    # Feature/target split. Only numeric/boolean columns are kept as features
    # because the imputer and scaler cannot handle other dtypes.
    X = df_col.drop(columns=[column, "time"]).select_dtypes(include=['number', 'bool'])
    y = df_col[column]

    # Train/test split
    # NOTE(review): this is a shuffled split on time-ordered data; confirm
    # that a random hold-out (rather than a chronological one) is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit model
    pipe.fit(X_train, y_train)

    # Predictions
    predictions = pipe.predict(X_test)

    # Evaluate the model
    r2 = r2_score(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mse)

    print("Mean Squared Error (MSE):", mse)
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R² Score:", r2)

    # Create figure and axes: main scatter plus marginal histograms that
    # share the corresponding axis with the scatter plot
    fig = plt.figure(figsize=(8, 8))
    grid = plt.GridSpec(4, 4, hspace=0.5, wspace=0.5)

    main_ax = fig.add_subplot(grid[1:4, 0:3])
    x_hist = fig.add_subplot(grid[0, 0:3], sharex=main_ax)
    y_hist = fig.add_subplot(grid[1:4, 3], sharey=main_ax)

    # Scatter plot with R²
    main_ax.scatter(y_test, predictions, alpha=0.6)
    main_ax.set_xlabel('Original Data')
    main_ax.set_ylabel('Predicted Data')
    main_ax.set_title(f'R² Plot (R² = {r2:.2f})')

    # Add y = x line spanning the joint range of true and predicted values
    min_val = min(y_test.min(), predictions.min())
    max_val = max(y_test.max(), predictions.max())
    main_ax.plot([min_val, max_val], [min_val, max_val], 'r--', label='y = x')
    main_ax.legend()

    # Histograms
    x_hist.hist(y_test, bins=20, alpha=0.7)
    y_hist.hist(predictions, bins=20, orientation='horizontal', alpha=0.7)

    # Show the plot
    plt.show()