## Import necessary packages

In [None]:
import pandas as pd
!pip install xgboost
import xgboost
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
import numpy as np
# Lift pandas' display limits (columns, rows, cell width) so frames and
# Series are printed in full instead of being truncated.
for display_option in (
    'display.max_columns',
    'display.max_rows',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

## Load dataset for XGBoost

In [None]:
# Parquet exports to merge into a single frame.
paths = [
    "/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/han/dataframe_engine1-1",
    "/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/han/dataframe_engine1-2",
    "/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/han/dataframe_engine1-3",
]

# Read every export into its own DataFrame.
dfs = list(map(pd.read_parquet, paths))

# Stack the frames, order the rows chronologically by the "time" column,
# and renumber the index so it matches the sorted row positions.
combined_df = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(by="time")
    .reset_index(drop=True)
)

In [None]:
# Report the columns that are entirely missing, i.e. whose NaN count equals
# the number of rows. The per-column NaN counts are computed once and reused.
combined_na_counts = combined_df.isna().sum()
print(combined_na_counts[combined_na_counts == len(combined_df)])


## Applying XGBoost for feature importance

In [None]:
# Load the separate "dataframe_oldid" parquet export into its own frame.
path = "/home/ec2-user/SageMaker/sensor-imputation-thesis/src/sensor_imputation_thesis/han/dataframe_oldid"
df = pd.read_parquet(path=path)

In [None]:
# Report the columns that are entirely missing, i.e. whose NaN count equals
# the number of rows. The per-column NaN counts are computed once and reused.
df_na_counts = df.isna().sum()
print(df_na_counts[df_na_counts == len(df)])

In [None]:
# Keep only the rows recorded while the engine is running, i.e. where
# 'fr_eng' exceeds 10/60 (presumably 10 rpm expressed in Hz -- TODO confirm).
# NaNs in the other columns are deliberately kept because XGBoost handles
# missing values itself; rows where 'fr_eng' is NaN fail the comparison
# and are therefore excluded.
running_mask = combined_df['fr_eng'].gt(10 / 60)
filtered_df = combined_df.loc[running_mask]

In [None]:
# For each target column, fit an XGBoost regressor on all remaining columns
# (except 'time') and report the features that together account for
# `threshold` of the model's total feature importance.
target_column_list = ['te_air_scav_rec', 'pr_baro', 'pd_air_ic__0', 'pr_exh_rec', 'pr_air_scav_ecs', 're_eng_load', 'te_seawater', 'bo_aux_blower_running', 'in_engine_running_mode']

# Define cumulative importance threshold
threshold = 0.95

# Chronological 8:2 split point. filtered_df keeps the time ordering of
# combined_df, so the first 80% of rows are the oldest ones. The value does
# not depend on the target, so it is computed once outside the loop.
train_size = int(len(filtered_df) * 0.8)

for column in target_column_list:
    print(f"\nTraining for target column: {column}")

    y = filtered_df[column]

    # Skip targets without a single observed value (also true for an empty frame)
    if y.isna().all():
        print(f"Skipping {column}: no available data.")
        continue

    # Drop target column and 'time' from features
    X = filtered_df.drop(columns=[column, 'time'])

    # Positional split for both X and y; filtered_df's index has gaps after
    # filtering, so .iloc makes the positional intent explicit.
    # The test split is not used further in this cell.
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    # The regressor cannot be trained on missing or infinite labels
    mask = y_train.notna() & np.isfinite(y_train)
    X_train_clean = X_train[mask]
    y_train_clean = y_train[mask]
    print(f"Dropped {len(y_train) - len(y_train_clean)} rows due to NaN or Inf in target.")

    # The target may have values overall but none in the training window;
    # fitting on zero rows would fail, so skip it explicitly.
    if y_train_clean.empty:
        print(f"Skipping {column}: no usable target values in the training split.")
        continue

    # Define and train model
    model = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(X_train_clean, y_train_clean)

    # Get importance and normalize (already normalized to sum = 1)
    importance = model.feature_importances_

    # One row per feature, most important first
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importance
    }).sort_values(by='Importance', ascending=False)

    # Add cumulative importance ratio
    importance_df['Cumulative'] = importance_df['Importance'].cumsum()

    # Keep every feature up to and including the one whose cumulative share
    # first reaches the threshold: a row is kept while the cumulative
    # importance of the rows before it is still below the threshold.
    # Filtering on `Cumulative <= threshold` would drop the crossing feature,
    # so the selection would cover less than the threshold and would be empty
    # whenever the top feature alone exceeds it.
    cumulative_before = importance_df['Cumulative'].shift(fill_value=0.0)
    top_features_df = importance_df[cumulative_before < threshold]

    print(f"\nTop features covering {threshold*100:.0f}% of total importance:")
    print(top_features_df)

    # Horizontal bar chart with the most important feature at the top
    plt.figure(figsize=(8, 6))
    plt.barh(top_features_df['Feature'], top_features_df['Importance'])
    plt.gca().invert_yaxis()
    plt.title(f"Top {threshold*100:.0f}% Feature Importance for {column}")
    plt.xlabel("Importance Score")
    plt.tight_layout()
    plt.show()