In [1]:
# Task 5: test-set prediction
# The XGBoost regression model, which gave the best predictions, was selected for the final prediction

In [None]:
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor, XGBClassifier

In [2]:
# Define feature extraction function
def extract_statistical_features(df):
    """Summarise each case's P1..P7 readings into one feature row.

    Rows of ``df`` are grouped by ``Case#``. For every group the mean,
    standard deviation, minimum and maximum of each of the columns
    ``P1`` .. ``P7`` are computed, and the ``Spacecraft#`` value of the
    group's first row is carried over.

    Returns a DataFrame with one row per case and the columns ``Case#``,
    ``P1_mean``, ``P1_std``, ``P1_min``, ``P1_max``, ..., ``P7_max``,
    ``Spacecraft#`` (in that order).
    """
    pressure_columns = [f"P{i}" for i in range(1, 8)]
    statistics = ("mean", "std", "min", "max")

    records = []
    for case_id, case_rows in df.groupby("Case#"):
        record = {"Case#": case_id}
        for column in pressure_columns:
            readings = case_rows[column]
            for stat in statistics:
                # Dispatches to readings.mean(), readings.std(), ...
                record[f"{column}_{stat}"] = getattr(readings, stat)()
        record["Spacecraft#"] = case_rows["Spacecraft#"].iloc[0]
        records.append(record)

    return pd.DataFrame(records)

In [3]:
# Define label extraction function
def extract_labels(df):
    """Derive the five task labels for every case in ``df``.

    Only the first row of each ``Case#`` is inspected. The labels are:

    * ``task1`` - 0 when ``Condition`` is "normal" (case-insensitive), else 1.
    * ``task2`` - 0 normal, 2 bubble fault, 3 valve fault, 1 any other fault.
    * ``task3`` - bubble position: 1..7 for ``Bubble_BP1``..``Bubble_BP7``,
      8 for ``Bubble_BV1``; 0 when the case is not a bubble fault.
    * ``task4`` - index (1..4) of the ``Opening_Ratio_SV*`` column with the
      smallest value; 0 when the case is not a valve fault.
    * ``task5`` - that smallest opening ratio as a float; 100.0 otherwise.

    Returns a dict mapping each case id to a dict with the keys
    ``task1`` .. ``task5``. Cases appear in order of first occurrence.
    """
    bubble_cols = [f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]
    valve_cols = [f"Opening_Ratio_SV{i}" for i in range(1, 5)]

    label_dict = {}
    # One pass over the frame instead of re-filtering it for every case id;
    # sort=False keeps the cases in order of first appearance.
    for cid, group in df.groupby("Case#", sort=False):
        d = group.iloc[0]
        task1 = 0 if d["Condition"].lower() == "normal" else 1

        # Start every case from the "nothing wrong" defaults so that no label
        # can leak in from the previously processed case.
        task2, task3, task4, task5 = 0, 0, 0, 100.0

        if task1 == 1:
            if d[bubble_cols].sum() > 0:
                task2 = 2
                task3 = np.argmax(d[bubble_cols].values) + 1
            elif d[valve_cols].min() < 100:
                task2 = 3
                sv_values = [d[col] for col in valve_cols]
                task4 = np.argmin(sv_values) + 1
                task5 = float(min(sv_values))
            else:
                # Abnormal, but neither a bubble nor a valve fault.
                task2 = 1

        label_dict[cid] = {
            "task1": task1,
            "task2": task2,
            "task3": task3,
            "task4": task4,
            "task5": task5
        }
    return label_dict


In [4]:
# 1. Load and process training data
print("Training data is being loaded and processed...")
df = pd.read_csv("merged_dataset.csv")

# Recode the bubble indicator columns from "Yes"/"No" to 1/0
yes_no_to_int = {"No": 0, "Yes": 1}
bubble_columns = [f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]
for col in bubble_columns:
    df[col] = df[col].replace(yes_no_to_int).astype(int)

# Per-case feature table and per-case label dictionary
features_df = extract_statistical_features(df)
label_dict = extract_labels(df)

Training data is being loaded and processed...


In [5]:
# 2. Training a classification model for Tasks 1-4
print("The classification model is being trained....")

# Task 1: fault detection (label 0 = normal, 1 = abnormal)
features_df["label"] = features_df["Case#"].map(lambda case_id: label_dict[case_id]["task1"])
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

# Stratified 80/20 hold-out split
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

xgb1 = XGBClassifier(eval_metric="logloss")
xgb1.fit(X_train1, y_train1)

# Predict once and reuse the result for both reports
y_pred1 = xgb1.predict(X_test1)
print("[task 1 - XGBoost]")
print("accuracy:", accuracy_score(y_test1, y_pred1))
print(classification_report(y_test1, y_pred1, zero_division=0))

The classification model is being trained....
[task 1 - XGBoost]
accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       0.88      1.00      0.93        21
           1       1.00      0.80      0.89        15

    accuracy                           0.92        36
   macro avg       0.94      0.90      0.91        36
weighted avg       0.93      0.92      0.91        36



In [6]:
# Task2: Type of fault
# (LabelEncoder is imported in the import cell at the top of the notebook.)
features_df["label"] = [label_dict[cid]["task2"] for cid in features_df["Case#"]]
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

print("Task 2 Raw Labeled Values:", np.unique(y))

# The raw task-2 labels need not be contiguous, so they are re-indexed to
# 0..n_classes-1 before being handed to the classifier.
le = LabelEncoder()
y = le.fit_transform(y)
print("Task 2 Encoded Tagged Values:", np.unique(y))

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

xgb2 = XGBClassifier(eval_metric="mlogloss")
xgb2.fit(X_train2, y_train2)

# Predict once and reuse the result for both reports
y_pred2 = xgb2.predict(X_test2)
print("[Task 2 - XGBoost]")
print("accuracy:", accuracy_score(y_test2, y_pred2))
print(classification_report(y_test2, y_pred2, zero_division=0))

# Persist the raw-label <-> encoded-index mapping for use at prediction time
joblib.dump(le, 'task2_label_encoder.joblib')

Task 2 Raw Labeled Values: [0 2 3]
Task 2 Encoded Tagged Values: [0 1 2]
[Task 2 - XGBoost]
accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       1.00      1.00      1.00         5
           2       0.90      0.90      0.90        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36



['task2_label_encoder.joblib']

In [8]:
# Task 3: Bubble position
features_df["label"] = features_df["Case#"].map(lambda case_id: label_dict[case_id]["task3"])
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

# Stratified 80/20 hold-out split
X_train3, X_test3, y_train3, y_test3 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The classifier is configured for 9 classes
xgb3 = XGBClassifier(eval_metric="mlogloss", num_class=9)
xgb3.fit(X_train3, y_train3)

# Predict once and reuse the result for both reports
y_pred3 = xgb3.predict(X_test3)
print("[Task 3 - XGBoost]")
print("accuracy:", accuracy_score(y_test3, y_pred3))
print(classification_report(y_test3, y_pred3, zero_division=0))


[Task 3 - XGBoost]
accuracy: 0.9722222222222222
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       0.00      0.00      0.00         0
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       0.00      0.00      0.00         1

    accuracy                           0.97        36
   macro avg       0.71      0.71      0.71        36
weighted avg       0.97      0.97      0.97        36



In [9]:
# Task 4: Faulty valves
features_df["label"] = features_df["Case#"].map(lambda case_id: label_dict[case_id]["task4"])
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

# Stratified 80/20 hold-out split
X_train4, X_test4, y_train4, y_test4 = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# The classifier is configured for 5 classes
xgb4 = XGBClassifier(eval_metric="mlogloss", num_class=5)
xgb4.fit(X_train4, y_train4)

# Predict once and reuse the result for both reports
y_pred4 = xgb4.predict(X_test4)
print("[Task 4 - XGBoost]")
print("accuracy:", accuracy_score(y_test4, y_pred4))
print(classification_report(y_test4, y_pred4, zero_division=0))


[Task 4 - XGBoost]
accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         2
           3       1.00      0.67      0.80         3
           4       1.00      0.50      0.67         2

    accuracy                           0.94        36
   macro avg       0.99      0.83      0.89        36
weighted avg       0.95      0.94      0.94        36



In [11]:
# 3. Training regression models for task 5
print("\nThe regression model is being trained...")

# Stays None when there are no valve-fault cases, so that later cells can test
# `xgb_regressor is not None` without running into a NameError.
xgb_regressor = None

# Attach the task-2 / task-5 labels to a copy rather than to features_df
# itself: the classifier cells build X by dropping only "Case#" and "label",
# so label columns added to features_df would become input features if one of
# those cells were re-run afterwards.
regression_df = features_df.assign(
    task2=[label_dict[cid]["task2"] for cid in features_df["Case#"]],
    task5=[label_dict[cid]["task5"] for cid in features_df["Case#"]],
)

# Training valve opening regression models (using only valve failure cases,
# i.e. raw task-2 label 3)
sv_fault_df = regression_df[regression_df["task2"] == 3].copy()

if len(sv_fault_df) > 0:
    # "label" is whatever classification target was written last; it is not a
    # feature. errors="ignore" lets the cell run even if it is absent.
    X_sv = sv_fault_df.drop(columns=["Case#", "task2", "task5", "label"], errors="ignore")
    y_sv = sv_fault_df["task5"].values

    # Training-Test Set Splitting
    X_train5, X_test5, y_train5, y_test5 = train_test_split(
        X_sv, y_sv, test_size=0.2, random_state=42
    )

    # Using the XGBoost regression model
    xgb_regressor = XGBRegressor(
        objective='reg:squarederror',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=4
    )

    xgb_regressor.fit(X_train5, y_train5)

    # Evaluate on the held-out valve-fault cases
    y_pred5 = xgb_regressor.predict(X_test5)

    # Calculation of assessment indicators
    mse = mean_squared_error(y_test5, y_pred5)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test5, y_pred5)
    r2 = r2_score(y_test5, y_pred5)

    print("[Task 5 - XGBoost regression model]")
    print(f" (MSE): {mse:.4f}")
    print(f" (RMSE): {rmse:.4f}")
    print(f" (MAE): {mae:.4f}")
    print(f" (R²): {r2:.4f}")
    # Training targets, held-out targets and held-out predictions, for inspection
    print(y_train5)
    print(y_test5)
    print(y_pred5)


The regression model is being trained...
[Task 5 - XGBoost regression model]
 (MSE): 48.3476
 (RMSE): 6.9532
 (MAE): 4.5617
 (R²): 0.9520
[ 0. 75. 50. 75. 25. 25. 25. 25. 75. 25.  0. 25.  0. 50.  0. 75. 50. 25.
 75. 50. 25.  0. 25. 50.  0. 75. 75. 25. 50. 50. 50. 75.  0. 75. 50. 50.
  0. 50.]
[75.  0. 50. 75.  0. 25.  0. 75.  0. 25.]
[ 6.3326771e+01  1.2763156e+00  5.1074196e+01  7.5010635e+01
  1.1311181e+00  3.1071983e+01 -1.9518977e-02  6.0595741e+01
 -1.5686054e-02  3.4939659e+01]


In [12]:
# Persist the task-5 regressor, if one was trained
if xgb_regressor is not None:
    joblib.dump(xgb_regressor, 'xgb_valve_opening_regressor.joblib')
    print("Task 5 regression models have been saved as 'xgb_valve_opening_regressor.joblib'")
else:
    # Message text fixed: a stray non-English character trailed the sentence
    print("There is not enough valve failure data to train the regression model")
    

Task 5 regression models have been saved as 'xgb_valve_opening_regressor.joblib'


In [13]:
# 5. Reading and processing test data sets
print("\nTesting data being loaded and processed...")
test_df = pd.read_csv("merged_test_dataset.csv")

# Recode whichever bubble indicator columns the test file contains
# from "Yes"/"No" to 1/0; columns that are absent are skipped.
yes_no_to_int = {"No": 0, "Yes": 1}
bubble_columns = [f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]
present_bubble_columns = [name for name in bubble_columns if name in test_df.columns]
for col in present_bubble_columns:
    test_df[col] = test_df[col].replace(yes_no_to_int).astype(int)

# Per-case feature table for the test set
features_df_test = extract_statistical_features(test_df)

# Feature matrix for the models, plus the case ids in the same row order
X_test = features_df_test.drop(columns=["Case#"])
cid_list = features_df_test["Case#"].values


Testing data being loaded and processed...


In [14]:
# 6. Prediction on the test set

# Class predictions for tasks 1-4, one entry per row of X_test
pred_task1 = xgb1.predict(X_test)
pred_task2 = xgb2.predict(X_test)
pred_task3 = xgb3.predict(X_test)
pred_task4 = xgb4.predict(X_test)

# xgb2 was trained on LabelEncoder indices, not on the raw task-2 labels.
# Look up which index stands for "bubble fault" (raw label 2) and "valve
# fault" (raw label 3) instead of hard-coding 1 and 2, so the branches below
# stay correct if the set of raw labels seen in training changes.
# A label that never occurred in training maps to None and never matches.
task2_code = {int(raw): code for code, raw in enumerate(le.classes_)}
bubble_code = task2_code.get(2)
valve_code = task2_code.get(3)
# NOTE(review): the "Task2" value stored below is the encoder index, not the
# raw label (le.inverse_transform would give the raw one) - confirm which of
# the two the consumer of the CSV expects.

# Preparing a list for storing the final prediction results
results = []

# cid_list and X_test come from the same feature table, so position i in every
# prediction array belongs to cid_list[i].
for i, cid in enumerate(cid_list):
    t1 = int(pred_task1[i])
    t2 = int(pred_task2[i])
    t3 = int(pred_task3[i])
    t4 = int(pred_task4[i])
    t5 = 100.0

    # Adjustment of forecasts based on logical relationships between tasks
    if t1 == 0:  # normal condition: every other task is forced to "no fault"
        t2, t3, t4, t5 = 0, 0, 0, 100.0
    else:  # abnormal state
        if t2 == bubble_code:  # bubble fault
            # Keep the predicted bubble position; valve outputs are set to normal
            t4 = 0
            t5 = 100.0
        elif t2 == valve_code:  # valve fault
            t3 = 0
            # Predicting the valve opening using the regression model from Task 5
            try:
                valve_features = X_test.iloc[i:i+1]
                t5 = float(xgb_regressor.predict(valve_features)[0])
                # Clamp the prediction into [0, 99.9]
                t5 = max(0.0, min(t5, 99.9))
            except Exception as e:
                # Best effort: report the failure and fall back to a mid-range
                # opening instead of aborting the whole run.
                print(f"Case {cid} Valve Opening Prediction Failed: {str(e)}，Use default values")
                t5 = 50.0  # If the prediction fails, use the default value
        else:  # Other faults: no bubble position, no valve, default opening
            t3 = 0
            t4 = 0
            t5 = 100.0

    # One result row per test case
    results.append({
        "Case#": cid,
        "Task1": t1,
        "Task2": t2,
        "Task3": t3,
        "Task4": t4,
        "Task5": t5
    })

# Convert results to DataFrame
result_df = pd.DataFrame(results)

# 7. Display and save forecast results
print(f"\nTotal predicted {len(result_df)} test cases")
print("\nPreview of the test set prediction results:")
print(result_df.head(46))

# Save forecast results
result_df.to_csv("xgb_complete_test_predictions.csv", index=False)
print("All predictions have been saved to  'xgb_complete_test_predictions.csv'")


Total predicted 46 test cases

Preview of the test set prediction results:
    Case#  Task1  Task2  Task3  Task4       Task5
0     178      1      1      2      0  100.000000
1     179      1      2      0      2   25.016947
2     180      0      0      0      0  100.000000
3     181      0      0      0      0  100.000000
4     182      0      0      0      0  100.000000
5     183      0      0      0      0  100.000000
6     184      1      2      0      1    4.428387
7     185      0      0      0      0  100.000000
8     186      1      1      6      0  100.000000
9     187      0      0      0      0  100.000000
10    188      1      1      3      0  100.000000
11    189      0      0      0      0  100.000000
12    190      1      2      0      3   50.733398
13    191      0      0      0      0  100.000000
14    192      1      2      0      1    4.502021
15    193      1      1      1      0  100.000000
16    194      0      0      0      0  100.000000
17    195      0      0 

In [15]:
# 8. Analysis of projected results
# NOTE(review): the captions below say "Task2=2" / "Task2=3" while the filters
# test for 1 and 2 - presumably the captions quote raw labels and the filters
# use the encoded values stored in result_df; confirm.
task1_values = result_df['Task1']
task2_values = result_df['Task2']

print("\nStatistics on projected results:")
print(f"Total cases: {len(result_df)}")
print(f"Normal cases (Task1=0): {(task1_values == 0).sum()}")
print(f"Abnormal Cases (Task1=1): {(task1_values == 1).sum()}")
print(f"where the air bubble malfunctions (Task2=2): {(task2_values == 1).sum()}")
print(f"where the valve fails (Task2=3): {(task2_values == 2).sum()}")


Statistics on projected results:
Total cases: 46
Normal cases (Task1=0): 23
Abnormal Cases (Task1=1): 23
where the air bubble malfunctions (Task2=2): 11
where the valve fails (Task2=3): 12


In [17]:
# Opening-ratio analysis for the cases predicted as valve failures
# (rows of result_df whose Task2 value is 2)
valve_fault_cases = result_df[result_df['Task2'] == 2]

if valve_fault_cases.empty:
    print("\nValve failure cases not detected in test set")
else:
    predicted_opening = valve_fault_cases['Task5']
    print("\nValve Failure Openings Prediction Analysis:")
    print(f"Number of valve failure cases: {len(valve_fault_cases)}")
    print(f"Average predicted opening: {predicted_opening.mean():.2f}%")
    print(f"Minimum predicted opening: {predicted_opening.min():.2f}%")
    print(f"Maximum predicted opening: {predicted_opening.max():.2f}%")

    # How many of those cases were attributed to each valve (1..4);
    # valves with no cases are reported as 0.
    valve_counts = valve_fault_cases['Task4'].value_counts()
    for valve_id in (1, 2, 3, 4):
        print(f"valves {valve_id}: {valve_counts.get(valve_id, 0)}cases")

print("\nForecasting and analysis completed!")


Valve Failure Openings Prediction Analysis:
Number of valve failure cases: 12
Average predicted opening: 28.22%
Minimum predicted opening: 4.36%
Maximum predicted opening: 73.60%
valves 1: 6cases
valves 2: 3cases
valves 3: 2cases
valves 4: 1cases

Forecasting and analysis completed!
