In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [14]:
df = pd.read_csv("merged_dataset.csv")

pd.set_option('future.no_silent_downcasting', True)
for col in [f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]:
    df[col] = df[col].replace({"No": 0, "Yes": 1}).astype(int)

In [15]:
def extract_statistical_features(df):
    features = []
    grouped = df.groupby("Case#")
    
    for cid, group in grouped:
        row = {"Case#": cid}
        for p in [f"P{i}" for i in range(1, 8)]:
            row[f"{p}_mean"] = group[p].mean()
            row[f"{p}_std"] = group[p].std()
            row[f"{p}_min"] = group[p].min()
            row[f"{p}_max"] = group[p].max()
        row["Spacecraft#"] = group["Spacecraft#"].iloc[0]
        features.append(row)

    return pd.DataFrame(features)

In [16]:
def extract_labels(df):
    case_ids = df["Case#"].unique()
    label_dict = {}
    for cid in case_ids:
        d = df[df["Case#"] == cid].iloc[0]
        task1 = 0 if d["Condition"].lower() == "normal" else 1

        if task1 == 0:
            task2, task3, task4, task5 = 0, 0, 0, 100.0
        else:
            if d[[f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]].sum() > 0:
                task2 = 2
                task3 = np.argmax(d[[f"Bubble_BP{i}" for i in range(1, 8)] + ["Bubble_BV1"]].values) + 1
                task4 = 0
                task5 = 100.0
            elif d[[f"Opening_Ratio_SV{i}" for i in range(1, 5)]].min() < 100:
                task2 = 3
                sv_values = [d[f"Opening_Ratio_SV{i}"] for i in range(1, 5)]
                task4 = np.argmin(sv_values) + 1
                task5 = float(min(sv_values))
            else:
                task2, task3, task4, task5 = 1, 0, 0, 100.0

        label_dict[cid] = {
            "task1": task1,
            "task2": task2,
            "task3": task3,
            "task4": task4,
            "task5": task5
        }
    return label_dict

label_dict = extract_labels(df)
features_df = extract_statistical_features(df)

In [17]:
features_df["label"] = [label_dict[cid]["task1"] for cid in features_df["Case#"]]
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

svm1 = SVC(kernel="rbf", C=1.0)
svm1.fit(X_train, y_train)
print("[Task 1 - SVM]")
print("Accuracy:", accuracy_score(y_test, svm1.predict(X_test)))
print(classification_report(y_test, svm1.predict(X_test), zero_division=0))

xgb1 = XGBClassifier(eval_metric="logloss")
xgb1.fit(X_train, y_train)
print("[Task 1 - XGBoost]")
print("Accuracy:", accuracy_score(y_test, xgb1.predict(X_test)))
print(classification_report(y_test, xgb1.predict(X_test), zero_division=0))

[Task 1 - SVM]
Accuracy: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.58      1.00      0.74        21
           1       0.00      0.00      0.00        15

    accuracy                           0.58        36
   macro avg       0.29      0.50      0.37        36
weighted avg       0.34      0.58      0.43        36

[Task 1 - XGBoost]
Accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           0       0.91      1.00      0.95        21
           1       1.00      0.87      0.93        15

    accuracy                           0.94        36
   macro avg       0.96      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



In [18]:
from sklearn.preprocessing import LabelEncoder

features_df["label"] = [label_dict[cid]["task2"] for cid in features_df["Case#"]]
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# SVM
svm2 = SVC(kernel="rbf", C=1.0)
svm2.fit(X_train, y_train)
print("[Task 2 - SVM]")
print("Accuracy:", accuracy_score(y_test, svm2.predict(X_test)))
print(classification_report(y_test, svm2.predict(X_test), zero_division=0))

# XGBoost
xgb2 = XGBClassifier(eval_metric="mlogloss")
xgb2.fit(X_train, y_train)
print("[Task 2 - XGBoost]")
print("Accuracy:", accuracy_score(y_test, xgb2.predict(X_test)))
print(classification_report(y_test, xgb2.predict(X_test), zero_division=0))

[Task 2 - SVM]
Accuracy: 0.5833333333333334
              precision    recall  f1-score   support

           0       0.58      1.00      0.74        21
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00        10

    accuracy                           0.58        36
   macro avg       0.19      0.33      0.25        36
weighted avg       0.34      0.58      0.43        36

[Task 2 - XGBoost]
Accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        21
           1       1.00      1.00      1.00         5
           2       0.90      0.90      0.90        10

    accuracy                           0.94        36
   macro avg       0.95      0.95      0.95        36
weighted avg       0.94      0.94      0.94        36



In [19]:
features_df["label"] = [label_dict[cid]["task3"] for cid in features_df["Case#"]]
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

svm3 = SVC(kernel="rbf", C=1.0)
svm3.fit(X_train, y_train)
print("[Task 3 - SVM]")
print("Accuracy:", accuracy_score(y_test, svm3.predict(X_test)))
print(classification_report(y_test, svm3.predict(X_test), zero_division=0))

xgb3 = XGBClassifier(eval_metric="mlogloss", num_class=9)
xgb3.fit(X_train, y_train)
print("[Task 3 - XGBoost]")
print("Accuracy:", accuracy_score(y_test, xgb3.predict(X_test)))
print(classification_report(y_test, xgb3.predict(X_test), zero_division=0))


[Task 3 - SVM]
Accuracy: 0.8611111111111112
              precision    recall  f1-score   support

           0       0.86      1.00      0.93        31
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1

    accuracy                           0.86        36
   macro avg       0.14      0.17      0.15        36
weighted avg       0.74      0.86      0.80        36

[Task 3 - XGBoost]
Accuracy: 0.9722222222222222
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       0.00      0.00      0.00         0
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5       1.00      1.00      1.00         1
           7       1.00      1.00      1

In [20]:
features_df["label"] = [label_dict[cid]["task4"] for cid in features_df["Case#"]]
X = features_df.drop(columns=["Case#", "label"])
y = features_df["label"].astype(int).values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

svm4 = SVC(kernel="rbf", C=1.0)
svm4.fit(X_train, y_train)
print("[Task 4 - SVM]")
print("Accuracy:", accuracy_score(y_test, svm4.predict(X_test)))
print(classification_report(y_test, svm4.predict(X_test), zero_division=0))

xgb4 = XGBClassifier(eval_metric="mlogloss", num_class=5)
xgb4.fit(X_train, y_train)
print("[Task 4 - XGBoost]")
print("Accuracy:", accuracy_score(y_test, xgb4.predict(X_test)))
print(classification_report(y_test, xgb4.predict(X_test), zero_division=0))


[Task 4 - SVM]
Accuracy: 0.7222222222222222
              precision    recall  f1-score   support

           0       0.72      1.00      0.84        26
           1       0.00      0.00      0.00         3
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         2

    accuracy                           0.72        36
   macro avg       0.14      0.20      0.17        36
weighted avg       0.52      0.72      0.61        36

[Task 4 - XGBoost]
Accuracy: 0.9444444444444444
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        26
           1       1.00      1.00      1.00         3
           2       1.00      1.00      1.00         2
           3       1.00      0.67      0.80         3
           4       1.00      0.50      0.67         2

    accuracy                           0.94        36
   macro avg       0.99      0.83      

## **Test set**

In [21]:
test_df = pd.read_csv("merged_test_dataset.csv")

In [22]:
test_df.head()

Unnamed: 0,TIME,P1,P2,P3,P4,P5,P6,P7,Case#,Spacecraft#
0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,178,1
1,0.001,2.0,2.0,2.0,2.0,2.0,2.0,2.0,178,1
2,0.002,2.0,2.0,2.0,2.0,2.0,2.0,2.0,178,1
3,0.003,2.0,2.0,2.0,2.0,2.0,2.0,2.0,178,1
4,0.004,2.0,2.0,2.0,2.0,2.0,2.0,2.0,178,1


In [47]:
features_df = extract_statistical_features(test_df)

cid_list = features_df["Case#"].values
X_test = features_df.drop(columns=["Case#"])
results = []

# 提前统一预测
pred_task1 = xgb1.predict(X_test)
pred_task2 = xgb2.predict(X_test)
pred_task3 = xgb3.predict(X_test)
pred_task4 = xgb4.predict(X_test)

for i, cid in enumerate(cid_list):
    t1 = int(pred_task1[i])
    t2 = int(pred_task2[i])
    t3 = int(pred_task3[i])
    t4 = int(pred_task4[i])

    if t1 == 0:
        result = {
            "Case#": cid,
            "Task1": 0,
            "Task2": 0,
            "Task3": 0,
            "Task4": 0,
            "Task5": 100.0
        }
    elif t2 == 1:  # Bubble
        result = {
            "Case#": cid,
            "Task1": t1,
            "Task2": 2,
            "Task3": t3,
            "Task4": 0,
            "Task5": 100.0
        }
    elif t2 == 2:  # Valve Fault
        result = {
            "Case#": cid,
            "Task1": t1,
            "Task2": 1,
            "Task3": 0,
            "Task4": t4,
            "Task5": 100.0
        }
    else:
        result = {
            "Case#": cid,
            "Task1": t1,
            "Task2": t2,
            "Task3": 0,
            "Task4": 0,
            "Task5": 100.0
        }

    results.append(result)


result_df = pd.DataFrame(results, columns=["Case#", "Task1", "Task2", "Task3", "Task4", "Task5"])
pd.set_option('display.max_rows', None)
result_df = result_df.reset_index(drop=True)
result_df.to_csv("xgb_test_predictions.csv", index=False)
print(result_df)

    Case#  Task1  Task2  Task3  Task4  Task5
0     178      1      2      2      0  100.0
1     179      1      1      0      2  100.0
2     180      0      0      0      0  100.0
3     181      0      0      0      0  100.0
4     182      0      0      0      0  100.0
5     183      0      0      0      0  100.0
6     184      1      1      0      1  100.0
7     185      0      0      0      0  100.0
8     186      1      2      6      0  100.0
9     187      0      0      0      0  100.0
10    188      1      2      3      0  100.0
11    189      0      0      0      0  100.0
12    190      1      1      0      3  100.0
13    191      0      0      0      0  100.0
14    192      1      1      0      1  100.0
15    193      1      2      1      0  100.0
16    194      0      0      0      0  100.0
17    195      0      0      0      0  100.0
18    196      1      2      4      0  100.0
19    197      1      2      7      0  100.0
20    198      0      0      0      0  100.0
21    199 