In [1]:
from data_loader import DataLoader
from feature_extractor import FeatureExtractor
import pandas as pd 
import numpy as np
import scipy as sp
import time
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import  KFold,cross_val_score,cross_validate

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_sample_weight


class ModelTrainer:
    def __init__(self):
        """
        初始化模型训练器（XGBoost专用版）
        """
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoder = LabelEncoder()  # 新增标签编码器
        self.classes_ = None  # 保存原始类别标签

    def prepare_data(self, features, labels):
        """
        准备训练数据（集成标签编码）
        """
        # 编码标签为0开始的整数
        y_encoded = self.label_encoder.fit_transform(labels)
        self.classes_ = self.label_encoder.classes_  # 保存原始标签

        # 划分训练集和测试集（分层抽样使用编码后标签）
        X_train, X_test, y_train, y_test = train_test_split(
            features, y_encoded,
            test_size=0.2,
            random_state=42,
            stratify=y_encoded
        )
        # 标准化特征
        X_train = self.scaler.fit_transform(X_train)
        X_test = self.scaler.transform(X_test)

        return X_train, X_test, y_train, y_test

    def build_model(self):
        """
        构建优化版XGBoost模型（新增早停相关参数）
        """
        self.model = XGBClassifier(
            n_estimators=1000,  # 设置更大的树数量以支持早停
            max_depth=5,  # 降低树深度
            learning_rate=0.1,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.2,
            reg_lambda=1.0,
            objective='multi:softmax',  # 使用softmax更直观
            num_class=len(self.classes_),
            tree_method='hist',
            eval_metric='mlogloss',  # 明确评估指标
            early_stopping_rounds=50,  # 新增早停参数
            n_jobs=-1,
            random_state=42,
        )

    def train(self, X_train, y_train):
        """
        训练XGBoost模型（集成早停和评估）
        """
        # 分割验证集
        X_train, X_val, y_train, y_val = train_test_split(
            X_train, y_train,
            test_size=0.1,
            random_state=42
        )

        # 计算样本权重
        sample_weights = compute_sample_weight('balanced', y_train)

        # 训练模型（带早停）
        self.model.fit(
            X_train, y_train,
            sample_weight=sample_weights,
            eval_set=[(X_val, y_val)],
            verbose=50  # 每50轮打印进度
        )

        # 训练结果分析
        print("\nBest iteration:", self.model.best_iteration)
        print("Best validation score: {:.3f}".format(self.model.best_score))



    def evaluate(self, X_test, y_test):
        """
        评估模型（支持原始标签显示）
        """
        # 转换预测结果到原始标签
        y_pred = self.model.predict(X_test)
        y_test_decoded = self.label_encoder.inverse_transform(y_test)
        y_pred_decoded = self.label_encoder.inverse_transform(y_pred)

        # 生成分类报告
        class_report = classification_report(
            y_test_decoded, y_pred_decoded,
            target_names=self.classes_.astype(str)
        )
        print("\nClassification Report:")
        print(class_report)

        # 绘制混淆矩阵（原始标签）
        conf_matrix = confusion_matrix(y_test_decoded, y_pred_decoded)
        plt.figure(figsize=(12, 10))
        sns.heatmap(
            conf_matrix,
            annot=True,
            fmt='d',
            cmap='Blues',
            xticklabels=self.classes_,
            yticklabels=self.classes_
        )
        plt.title('Confusion Matrix - XGBoost')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.savefig('xgboost_confusion_matrix.png')
        plt.close()

        # 特征重要性分析
        feature_importance = pd.DataFrame({
            'feature': [f'feature_{i}' for i in range(X_test.shape[1])],
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)
        
        plt.figure(figsize=(12, 6))
        sns.barplot(x='importance', y='feature', data=feature_importance.head(15))
        plt.title('Top 15 Important Features (XGBoost)')
        plt.savefig('xgboost_feature_importance.png')
        plt.close()

        return class_report

    def save_model(self, model_path='xgboost_model.pkl',
                   scaler_path='scaler.pkl',
                   encoder_path='label_encoder.pkl'):
        """
        保存完整模型体系
        """
        joblib.dump(self.model, model_path)
        joblib.dump(self.scaler, scaler_path)
        joblib.dump(self.label_encoder, encoder_path)
        print(f"Model artifacts saved to {model_path}, {scaler_path}, {encoder_path}")

    def load_model(self, model_path='xgboost_model.pkl',
                   scaler_path='scaler_XGboost.pkl',
                   encoder_path='label_encoder_XGboost.pkl'):
        """
        加载完整模型体系
        """
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)
        self.label_encoder = joblib.load(encoder_path)
        self.classes_ = self.label_encoder.classes_

In [3]:
# 初始化数据加载器
data_loader = DataLoader(
    base_dir='activity_segments',
    activity_file='TrainActivities.csv'
)

# 加载数据
print("Loading accelerometer data...")
data = data_loader.load_data()

# 特征提取
print("\nExtracting features...")
feature_extractor = FeatureExtractor(window_size=64, overlap=0.5)
segments, labels = feature_extractor.segment_data(data)

print(f"\nFeature extraction:")
# 从所有段中提取特征
features = []
for i, segment in enumerate(segments):
    if i % 100 == 0:  # 每处理100个片段打印一次进度
        print(f"Processing segment {i+1}/{len(segments)}")
    segment_features = feature_extractor.extract_features(segment)
    features.append(list(segment_features.values()))

# 转换为numpy数组
features = np.array(features)
print(f"\nFeatures shape: {features.shape}")


Valid activity types from TrainActivities.csv:
ID 2806: 1 (FACING camera) Sit and stand
ID 2807: 2 (FACING camera) both hands SHAKING (sitting position)
ID 2808: 3 Stand up from chair - both hands with SHAKING
ID 2809: 4 (Sideway) Sit & stand
ID 2810: 5 (Sideway) both hands SHAKING (sitting)
ID 2811: 6 (Sideway) STAND up with - both hands SHAKING
ID 2812: 7 Cool down - sitting/relax
ID 2813: 8 Walk (LEFT --> Right --> Left)
ID 2814: 9 Walk & STOP/frozen, full body shaking, rotate then return back
ID 2815: 10 Slow walk (SHAKING hands/body, tiny step, head forward)
Loading accelerometer data...

Found 75 CSV files in valid activity folders

Processing activity type 2806...
Processed 10/75 files

Processing activity type 2807...

Processing activity type 2808...
Processed 20/75 files

Processing activity type 2809...

Processing activity type 2810...
Processed 30/75 files

Processing activity type 2811...

Processing activity type 2812...
Processed 40/75 files

Processing activity type 2

In [4]:
# 训练XGBoost模型
model_trainer = ModelTrainer()
X_train, X_test, y_train, y_test = model_trainer.prepare_data(features, labels)

# 记录训练开始时间
start_time = time.time()

model_trainer.build_model()
model_trainer.train(X_train, y_train)

[0]	validation_0-mlogloss:2.20054
[50]	validation_0-mlogloss:1.21848
[100]	validation_0-mlogloss:1.09568
[150]	validation_0-mlogloss:1.04707
[200]	validation_0-mlogloss:1.02348
[250]	validation_0-mlogloss:1.01727
[300]	validation_0-mlogloss:1.01113
[350]	validation_0-mlogloss:1.00982
[368]	validation_0-mlogloss:1.01376

Best iteration: 318
Best validation score: 1.007


In [5]:
# 计算训练时间
training_time = time.time() - start_time
print(f"\nTraining Time: {training_time:.2f} seconds")

# 评估模型
print("\nModel Evaluation Results:")
evaluation_results = model_trainer.evaluate(X_test, y_test)

# 保存模型和评估结果
print("\nSaving model and results...")
model_trainer.save_model('saved_model.pkl', 'saved_scaler.pkl')

# 保存结果到文件
with open('model_results.txt', 'w') as f:
    f.write("Random Forest Model Results\n")
    f.write("==========================\n\n")
    f.write(f"Training Time: {training_time:.2f} seconds\n\n")
    f.write("Evaluation Results:\n")
    f.write(evaluation_results)



Training Time: 6.04 seconds

Model Evaluation Results:

Classification Report:
              precision    recall  f1-score   support

        2806       0.62      0.71      0.66       136
        2807       0.63      0.58      0.60        83
        2808       0.65      0.51      0.57        59
        2809       0.50      0.50      0.50        10
        2810       0.69      0.65      0.67        37
        2811       0.71      0.47      0.57        57
        2812       0.59      0.57      0.58        72
        2813       0.59      0.60      0.60        60
        2814       0.58      0.68      0.63       177
        2815       0.69      0.67      0.68       173

    accuracy                           0.63       864
   macro avg       0.63      0.59      0.61       864
weighted avg       0.63      0.63      0.63       864


Saving model and results...
Model artifacts saved to saved_model.pkl, saved_scaler.pkl, label_encoder.pkl


In [48]:
# 学习曲线
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import  KFold, GridSearchCV,HalvingGridSearchCV, cross_validate, RandomizedSearchCV
from sklearn.experimental import enable_halving_search_cv

#划分训练集和测试集
X_train, X_test, y_train, y_test = model_trainer.prepare_data(features, labels)

label_encoder = LabelEncoder()  # 新增标签编码器
# 编码标签为0开始的整数
y_encoded = label_encoder.fit_transform(labels)
classes_ = label_encoder.classes_  # 保存原始标签


In [None]:
#以学习率为例子的学习曲线图像
tr = []
te = []

# for i in np.arange(0,1,0.25):
xgb1 = XGBClassifier(n_estimators=1000,  # 设置更大的树数量以支持早停
                    max_depth=5,  # 降低树深度
                    learning_rate=0.1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    reg_alpha=0.2,
                    reg_lambda=1.0,
                    objective='multi:softmax',  # 使用softmax更直观
                    num_class=9,
                    tree_method='hist',
                    eval_metric='mlogloss',  # 明确评估指标
                    # early_stopping_rounds=50,  # 新增早停参数
                    n_jobs=-1,
                    random_state=42,
                    )
core_te = cross_val_score(xgb1,features,y_encoded,cv=4).mean() # 4折交叉验证的准确率

# tr.append(score_tr)
# te.append(score_te)
# plt.plot(np.arange(0,1,0.1),tr,color="red",label="train")
# plt.plot(np.arange(0,1,0.1),te,color="blue",label="test")
# plt.xticks(np.arange(0,1,0.1))
# plt.legend()
# plt.show()


In [57]:
core_te

np.float64(0.2704480103696229)

In [47]:
print(tr)
print(te)

[1.0]
[np.float64(0.2636062664294138)]


array([[-0.58197031, -0.76946175, -0.9407475 , ..., -0.02736846,
        -0.44712789, -0.96073682],
       [-0.49687715, -0.31147539, -0.51269575, ...,  1.57349504,
         0.79219401,  2.52492714],
       [ 0.21717143,  0.13562872,  0.09257455, ...,  0.19386117,
        -0.34626803, -0.08932083],
       ...,
       [ 0.92274524,  1.59025409,  1.78852973, ..., -0.55054167,
        -0.56478409,  0.25924557],
       [ 1.45187823,  1.45905125,  1.86388074, ..., -0.09235759,
        -0.65049665,  0.43352877],
       [-0.40342184,  1.85768806,  0.91372136, ...,  0.08082716,
        -0.48166938,  0.43352877]], shape=(790, 33))