In [27]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# 读取 Excel 文件
excel_file = 'data/train.xlsx'

# 使用 pandas read_excel 函数读取所有工作表
xlsx = pd.ExcelFile(excel_file)

# 存储所有工作表的数据
data_frames = []

# 遍历所有工作表
for sheet_name in xlsx.sheet_names:
    data = pd.read_excel(xlsx, sheet_name=sheet_name)
    data_frames.append(data)

# 合并所有工作表的数据
X_raw = pd.concat(data_frames, ignore_index=True)

# 确保所有列名都是字符串类型
X_raw.columns = X_raw.columns.astype(str)

# 数据预处理：提取磁通密度（1024个采样点），并进行标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw.iloc[:, 4:])  # 假设磁通密度数据从第五列开始

# 提取波形标签
y = X_raw['励磁波形']


In [38]:
from scipy.fft import fft
import numpy as np

# 时域特征提取函数
def extract_time_domain_features(X): 
    features = pd.DataFrame() 
    features['max'] = np.max(X, axis=1) 
    features['min'] = np.min(X, axis=1) 
    features['mean'] = np.mean(X, axis=1) 
    features['std'] = np.std(X, axis=1)
    features['peak_to_peak'] = np.max(X, axis=1) - np.min(X, axis=1) 
    return features

# 频域特征提取函数
def extract_frequency_domain_features(X):
    features = pd.DataFrame()
    # 对每一行进行傅里叶变换并提取频域特征
    fft_feature = fft(X, axis=1)
    features['fft_max_freq'] = np.max(np.abs(fft_feature), axis=1)
    features['fft_energy'] = np.sum(np.abs(fft_feature)**2, axis=1)
    return features

# 提取特征
time_features = extract_time_domain_features(X_scaled)
freq_features = extract_time_domain_features(X_scaled)

# 合并所有特征
X_features = pd.concat([time_features, freq_features], axis=1)

# X_scaled = X_scaled[:, :]
print(X_features.shape)
print(X_scaled.shape)

(12400, 10)
(12400, 1023)


In [29]:
from sklearn.feature_selection import SelectKBest, f_classif

selector = SelectKBest(f_classif, k=10)
X_selected = selector.fit_transform(X_features, y)

selected_features = X_features.columns[selector.get_support()]
print(f"Selected features: {selected_features}")


ValueError: Input X contains NaN.
SelectKBest does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# 将数据分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# 标签编码
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# 将Pandas DataFrame转换为DMatrix，这是XGBoost的专用数据结构
dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
dtest = xgb.DMatrix(X_test, label=y_test_encoded)

# 设置XGBoost参数
param = {
    'max_depth': 3,  # 树的最大深度
    'eta': 0.3,  # 学习率
    'objective': 'multi:softmax',  # 多分类问题
    'num_class': 3,  # 类别数量，根据你的问题调整
    'eval_metric': 'mlogloss'  # 多分类的对数损失
}

# 训练模型
num_round = 100  # 训练轮数
bst = xgb.train(param, dtrain, num_round)

# 预测
preds = bst.predict(dtest)

# 评估模型
accuracy = accuracy_score(y_test_encoded, preds)
print(f"Accuracy: {accuracy}")

# K折交叉验证

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# 假设你的数据已经加载到Pandas DataFrame中，X_scaled 是特征数据，y 是标签（励磁波形）

# 标签编码
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # 整体编码

# 将Pandas DataFrame转换为DMatrix，这是XGBoost的专用数据结构
dtrain = xgb.DMatrix(X_scaled, label=y_encoded)

# 设置XGBoost参数
param = {
    'max_depth': 3,  # 树的最大深度
    'eta': 0.3,  # 学习率
    'objective': 'multi:softmax',  # 多分类问题
    'num_class': 3,  # 类别数量，根据你的问题调整
    'eval_metric': 'mlogloss'  # 多分类的对数损失
}

# 设置K折交叉验证参数
num_round = 100  # 训练轮数
nfold = 5  # K折交叉验证中的K值
early_stopping_rounds = 10  # 提前停止

# 使用交叉验证
cv_results = xgb.cv(
    param,
    dtrain,
    num_boost_round=num_round,
    nfold=nfold,
    metrics={'mlogloss'},
    early_stopping_rounds=early_stopping_rounds,
    seed=42
)

# 打印交叉验证的结果
print(cv_results)

# 获取最佳轮数
best_num_rounds = cv_results.shape[0]
print(f"Best number of rounds: {best_num_rounds}")

# 使用最佳轮数重新训练模型
bst = xgb.train(param, dtrain, best_num_rounds)

# 如果要评估模型的表现，可以再次预测并评估在整个数据集上的结果
preds = bst.predict(dtrain)
accuracy = accuracy_score(y_encoded, preds)
print(f"Accuracy: {accuracy}")


# 混淆矩阵绘制

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
import matplotlib.pyplot as plt

# 假设你的数据已经加载到Pandas DataFrame中，X_scaled 是特征数据，y 是标签（励磁波形）

# 标签编码
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 将数据集划分为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# 将训练和测试数据转换为XGBoost的DMatrix格式
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# 设置XGBoost参数
param = {
    'max_depth': 3,
    'eta': 0.3,
    'objective': 'multi:softmax',
    'num_class': 3,  # 根据类别数量调整
    'eval_metric': 'mlogloss'
}

# 训练模型
num_round = 100
bst = xgb.train(param, dtrain, num_round)

# 使用测试集进行预测
y_pred = bst.predict(dtest)

# 计算混淆矩阵
cm = confusion_matrix(y_test, y_pred)

# 绘制混淆矩阵
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.show()

# 计算并打印准确率
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


In [None]:
# 查看编码后的0, 1, 2对应的原始标签
label_mapping = dict(enumerate(label_encoder.classes_))
print(label_mapping)

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# 加载新的数据文件
new_data = pd.read_csv('data/附件二（测试集）.csv')  # 请替换成你的新文件路径

# 过滤其它
filtered_data = new_data[new_data['磁芯材料'] == '材料1']


# 对新数据进行相同的预处理（假设我们只需要磁通密度的相关特征）
X_new = filtered_data.iloc[:, 4:]  # 根据需要调整列的范围

# 将新数据转换为DMatrix格式
dnew = xgb.DMatrix(X_new)

# 使用已经训练好的模型进行预测
y_new_pred = bst.predict(dnew)

# 检查预测结果长度和过滤后的数据长度是否一致
if len(y_new_pred) != len(filtered_data):
    raise ValueError(f"预测结果长度 ({len(y_new_pred)}) 与数据长度 ({len(filtered_data)}) 不一致")

# 将预测结果解码回原始标签
y_new_pred_labels = label_encoder.inverse_transform(y_new_pred.astype(int))

# 将预测结果添加到新数据中并保存
filtered_data['label'] = y_new_pred_labels
filtered_data.to_csv('new_data_with_predictions.csv', index=False)

print("预测完成，结果已保存到 'new_data_with_predictions.csv'")
