In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
df = pd.read_excel('FF_2020data.xlsx')
# 划分特征和目标变量
columns_to_drop = ['Unnamed: 0','aqi_CO', 'aqi_FSP', 'aqi_NO2', 'aqi_O3', 'aqi_RSP', 'aqi_SO2', 'aqi']
df= df.drop(columns_to_drop, axis=1)
X = df.drop(['空气质量指数'], axis=1)
y = df['空气质量指数']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=df['空气质量指数'])
df.head()

Unnamed: 0.1,Unnamed: 0,CO,FSP,NO2,O3,RSP,SO2,空气质量指数
0,0,0.4,18,30,47,26,6,2
1,1,0.4,16,36,39,24,7,2
2,2,0.4,14,37,36,21,6,2
3,3,0.3,12,30,43,18,6,2
4,4,0.3,10,32,43,16,6,2


In [6]:
print(df.dtypes)

Unnamed: 0      int64
CO            float64
FSP             int64
NO2             int64
O3              int64
RSP             int64
SO2             int64
空气质量指数          int64
dtype: object


In [8]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score

# XGBoost 模型参数
params_xgb = {
    'booster': 'gbtree',
    'verbosity': 1,
    'seed': 42,
    'nthread': -1,
    'colsample_bytree': 0.6,
    'subsample': 0.7,
    'eval_metric': 'logloss'
}

# 初始化 XGBoost 分类模型
model_xgb = xgb.XGBClassifier(**params_xgb)

# 定义参数网格，调整范围
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.02]
}

# 创建分层交叉验证对象，减少折数
skf = StratifiedKFold(n_splits=3)

# 使用 GridSearchCV 进行网格搜索和分层交叉验证
grid_search = GridSearchCV(
    estimator=model_xgb,
    param_grid=param_grid,
    scoring=['neg_log_loss', 'accuracy', 'recall', 'f1'],
    cv=skf,
    n_jobs=-1,
    verbose=1,
    refit='accuracy'  # 指定准确率作为重新拟合的指标
)

# 假设你已经有了 X_train 和 y_train
# 训练模型
grid_search.fit(X_train, y_train)

# 输出最优参数
print("Best parameters found: ", grid_search.best_params_)
print("Best Log Loss score: ", -grid_search.best_score_)

# 使用最优参数训练模型
best_model = grid_search.best_estimator_

# 预测
y_pred = best_model.predict(X_test)

# 手动处理可能出现的分母为 0 的情况
def custom_precision_score(y_true, y_pred):
    true_positives = ((y_pred == 1) & (y_true == 1)).sum()
    predicted_positives = (y_pred == 1).sum()
    return true_positives / predicted_positives if predicted_positives > 0 else 0

precision = custom_precision_score(y_test, y_pred)
print("Custom Precision Score:", precision)

Fitting 3 folds for each of 18 candidates, totalling 54 fits




Best parameters found:  {'learning_rate': 0.02, 'max_depth': 5, 'n_estimators': 300}
Best Log Loss score:  -0.8977254131893307
Custom Precision Score: 0.9387755102040817


In [9]:
from sklearn.metrics import classification_report
# 预测测试集
y_pred = best_model.predict(X_test)
# 输出模型报告， 查看评价指标
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.94      0.68      0.79        68
           2       0.90      0.93      0.91       442
           3       0.89      0.92      0.90       495
           4       0.90      0.92      0.91       412
           5       0.94      0.77      0.85       110

    accuracy                           0.90      1528
   macro avg       0.76      0.70      0.73      1528
weighted avg       0.90      0.90      0.90      1528



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
from sklearn.metrics import classification_report
# 预测训练集
y_train_pred = best_model.predict(X_train)
# 输出模型报告， 查看评价指标
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       0.98      0.87      0.92       272
           2       0.95      0.97      0.96      1770
           3       0.94      0.96      0.95      1978
           4       0.97      0.96      0.97      1646
           5       1.00      0.96      0.98       440

    accuracy                           0.96      6111
   macro avg       0.97      0.85      0.89      6111
weighted avg       0.96      0.96      0.96      6111



In [11]:
# 评估模型在训练集上的性能
print("训练集准确率：", best_model.score(X_train, y_train))
print("测试集准确率：", best_model.score(X_test, y_test))

训练集准确率： 0.9574537718867616
测试集准确率： 0.899869109947644


In [12]:
import joblib
# 保存模型
joblib.dump(best_model , 'XGBoost2020.pkl')

['XGBoost2020.pkl']