In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

# Load the feature-engineered order table.
date_column = "order_date"  # name of the date column
file_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/feature_engineering2015-201901_data0622.csv"
data = pd.read_csv(file_path)

# Parse the date column into datetimes.
data[date_column] = pd.to_datetime(data[date_column])

# Remove duplicated rows, then drop the date column.
data = data.drop_duplicates().drop(columns=['order_date'])

# Keep only rows whose D value is greater than 60 and export the result.
data = data.loc[data['D'] > 60]
data.to_csv("D:/test0626.csv", index=False)

In [None]:
def preprocess_data(data, encoder=None, fit_encoder=True):
    """One-hot encode the categorical columns and clean the order table.

    Steps, in order: one-hot encode ``season`` and ``month_phase``; cast
    ``is_holiday`` / ``is_promo`` to bool when those columns exist; drop
    ``order_date``; keep rows with ``D > 60``; drop ``Unnamed: 0``,
    ``item_price`` and ``sales_chan_name``; drop rows whose
    ``sales_region_code`` is missing. The frame's shape is printed after the
    encoding, the bool cast, the ``D`` filter, the column drop and the final
    ``dropna``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. The caller's frame is not modified.
    encoder : object or None
        Only used when ``fit_encoder`` is False. Must provide ``transform``
        and ``get_feature_names_out``.
    fit_encoder : bool
        When True a new ``OneHotEncoder`` is fitted on ``data``; otherwise the
        supplied ``encoder`` is applied.

    Returns
    -------
    tuple
        ``(processed_frame, encoder)``.

    Raises
    ------
    ValueError
        If ``fit_encoder`` is False and no encoder is supplied.
    """
    categorical_columns = ['season', 'month_phase']

    if fit_encoder:
        try:
            # Newer scikit-learn spells the dense-output switch ``sparse_output``.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older scikit-learn only knows the ``sparse`` keyword.
            encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoded = encoder.fit_transform(data[categorical_columns])
    else:
        if encoder is None:
            raise ValueError("encoder must be provided when fit_encoder is False")
        encoded = encoder.transform(data[categorical_columns])

    # Reuse the input index: the column-wise concat below aligns on index
    # labels, so a default 0..n-1 index would misalign rows (and add all-NaN
    # rows) whenever ``data`` has gaps in its index, e.g. after drop_duplicates.
    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=data.index,
    )
    data = pd.concat([data.drop(columns=categorical_columns), encoded_df], axis=1)
    print(data.shape)

    # Make sure the flag columns are boolean.
    for flag_column in ('is_holiday', 'is_promo'):
        if flag_column in data.columns:
            data[flag_column] = data[flag_column].astype(bool)
    print(data.shape)

    # Drop the date column, then keep only rows with D greater than 60.
    data = data.drop(columns=['order_date'])
    data = data[data['D'] > 60]
    print(data.shape)

    # Drop columns that are not used downstream (new frame, no in-place
    # mutation of the filtered slice).
    columns_to_drop = ['Unnamed: 0', 'item_price', 'sales_chan_name']
    data = data.drop(columns=columns_to_drop)
    print(data.shape)

    # Drop rows without a sales region code.
    data = data.dropna(subset=['sales_region_code'])
    print(data.shape)
    return data, encoder

# Read the feature-engineered table and parse its date column.
date_column = "order_date"  # name of the date column
file_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/feature_engineering2015-201901_data0622.csv"
data = pd.read_csv(file_path)
data[date_column] = pd.to_datetime(data[date_column])

# Remove duplicated rows.
data = data.drop_duplicates()

# Run the preprocessing; the encoder it returns is saved for later reuse.
processed_data, encoder = preprocess_data(data)
print("Preprocessing is done!")

# Write the processed table and the encoder to disk.
processed_data.to_csv(
    "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3preprocessed_02_data0626.csv",
    index=False,
)
joblib.dump(
    encoder,
    "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3encoder0626.pkl",
)
print("Processed data and encoder are saved.")


In [None]:
# Reload the preprocessed table and the saved encoder from disk.
file_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3preprocessed_02_data0626.csv"
data = pd.read_csv(file_path)
encoder = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3encoder0626.pkl")

# Split thresholds, expressed as offsets from the largest D value:
# the test window covers the last 41 D values, the validation window starts
# 70 + 140 + 100 D values before the maximum.
max_date = data['D'].max()
test_start_date = max_date - 41
valid_start_date = max_date - 70 - 140 - 100

# Empty frames that will receive the predictions.
test, valid = pd.DataFrame(), pd.DataFrame()


In [None]:
import pandas as pd

def construct_date(year, month, day):
    """Assemble datetimes from separate year, month and day components."""
    components = {"year": year, "month": month, "day": day}
    return pd.to_datetime(components)

def get_date_range(df):
    """Return the (earliest, latest) value of the ``constructed_date`` column."""
    dates = df['constructed_date']
    return dates.min(), dates.max()

def main():
    """Print the date span and row count of the train / validation / test splits.

    The preprocessed CSV is read from a fixed path, a ``constructed_date``
    column is built from ``Year`` / ``Month`` / ``day``, and the rows are split
    on ``D`` with the same thresholds used for modelling. A split without any
    usable date is reported as having no data instead of raising on
    ``NaT.date()``.
    """
    # Load the preprocessed table.
    file_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3preprocessed_02_data0626.csv"
    data = pd.read_csv(file_path)

    # Build a real date column from the year / month / day parts.
    data['constructed_date'] = construct_date(data['Year'], data['Month'], data['day'])

    # Split thresholds relative to the largest D value.
    max_date = data['D'].max()
    valid_start_date = max_date - 70 - 140 - 100
    test_start_date = max_date - 41

    day_index = data['D']
    splits = [
        ("训练集", data[day_index < valid_start_date]),
        ("验证集", data[(day_index >= valid_start_date) & (day_index < test_start_date)]),
        ("测试集", data[day_index >= test_start_date]),
    ]

    for label, split_df in splits:
        first_date, last_date = get_date_range(split_df)
        if pd.isna(first_date) or pd.isna(last_date):
            # min()/max() of an empty (or all-missing) column is NaT, and
            # NaT.date() raises, so report the gap explicitly.
            print(f"{label}日期范围: 无数据")
        else:
            print(f"{label}日期范围: 从{first_date.date()}到{last_date.date()}")
        print(f"{label}数据数量: {len(split_df)}条")


if __name__ == "__main__":
    main()


In [None]:
def plot_logloss(model, store):
    """Plot the RMSE history stored in ``model.evals_result_``.

    Reads the ``'training'`` and ``'valid_1'`` entries (metric ``'rmse'``) and
    draws one curve per entry, titled with ``store``.
    """
    history = model.evals_result_
    _, ax = plt.subplots(figsize=(10, 6))
    for eval_name, curve_label in (('training', 'Train RMSE'), ('valid_1', 'Test RMSE')):
        ax.plot(history[eval_name]['rmse'], label=curve_label)
    ax.set_title('RMSE Log for Store {}'.format(store))
    ax.set_xlabel('Iterations')
    ax.set_ylabel('RMSE')
    ax.legend()
    plt.show()
    
def plot_predictions(y_true, y_pred, title, num_points=500):
    """Plot the first ``num_points`` true values against the predictions.

    Predictions are floored at zero before plotting. ``y_true`` must expose
    ``.values`` (for example a pandas Series).
    """
    y_pred = np.maximum(y_pred, 0)
    sample_count = len(y_true)
    x_axis = np.linspace(1, sample_count, sample_count)
    window = slice(None, num_points)

    _, ax = plt.subplots(dpi=300, figsize=(28, 12))
    ax.plot(x_axis[window], y_true.values[window], color='blue', label='True')
    ax.plot(x_axis[window], y_pred[window], color='red', linestyle='--', label='Prediction')
    ax.legend(prop={'size': 20})
    ax.set_xlabel('Time', fontsize=20)
    ax.set_ylabel('Order Quantity', fontsize=20)
    ax.set_title(title, fontdict={'family': 'SimSun', 'weight': 'normal', 'size': 20})
    ax.tick_params(labelsize=20)
    ax.grid(True)
    plt.show()

def train_and_evaluate(model, x_train, x_valid, y_train, y_valid, store):
    """Fit ``model`` and score it on the training and validation data.

    The model is fitted with both data sets registered as ``eval_set`` (metric
    ``rmse``). A true-vs-predicted chart of the validation data is drawn with
    ``plot_predictions``.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., eval_metric=...)`` and ``predict``.
    x_train, x_valid : pandas.DataFrame
        Feature tables.
    y_train, y_valid : pandas.Series
        Targets.
    store
        Identifier shown in the chart title.

    Returns
    -------
    dict
        ``{"train": {...}, "valid": {...}}``; each inner dict has the keys
        ``MSE``, ``MAE``, ``R2``, ``MAPE`` (percent) and ``RMSE``.
    """
    def _score(y_true, y_pred):
        """Compute the five regression metrics for one set of predictions."""
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        mse = mean_squared_error(actual, predicted)
        # MAPE divides by the target, so rows with a zero target are left out;
        # otherwise a single zero turns the whole metric into inf/NaN.
        nonzero = actual != 0
        if nonzero.any():
            mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
        else:
            mape = float('nan')
        return {
            "MSE": mse,
            "MAE": mean_absolute_error(actual, predicted),
            "R2": r2_score(actual, predicted),
            "MAPE": mape,
            "RMSE": np.sqrt(mse),
        }

    model.fit(x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)], eval_metric='rmse')

    # Score the training data.
    train_metrics = _score(y_train, model.predict(x_train))

    # Score the validation data.
    y_valid_pred = model.predict(x_valid)
    valid_metrics = _score(y_valid, y_valid_pred)

    # The chart shows the validation data, so the title says "验证集"
    # (validation set) rather than "测试集" (test set).
    plot_predictions(y_valid, y_valid_pred, f'真实值与预测值对比 for Store {store} (验证集)', num_points=500)

    return {"train": train_metrics, "valid": valid_metrics}

In [None]:
# Train one LightGBM model per sales region.
# Per-region predictions are collected in lists and concatenated once at the
# end, so `test` / `valid` always hold exactly this run's LightGBM predictions
# (re-running the cell no longer appends a second copy to the old frames).
states = list(set(data['sales_region_code']))
test_parts, valid_parts = [], []
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Split on D: train < valid_start_date <= valid < test_start_date <= test.
        in_train = df['D'] < valid_start_date
        in_valid = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        in_test = df['D'] >= test_start_date
        features = df.drop('ord_qty', axis=1)
        X_train, y_train = features[in_train], df.loc[in_train, 'ord_qty']
        X_valid, y_valid = features[in_valid], df.loc[in_valid, 'ord_qty']
        X_test = features[in_test]

        # Train and validate the model.
        model = LGBMRegressor(
            n_estimators=1000,
            learning_rate=0.3,
            subsample=0.8,
            colsample_bytree=0.8,
            max_depth=8,
            num_leaves=50,
            min_child_weight=300,
            verbose=-1
        )
        print('*****Prediction for sales region: {}*****'.format(store))
        evaluation_results = train_and_evaluate(model, X_train, X_valid, y_train, y_valid, store)
        print("Training set evaluation:", evaluation_results["train"])
        print("Validation set evaluation:", evaluation_results["valid"])

        # Predictions are floored at zero.
        valid_preds = pd.Series(np.maximum(model.predict(X_valid), 0), index=X_valid.index)
        eval_preds = pd.Series(np.maximum(model.predict(X_test), 0), index=X_test.index)

        # Save the model and plot its RMSE history.
        filename = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3model0626_" + str(store) + ".pkl"
        joblib.dump(model, filename)
        plot_logloss(model, store)

        # Keep this region's predictions.
        test_parts.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_parts.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        # Release the per-region objects before the next iteration.
        del model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next region.
        print(f"Error for store {store}: {e}")
        continue

test = pd.concat(test_parts) if test_parts else pd.DataFrame()
valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

In [None]:
def _day_to_date(day_offset):
    """Convert a D value into the calendar date that many days after 2015-09-01."""
    return (pd.to_datetime('2015-09-01') + pd.to_timedelta(day_offset, unit='D')).date()


# Look up the date and the identifier columns of every predicted row in `data`
# (matched on the row index) and add them to both prediction frames.
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for frame in (test, valid):
    frame['order_date'] = data.loc[frame.index, 'D'].apply(_day_to_date)
for frame in (test, valid):
    frame[id_columns] = data.loc[frame.index, id_columns]

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3_0626predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/test3_0626valid_2019_data.csv", index=False)
print("预测结果已保存到文件 predicted_2019_data.csv")



In [None]:
################# The four models below ########################################
################# RF (random forest), XGBoost, CatBoost, decision tree (DT)

import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
import joblib
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder

def plot_logloss(model, store, model_name):
    """Plot every RMSE curve recorded in ``model.evals_result_``.

    The previous version always read ``validation_0`` and ``validation_1``.
    ``train_and_evaluate`` in this notebook registers a single evaluation set,
    so that lookup raised ``KeyError`` and labelled the only curve as training
    data. This version plots whichever evaluation sets are present.

    Parameters
    ----------
    model : fitted estimator
        Its ``evals_result_`` attribute, if any, is expected to be a mapping
        ``{eval_set_name: {metric_name: [values...]}}``.
    store
        Identifier shown in the title.
    model_name : str
        Model family name shown in the title and in messages.
    """
    if not hasattr(model, 'evals_result_'):
        print(f"No evals_result_ available for {model_name}")
        return

    evals_result = model.evals_result_

    # Collect one RMSE curve per evaluation set; the metric name is matched
    # case-insensitively because libraries do not agree on its spelling.
    curves = {}
    for eval_name, metrics in evals_result.items():
        for metric_name, values in metrics.items():
            if str(metric_name).lower() == 'rmse':
                curves[eval_name] = values
                break

    if not curves:
        print(f"No RMSE history recorded for {model_name}")
        return

    # Keep the original legend when exactly the two historical sets exist;
    # otherwise label each curve with the name it was recorded under.
    if set(curves) == {'validation_0', 'validation_1'}:
        labels = {'validation_0': 'Train RMSE', 'validation_1': 'Valid RMSE'}
    else:
        labels = {eval_name: f'{eval_name} RMSE' for eval_name in curves}

    plt.figure(figsize=(10, 6))
    for eval_name, values in curves.items():
        plt.plot(values, label=labels[eval_name])
    plt.title('RMSE Log for Store {} with {}'.format(store, model_name))
    plt.xlabel('Iterations')
    plt.ylabel('RMSE')
    plt.legend()
    plt.show()

def plot_predictions(y_true, y_pred, title, num_points=500):
    """Chart true values and zero-floored predictions for the first ``num_points`` rows.

    ``y_true`` must expose ``.values`` (for example a pandas Series).
    """
    clipped = np.maximum(y_pred, 0)
    total = len(y_true)
    positions = np.linspace(1, total, total)[:num_points]

    plt.figure(dpi=300, figsize=(28, 12))
    line_specs = (
        (y_true.values[:num_points], {'color': 'blue', 'label': 'True'}),
        (clipped[:num_points], {'color': 'red', 'linestyle': '--', 'label': 'Prediction'}),
    )
    for values, line_kwargs in line_specs:
        plt.plot(positions, values, **line_kwargs)

    plt.legend(prop={'size': 20})
    plt.xlabel('Time', fontsize=20)
    plt.ylabel('Order Quantity', fontsize=20)
    plt.title(title, fontdict={'family': 'SimSun', 'weight': 'normal', 'size': 20})
    plt.tick_params(labelsize=20)
    plt.grid(True)
    plt.show()

def train_and_evaluate(model, x_train, x_valid, y_train, y_valid, store, model_name):
    """Fit ``model`` according to its family and score train / validation data.

    ``"XGBoost"`` and ``"CatBoost"`` are fitted with the validation data as
    ``eval_set``; both refuse empty inputs with a ``ValueError``. Any other
    ``model_name`` is fitted on features whose missing values are replaced by
    the training column means.

    Parameters
    ----------
    model : estimator
        Unfitted regressor.
    x_train, x_valid : pandas.DataFrame
        Feature tables.
    y_train, y_valid : pandas.Series
        Targets.
    store
        Identifier shown in the chart title and error messages.
    model_name : str
        Selects the fitting branch.

    Returns
    -------
    dict
        ``{"train": {...}, "valid": {...}}``; each inner dict has the keys
        ``MSE``, ``MAE``, ``R2``, ``MAPE`` (percent) and ``RMSE``.

    Raises
    ------
    ValueError
        For XGBoost / CatBoost when the training or validation data is empty.
    """
    def _score(y_true, y_pred):
        """Compute the five regression metrics for one set of predictions."""
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        mse = mean_squared_error(actual, predicted)
        # MAPE divides by the target, so rows with a zero target are left out;
        # otherwise a single zero turns the whole metric into inf/NaN.
        nonzero = actual != 0
        if nonzero.any():
            mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
        else:
            mape = float('nan')
        return {
            "MSE": mse,
            "MAE": mean_absolute_error(actual, predicted),
            "R2": r2_score(actual, predicted),
            "MAPE": mape,
            "RMSE": np.sqrt(mse),
        }

    if model_name in ("XGBoost", "CatBoost"):
        # Same guard for both boosted families (CatBoost previously had none here).
        if x_train.empty or x_valid.empty:
            raise ValueError(f"No samples available for store {store} with {model_name}")
        model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)
    else:
        # Fill missing values with statistics learned from the training data
        # only, so the validation features are not imputed from themselves.
        train_means = x_train.mean()
        x_train = x_train.fillna(train_means)
        x_valid = x_valid.fillna(train_means)
        model.fit(x_train, y_train)

    # Score the training data.
    train_metrics = _score(y_train, model.predict(x_train))

    # Score the validation data.
    y_valid_pred = model.predict(x_valid)
    valid_metrics = _score(y_valid, y_valid_pred)

    # The chart shows the validation data, so the title says "验证集"
    # (validation set) rather than "测试集" (test set).
    plot_predictions(y_valid, y_valid_pred, f'真实值与预测值对比 for Store {store} with {model_name} (验证集)', num_points=500)

    return {"train": train_metrics, "valid": valid_metrics}


In [None]:
# Train one RandomForest model per sales region.
# The prediction collectors start empty in this cell so the exported files
# contain only RandomForest predictions, not rows left in `test` / `valid` by
# a model cell that ran earlier.
test_parts, valid_parts = [], []
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Split on D: train < valid_start_date <= valid < test_start_date <= test.
        in_train = df['D'] < valid_start_date
        in_valid = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        in_test = df['D'] >= test_start_date
        features = df.drop('ord_qty', axis=1)
        X_train, y_train = features[in_train], df.loc[in_train, 'ord_qty']
        X_valid, y_valid = features[in_valid], df.loc[in_valid, 'ord_qty']
        X_test = features[in_test]

        # Impute once, here, with the training column means. train_and_evaluate
        # only fills its own local copies, so without this step the predictions
        # below would be made on frames that still contain missing values.
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_valid = X_valid.fillna(train_means)
        X_test = X_test.fillna(train_means)

        # Train and validate the model.
        model = RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_split=2, min_samples_leaf=1, random_state=42, n_jobs=-1)
        print(f'*****Prediction for sales region: {store} with RandomForest*****')
        evaluation_results = train_and_evaluate(model, X_train, X_valid, y_train, y_valid, store, "RandomForest")
        print("Training set evaluation:", evaluation_results["train"])
        print("Validation set evaluation:", evaluation_results["valid"])

        # Predictions are floored at zero.
        valid_preds = pd.Series(np.maximum(model.predict(X_valid), 0), index=X_valid.index)
        eval_preds = pd.Series(np.maximum(model.predict(X_test), 0), index=X_test.index)

        # Save the model.
        filename = f"D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/RandomForest_model_{store}.pkl"
        joblib.dump(model, filename)

        # Keep this region's predictions.
        test_parts.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_parts.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        # Release the per-region objects before the next iteration.
        del model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next region.
        print(f"Error for store {store} with RandomForest: {e}")
        continue

test = pd.concat(test_parts) if test_parts else pd.DataFrame()
valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

# Add the calendar date (D days after 2015-09-01) and the identifier columns,
# looked up in `data` by row index.
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for frame in (test, valid):
    frame['order_date'] = data.loc[frame.index, 'D'].apply(lambda x: (pd.to_datetime('2015-09-01') + pd.to_timedelta(x, unit='D')).date())
    frame[id_columns] = data.loc[frame.index, id_columns]

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/RF_predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/RF_valid_2019_data.csv", index=False)
print("预测结果已保存到文件 predicted_2019_data.csv")


In [None]:
# Train one XGBoost model per sales region.
# The prediction collectors start empty in this cell so the exported files
# contain only XGBoost predictions, not rows left in `test` / `valid` by a
# model cell that ran earlier.
test_parts, valid_parts = [], []
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Split on D: train < valid_start_date <= valid < test_start_date <= test.
        in_train = df['D'] < valid_start_date
        in_valid = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        in_test = df['D'] >= test_start_date
        features = df.drop('ord_qty', axis=1)
        X_train, y_train = features[in_train], df.loc[in_train, 'ord_qty']
        X_valid, y_valid = features[in_valid], df.loc[in_valid, 'ord_qty']
        X_test = features[in_test]

        if X_train.empty or X_valid.empty:
            print(f"No samples available for store {store} with XGBoost")
            continue

        # Train and validate the model.
        model = XGBRegressor(n_estimators=1000, learning_rate=0.1, max_depth=6, subsample=0.8, colsample_bytree=0.8, random_state=42, n_jobs=-1, eval_metric='rmse')
        print(f'*****Prediction for sales region: {store} with XGBoost*****')
        evaluation_results = train_and_evaluate(model, X_train, X_valid, y_train, y_valid, store, "XGBoost")
        print("Training set evaluation:", evaluation_results["train"])
        print("Validation set evaluation:", evaluation_results["valid"])

        # Predictions are floored at zero.
        valid_preds = pd.Series(np.maximum(model.predict(X_valid), 0), index=X_valid.index)
        eval_preds = pd.Series(np.maximum(model.predict(X_test), 0), index=X_test.index)

        # Save the model.
        filename = f"D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/XGBoost_model_{store}.pkl"
        joblib.dump(model, filename)

        # Loss-curve plot (disabled in the original notebook).
        #plot_logloss2(model, store, "XGBoost")

        # Keep this region's predictions.
        test_parts.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_parts.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        # Release the per-region objects before the next iteration.
        del model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next region.
        print(f"Error for store {store} with XGBoost: {e}")
        continue

test = pd.concat(test_parts) if test_parts else pd.DataFrame()
valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

# Add the calendar date (D days after 2015-09-01) and the identifier columns,
# looked up in `data` by row index.
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for frame in (test, valid):
    frame['order_date'] = data.loc[frame.index, 'D'].apply(lambda x: (pd.to_datetime('2015-09-01') + pd.to_timedelta(x, unit='D')).date())
    frame[id_columns] = data.loc[frame.index, id_columns]

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/XGBoost_predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/XGBoost_valid_2019_data.csv", index=False)
print("预测结果已保存到文件 predicted_2019_data.csv")


In [None]:
def train_and_evaluate(model, x_train, x_valid, y_train, y_valid, store, model_name):
    """Fit ``model`` according to its family and score train / validation data.

    ``"XGBoost"`` and ``"CatBoost"`` are fitted with the validation data as
    ``eval_set`` and refuse empty inputs with a ``ValueError``. Any other
    ``model_name`` is fitted on features whose missing values are replaced by
    the training column means.

    Parameters
    ----------
    model : estimator
        Unfitted regressor.
    x_train, x_valid : pandas.DataFrame
        Feature tables.
    y_train, y_valid : pandas.Series
        Targets.
    store
        Identifier shown in the chart title and error messages.
    model_name : str
        Selects the fitting branch.

    Returns
    -------
    dict
        ``{"train": {...}, "valid": {...}}``; each inner dict has the keys
        ``MSE``, ``MAE``, ``R2``, ``MAPE`` (percent) and ``RMSE``.

    Raises
    ------
    ValueError
        For XGBoost / CatBoost when required data is empty.
    """
    def _score(y_true, y_pred):
        """Compute the five regression metrics for one set of predictions."""
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        mse = mean_squared_error(actual, predicted)
        # MAPE divides by the target, so rows with a zero target are left out;
        # otherwise a single zero turns the whole metric into inf/NaN.
        nonzero = actual != 0
        if nonzero.any():
            mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
        else:
            mape = float('nan')
        return {
            "MSE": mse,
            "MAE": mean_absolute_error(actual, predicted),
            "R2": r2_score(actual, predicted),
            "MAPE": mape,
            "RMSE": np.sqrt(mse),
        }

    if model_name in ["XGBoost"]:
        if x_train.empty or x_valid.empty:
            raise ValueError(f"No samples available for store {store} with {model_name}")
        model.fit(x_train, y_train, eval_set=[(x_valid, y_valid)], verbose=False)
    elif model_name in ["CatBoost"]:
        if x_train.empty or y_train.empty or x_valid.empty or y_valid.empty:
            raise ValueError(f"No samples available for store {store} with {model_name}")
        model.fit(x_train, y_train, eval_set=(x_valid, y_valid), verbose=False)
    else:
        # Fill missing values with statistics learned from the training data
        # only, so the validation features are not imputed from themselves.
        train_means = x_train.mean()
        x_train = x_train.fillna(train_means)
        x_valid = x_valid.fillna(train_means)
        model.fit(x_train, y_train)

    # Score the training data.
    train_metrics = _score(y_train, model.predict(x_train))

    # Score the validation data.
    y_valid_pred = model.predict(x_valid)
    valid_metrics = _score(y_valid, y_valid_pred)

    # Chart of true vs. predicted values on the validation data.
    plot_predictions(y_valid, y_valid_pred, f'真实值与预测值对比 for Store {store} with {model_name} (验证集)', num_points=500)

    return {"train": train_metrics, "valid": valid_metrics}


# Train one CatBoost model per sales region.
# The prediction collectors start empty in this cell so the exported files
# contain only CatBoost predictions, not rows left in `test` / `valid` by a
# model cell that ran earlier.
test_parts, valid_parts = [], []
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Split on D: train < valid_start_date <= valid < test_start_date <= test.
        in_train = df['D'] < valid_start_date
        in_valid = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        in_test = df['D'] >= test_start_date
        features = df.drop('ord_qty', axis=1)
        X_train, y_train = features[in_train], df.loc[in_train, 'ord_qty']
        X_valid, y_valid = features[in_valid], df.loc[in_valid, 'ord_qty']
        X_test = features[in_test]

        # Skip regions without training or validation rows.
        if X_train.empty or y_train.empty or X_valid.empty or y_valid.empty:
            print(f"No samples available for store {store} with CatBoost")
            continue

        # Train and validate the model.
        model = CatBoostRegressor(iterations=1000, learning_rate=0.1, depth=6, random_seed=42, verbose=0)
        print(f'*****Prediction for sales region: {store} with CatBoost*****')
        evaluation_results = train_and_evaluate(model, X_train, X_valid, y_train, y_valid, store, "CatBoost")
        print("Training set evaluation:", evaluation_results["train"])
        print("Validation set evaluation:", evaluation_results["valid"])

        # Predictions are floored at zero.
        valid_preds = pd.Series(np.maximum(model.predict(X_valid), 0), index=X_valid.index)
        eval_preds = pd.Series(np.maximum(model.predict(X_test), 0), index=X_test.index)

        # Save the model.
        filename = f"D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/CatBoost_model_{store}.pkl"
        joblib.dump(model, filename)

        # Keep this region's predictions.
        test_parts.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_parts.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        # Release the per-region objects before the next iteration.
        del model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next region.
        print(f"Error for store {store} with CatBoost: {e}")
        continue

test = pd.concat(test_parts) if test_parts else pd.DataFrame()
valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

# Add the calendar date (D days after 2015-09-01) and the identifier columns,
# looked up in `data` by row index.
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for frame in (test, valid):
    frame['order_date'] = data.loc[frame.index, 'D'].apply(lambda x: (pd.to_datetime('2015-09-01') + pd.to_timedelta(x, unit='D')).date())
    frame[id_columns] = data.loc[frame.index, id_columns]

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Cat_predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Cat_valid_2019_data.csv", index=False)
print("预测结果已保存到文件 predicted_2019_data.csv")


In [None]:

import pandas as pd

# Read the CatBoost predictions.
predicted_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Cat_predicted_2019_data.csv"
predicted_data = pd.read_csv(predicted_data_path)

# Keep only the 2019 rows. The explicit copy makes the fills below write to an
# independent frame instead of a slice of `predicted_data`.
predicted_data['order_date'] = pd.to_datetime(predicted_data['order_date'])
predicted_data_2019 = predicted_data[predicted_data['order_date'].dt.year == 2019].copy()

# Load the original order data and collect the item codes it contains.
file_path = "D:/ProgrammingLeaning/JupyterLearning/order_train1.csv"
original_data = pd.read_csv(file_path)
original_item_codes = set(original_data['item_code'].unique())

# New items are the item codes that do not occur in the original data.
is_new_item = ~predicted_data_2019['item_code'].isin(original_item_codes)
new_items = predicted_data_2019[is_new_item]

# Number of distinct new items.
new_item_count = new_items['item_code'].nunique()

# Replace each new item's prediction with the mean prediction of its
# (sales region, first category, second category) group. The means are computed
# once from the unmodified predictions, so the outcome does not depend on row
# order (a row-by-row loop would mix already-filled values into later means).
group_keys = ['sales_region_code', 'first_cate_code', 'second_cate_code']
group_mean = predicted_data_2019.groupby(group_keys)['predicted_ord_qty'].transform('mean')
fillable = is_new_item & group_mean.notna()
predicted_data_2019.loc[fillable, 'predicted_ord_qty'] = group_mean[fillable]
fill_count = int(fillable.sum())

# Save the filled predictions.
filled_predicted_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Cat_predicted_2019_data_filled2.csv"
predicted_data_2019.to_csv(filled_predicted_data_path, index=False)
print(f"新品需求量已填充并保存到文件 {filled_predicted_data_path}")
print(f"共有 {new_item_count} 种新品，新填充了 {fill_count} 条数据")


In [None]:

import pandas as pd

# Read the prediction file.
predicted_data_path ="D:/ProgrammingLeaning/JupyterLearning/predict_sku1.csv"
predicted_data = pd.read_csv(predicted_data_path)

# No year filter is applied in this cell: the whole file is used as-is.
predicted_data_2019 = predicted_data

# Load the original order data and collect the item codes it contains.
file_path = "D:/ProgrammingLeaning/JupyterLearning/order_train1.csv"
original_data = pd.read_csv(file_path)
original_item_codes = set(original_data['item_code'].unique())

# New items are the item codes that do not occur in the original data.
is_known_item = predicted_data_2019['item_code'].isin(original_item_codes)
new_items = predicted_data_2019.loc[~is_known_item]

# Report the number of distinct new items.
new_item_count = new_items['item_code'].nunique()
print(f"共有 {new_item_count} 种新品")


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Train one decision-tree regressor per sales region.
# The prediction collectors start empty in this cell so the exported files
# contain only DecisionTree predictions, not rows left in `test` / `valid` by
# a model cell that ran earlier.
test_parts, valid_parts = [], []
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Split on D: train < valid_start_date <= valid < test_start_date <= test.
        in_train = df['D'] < valid_start_date
        in_valid = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        in_test = df['D'] >= test_start_date
        features = df.drop('ord_qty', axis=1)
        X_train, y_train = features[in_train], df.loc[in_train, 'ord_qty']
        X_valid, y_valid = features[in_valid], df.loc[in_valid, 'ord_qty']
        X_test = features[in_test]

        if X_train.empty or y_train.empty or X_valid.empty or y_valid.empty:
            print(f"No samples available for store {store} with DecisionTree")
            continue

        # Impute once, here, with the training column means. train_and_evaluate
        # only fills its own local copies, so without this step the predictions
        # below would be made on frames that still contain missing values.
        train_means = X_train.mean()
        X_train = X_train.fillna(train_means)
        X_valid = X_valid.fillna(train_means)
        X_test = X_test.fillna(train_means)

        # Train and validate the model.
        model = DecisionTreeRegressor(max_depth=10, random_state=42)
        print(f'*****Prediction for sales region: {store} with DecisionTree*****')
        evaluation_results = train_and_evaluate(model, X_train, X_valid, y_train, y_valid, store, "DecisionTree")
        print("Training set evaluation:", evaluation_results["train"])
        print("Validation set evaluation:", evaluation_results["valid"])

        # Predictions are floored at zero.
        valid_preds = pd.Series(np.maximum(model.predict(X_valid), 0), index=X_valid.index)
        eval_preds = pd.Series(np.maximum(model.predict(X_test), 0), index=X_test.index)

        # Save the model.
        filename = f"D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/DecisionTree_model_{store}.pkl"
        joblib.dump(model, filename)

        # Keep this region's predictions.
        test_parts.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_parts.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        # Release the per-region objects before the next iteration.
        del model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next region.
        print(f"Error for store {store} with DecisionTree: {e}")
        continue

test = pd.concat(test_parts) if test_parts else pd.DataFrame()
valid = pd.concat(valid_parts) if valid_parts else pd.DataFrame()

# Add the calendar date (D days after 2015-09-01) and the identifier columns,
# looked up in `data` by row index.
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for frame in (test, valid):
    frame['order_date'] = data.loc[frame.index, 'D'].apply(lambda x: (pd.to_datetime('2015-09-01') + pd.to_timedelta(x, unit='D')).date())
    frame[id_columns] = data.loc[frame.index, id_columns]

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/DecisionTree_predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/DecisionTree_valid_2019_data.csv", index=False)
print("预测结果已保存到文件 predicted_2019_data.csv")


In [None]:
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.model_selection import KFold, cross_val_predict
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt
import joblib

# Training and evaluation helper for the stacking model.
def train_and_evaluate_stacking(model, x_train, y_train, x_valid, y_valid, store, cv=5):
    """Score a model with out-of-fold training predictions and a validation fit.

    Training metrics come from ``cross_val_predict`` over ``KFold(n_splits=cv)``;
    the model is then fitted on the full training data and scored on the
    validation data. A true-vs-predicted chart of the validation data is drawn
    with ``plot_predictions``.

    Note the argument order: ``y_train`` comes before ``x_valid`` here.

    Parameters
    ----------
    model : estimator
        Regressor supporting ``fit`` / ``predict``.
    x_train, x_valid : pandas.DataFrame
        Feature tables.
    y_train, y_valid : pandas.Series
        Targets.
    store
        Identifier shown in the chart title.
    cv : int
        Number of folds.

    Returns
    -------
    dict
        ``{"train": {...}, "valid": {...}}``; each inner dict has the keys
        ``MSE``, ``MAE``, ``R2``, ``MAPE`` (percent) and ``RMSE``.
    """
    def _score(y_true, y_pred):
        """Compute the five regression metrics for one set of predictions."""
        actual = np.asarray(y_true, dtype=float)
        predicted = np.asarray(y_pred, dtype=float)
        mse = mean_squared_error(actual, predicted)
        # MAPE divides by the target, so rows with a zero target are left out;
        # otherwise a single zero turns the whole metric into inf/NaN.
        nonzero = actual != 0
        if nonzero.any():
            mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
        else:
            mape = float('nan')
        return {
            "MSE": mse,
            "MAE": mean_absolute_error(actual, predicted),
            "R2": r2_score(actual, predicted),
            "MAPE": mape,
            "RMSE": np.sqrt(mse),
        }

    # Out-of-fold predictions for the training data.
    kf = KFold(n_splits=cv)
    y_train_pred = cross_val_predict(model, x_train, y_train, cv=kf)
    train_metrics = _score(y_train, y_train_pred)

    # Fit on the full training data and predict the validation data.
    model.fit(x_train, y_train)
    y_valid_pred = model.predict(x_valid)
    valid_metrics = _score(y_valid, y_valid_pred)

    # Chart of true vs. predicted values on the validation data.
    plot_predictions(y_valid, y_valid_pred, f'真实值与预测值对比 for Store {store} with Stacking (验证集)', num_points=500)

    return {"train": train_metrics, "valid": valid_metrics}

def plot_predictions(y_true, y_pred, title, num_points=500):
    """Plot the leading ``num_points`` true values against the predictions.

    Predictions are floored at zero before drawing. The x axis is the 1-based
    position of each observation.

    Args:
        y_true: True values; ``.values`` is read from it.
        y_pred: Predicted values, aligned positionally with ``y_true``.
        title: Figure title.
        num_points: Number of leading observations to draw.
    """
    floored_pred = np.maximum(y_pred, 0)
    n_obs = len(y_true)
    positions = np.linspace(1, n_obs, n_obs)
    head = slice(None, num_points)

    plt.figure(dpi=300, figsize=(28, 12))
    axes = plt.gca()
    axes.plot(positions[head], y_true.values[head], color='blue', label='True')
    axes.plot(positions[head], floored_pred[head], color='red', linestyle='--', label='Prediction')
    axes.legend(prop={'size': 20})
    axes.set_xlabel('Time', fontsize=20)
    axes.set_ylabel('Order Quantity', fontsize=20)
    axes.set_title(title, fontdict={'family': 'SimSun', 'weight': 'normal', 'size': 20})
    axes.tick_params(labelsize=20)
    axes.grid(True)
    plt.show()

# Per-store registries of the previously saved base models, keyed by store id.
# (The random-forest registry, rf_models, is commented out in this notebook.)
lgbm_models = {}
xgb_models = {}
catboost_models = {}
dt_models = {}

# (registry, path template) pairs, listed in the order the models are loaded.
_base_model_sources = (
    (lgbm_models, "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/LGBM_model_{store}.pkl"),
    (xgb_models, "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/XGBoost_model_{store}.pkl"),
    (catboost_models, "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/CatBoost_model_{store}.pkl"),
    (dt_models, "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/DecisionTree_model_{store}.pkl"),
)

for store in states:
    try:
        # A failing load aborts the remaining loads for this store only.
        for registry, path_template in _base_model_sources:
            registry[store] = joblib.load(path_template.format(store=store))
    except Exception as e:
        print(f"Error loading model for store {store}: {e}")

# Split boundaries on the day-index column 'D', counted back from the last day:
# rows before valid_start_date train the model, rows from test_start_date on are
# the prediction window, and everything in between is used for validation.
max_date = data['D'].max()
valid_start_date = max_date - 70 - 140 - 100
test_start_date = max_date - 41

# Per-store prediction frames are collected in lists and concatenated once after
# the loop; growing a DataFrame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame is deprecated in recent pandas.
test_frames = []
valid_frames = []

# Train one stacking model per sales region.
for store in states:
    try:
        df = data[data['sales_region_code'] == store]
        print('Processing sales region:', store)
        print('Data length:', len(df))

        # Build each split mask once and reuse it for features and target.
        train_mask = df['D'] < valid_start_date
        valid_mask = (df['D'] >= valid_start_date) & (df['D'] < test_start_date)
        test_mask = df['D'] >= test_start_date

        X_train = df[train_mask].drop('ord_qty', axis=1)
        y_train = df.loc[train_mask, 'ord_qty']
        X_valid = df[valid_mask].drop('ord_qty', axis=1)
        y_valid = df.loc[valid_mask, 'ord_qty']
        X_test = df[test_mask].drop('ord_qty', axis=1)

        if X_train.empty or X_valid.empty:
            print(f"No samples available for store {store} with Stacking")
            continue

        # Base learners for the stack. A KeyError here (model failed to load for
        # this store) is reported by the except clause below.
        # NOTE: per the scikit-learn documentation, StackingRegressor.fit refits
        # clones of these estimators, so the loaded models supply the
        # configuration rather than their fitted state.
        estimators = [
            ('lgbm', lgbm_models[store]),
            ('xgb', xgb_models[store]),
            ('catboost', catboost_models[store]),
            ('dt', dt_models[store])
        ]

        stacking_model = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=100, random_state=42))

        # Fits the model as a side effect and returns train/validation metrics.
        evaluation_results = train_and_evaluate_stacking(stacking_model, X_train, y_train, X_valid, y_valid, store)
        print(f"Training set evaluation for store {store} with Stacking: {evaluation_results['train']}")
        print(f"Validation set evaluation for store {store} with Stacking: {evaluation_results['valid']}")

        # Order quantities cannot be negative, so predictions are floored at zero.
        valid_preds = pd.Series(stacking_model.predict(X_valid), index=X_valid.index).clip(lower=0)

        # predict() rejects an empty feature matrix; without this guard a store
        # with no test rows would lose its validation predictions and its model.
        eval_preds = None
        if X_test.empty:
            print(f"No test rows for store {store}; skipping test predictions")
        else:
            eval_preds = pd.Series(stacking_model.predict(X_test), index=X_test.index).clip(lower=0)

        # Persist the fitted stacking model for this store.
        filename = f"D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_model_{store}.pkl"
        joblib.dump(stacking_model, filename)

        # Record predictions only after the model was saved successfully.
        if eval_preds is not None:
            test_frames.append(pd.DataFrame({'predicted_ord_qty': eval_preds}))
        valid_frames.append(pd.DataFrame({'predicted_ord_qty': valid_preds}))

        del stacking_model, X_train, y_train, X_valid, y_valid
    except Exception as e:
        # Best effort: report the failure and move on to the next store.
        print(f"Error for store {store} with Stacking: {e}")
        continue

# Assemble the per-store results; fall back to an empty frame with the expected
# column when no store produced predictions.
test = pd.concat(test_frames) if test_frames else pd.DataFrame(columns=['predicted_ord_qty'])
valid = pd.concat(valid_frames) if valid_frames else pd.DataFrame(columns=['predicted_ord_qty'])

# Rebuild calendar dates from the day index (offset in days from 2015-09-01) and
# copy the identifying columns from the source frame, aligned on the row index.
day_origin = pd.to_datetime('2015-09-01')
id_columns = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
for preds in (test, valid):
    day_offsets = pd.to_timedelta(data.loc[preds.index, 'D'], unit='D')
    preds['order_date'] = (day_origin + day_offsets).dt.date
    preds[id_columns] = data.loc[preds.index, id_columns]

# NOTE(review): only the test frame is de-duplicated, as in the original cell.
test = test.drop_duplicates()

# Save the predictions.
test.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_predicted_2019_data.csv", index=False)
valid.to_csv("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_valid_2019_data.csv", index=False)
print("预测结果已保存到文件 Stacking_predicted_2019_data.csv")

In [None]:

import pandas as pd

# Read the stacking predictions.
predicted_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_predicted_2019_data.csv"
predicted_data = pd.read_csv(predicted_data_path)

# Keep only rows dated in 2019. The explicit copy makes the later assignment act
# on an independent frame instead of a filtered slice (avoids SettingWithCopyWarning).
predicted_data['order_date'] = pd.to_datetime(predicted_data['order_date'])
predicted_data_2019 = predicted_data[predicted_data['order_date'].dt.year == 2019].copy()

# Load the original order data.
file_path = "D:/ProgrammingLeaning/JupyterLearning/order_train1.csv"
original_data = pd.read_csv(file_path)

# Item codes present in the original data.
original_item_codes = set(original_data['item_code'].unique())

# New items are rows whose item code does not occur in the original data.
is_new_item = ~predicted_data_2019['item_code'].isin(original_item_codes)
new_items = predicted_data_2019[is_new_item]

# Number of distinct new item codes.
new_item_count = new_items['item_code'].nunique()

# Replace each new item's prediction with the mean prediction of its
# (sales region, first category, second category) group.
# The means are computed once, before anything is overwritten, so the result no
# longer depends on row order (the row-by-row version read means from a frame it
# was modifying). Rows whose group mean is NaN are left untouched.
group_keys = ['sales_region_code', 'first_cate_code', 'second_cate_code']
fill_count = 0
if is_new_item.any():
    group_means = predicted_data_2019.groupby(group_keys)['predicted_ord_qty'].transform('mean')
    fillable = is_new_item & group_means.notna()
    predicted_data_2019.loc[fillable, 'predicted_ord_qty'] = group_means[fillable]
    fill_count = int(fillable.sum())

# Save the filled predictions.
filled_predicted_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_predicted_2019_data_filled.csv"
predicted_data_2019.to_csv(filled_predicted_data_path, index=False)
print(f"新品需求量已填充并保存到文件 {filled_predicted_data_path}")
print(f"共有 {new_item_count} 种新品，新填充了 {fill_count} 条数据")


In [None]:
import matplotlib.pyplot as plt
import joblib
import numpy as np

# Font setup: use the SimHei family and draw minus signs as ASCII hyphens.
plt.rcParams.update({
    'font.family': 'SimHei',
    'axes.unicode_minus': False,
})

# Load the saved models for store 105.0.
lgbm_model = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/LGBM_model_105.0.pkl")
xgb_model = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/XGBoost_model_105.0.pkl")
dt_model = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/DecisionTree_model_105.0.pkl")
catboost_model = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/CatBoost_model_105.0.pkl")
stacking_model = joblib.load("D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_model_105.0.pkl")

# Per-feature importances of each base model.
lgbm_importance = lgbm_model.feature_importances_
xgb_importance = xgb_model.feature_importances_
dt_importance = dt_model.feature_importances_
catboost_importance = catboost_model.get_feature_importance()

# Feature names: every column of the working frame except the target.
features = data.columns.drop('ord_qty')

# 定义绘制特征重要性图的函数
def plot_feature_importance(importances, features, model_name):
    """Draw a bar chart of the ten largest importances of one model.

    When ``importances`` and ``features`` have the same length, the entry named
    'demand_trend' is removed from both before ranking.

    Args:
        importances: Array of importance scores (indexed with boolean and
            integer arrays).
        features: Feature names aligned with ``importances``; must support
            element-wise comparison with a string.
        model_name: Name interpolated into the figure title.
    """
    if len(importances) == len(features):
        keep = features != 'demand_trend'
        importances, features = importances[keep], features[keep]

    # Positions of the ten largest scores, largest first.
    top_idx = np.argsort(importances)[::-1][:10]
    heights = importances[top_idx]
    labels = [features[pos] for pos in top_idx]
    ticks = range(len(heights))

    plt.figure(figsize=(12, 8))
    plt.title(f"{model_name} 特征重要性图", fontsize=16)
    plt.bar(ticks, heights, align="center", color=plt.cm.tab20.colors)
    plt.xticks(ticks, labels, rotation=90, fontsize=14)
    plt.yticks(fontsize=14)
    plt.tight_layout()
    plt.show()

# Plot the feature importances of each base model.
plot_feature_importance(lgbm_importance, features, "LightGBM")
plot_feature_importance(xgb_importance, features, "XGBoost")
plot_feature_importance(dt_importance, features, "Decision Tree")
plot_feature_importance(catboost_importance, features, "CatBoost")

# For the stacking model, use the importances of its final estimator.
stacking_importance = stacking_model.final_estimator_.feature_importances_

# The final estimator is trained on the base models' predictions, so its inputs
# are named after the base estimators, not after the raw feature columns.
# Labelling them with the first N raw feature names would be misleading.
stacking_feature_names = [
    name for name, estimator in stacking_model.estimators
    if not (isinstance(estimator, str) and estimator == 'drop')
]
if getattr(stacking_model, 'passthrough', False):
    # With passthrough enabled the raw features follow the predictions.
    stacking_feature_names.extend(features)
if len(stacking_feature_names) != len(stacking_importance):
    # Unexpected layout: fall back to neutral positional labels.
    stacking_feature_names = [f"meta_feature_{i}" for i in range(len(stacking_importance))]

# An array (not a list) so the element-wise name filter in
# plot_feature_importance works on it.
stacking_features = np.array(stacking_feature_names)

plot_feature_importance(stacking_importance, stacking_features, "Stacking")


In [None]:
import pandas as pd

# Read the final (filled) prediction table.
# It is bound to its own name: rebinding the global `data` here would replace
# the frame that the stacking and feature-importance cells read, so those cells
# would fail when re-run after this one.
predicted_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_predicted_2019_data_filled.csv"
predicted_filled = pd.read_csv(predicted_data_path)

# Total predicted demand per (sales region, item, first category, second category).
summary_keys = ['sales_region_code', 'item_code', 'first_cate_code', 'second_cate_code']
new_data = (
    predicted_filled
    .groupby(summary_keys)['predicted_ord_qty']
    .sum()
    .reset_index()
    .rename(columns={'predicted_ord_qty': '2019年1月预测需求量'})
)

# Save the summary table.
new_data_path = "D:/ProgrammingLeaning/JupyterLearning/产品订单processed_data/Stacking_predicted_2019_data_Monthsummary.csv"
new_data.to_csv(new_data_path, index=False)

print("新的表格已保存到文件 Stacking_predicted_2019_data_Monthsummary.csv")
