### 4.2. XGBoost 预测

predict 函数：取 2020-01-11 到 last_date 之间（共 n 天）的各类数据，用前 n - 1 天的数据训练模型，并预测各地最后一天（即 last_date 当天）的新增确诊人数。

In [1]:
from coronavirus_analyzer import CoronavirusAnalyzer
import numpy as np
import pandas as pd
import xgboost as xgb

def _score_predictions(model, X_test, y_test, regions):
    '''
    Build the per-region prediction table for a fitted model and compute its error metrics.

    model     fitted regressor exposing predict(X_test)
    X_test    feature rows, one per entry of regions
    y_test    actual values, one per entry of regions
    regions   region names used as the index of the returned table

    Returns (df_predict, err_rate, err_sum):
    df_predict  DataFrame indexed by region with the columns
                '预测', '实际', '预测差', '预测误差百分比', '权重', '加权预测误差百分比'
    err_rate    sum of |weighted relative error| divided by the number of regions
    err_sum     sum of |prediction - actual| over all regions
    '''
    region_cnt = len(regions)
    df_predict = pd.DataFrame(np.array([model.predict(X_test), y_test]).T,
                              index=regions, columns=['预测', '实际'])
    df_predict['预测差'] = df_predict['预测'] - df_predict['实际']
    df_predict['预测误差百分比'] = df_predict['预测差'] / df_predict['实际']
    # Regions with an actual value of 0 would give inf/NaN above; force their relative error to 0.
    # A single .loc assignment is used because chained indexing (df[col][mask] = 0) may write to
    # a temporary copy and leave the original column untouched.
    df_predict.loc[df_predict['实际'].values == 0, '预测误差百分比'] = 0
    # Each region is weighted by its share of the total actual count, scaled so weights sum to region_cnt.
    df_predict['权重'] = region_cnt * df_predict['实际'] / df_predict['实际'].sum()
    df_predict['加权预测误差百分比'] = df_predict['预测误差百分比'] * df_predict['权重']
    err_rate = abs(df_predict['加权预测误差百分比'].values).sum() / region_cnt
    err_sum = abs(df_predict['预测差'].values).sum()
    return df_predict, err_rate, err_sum


def predict(last_date, use_move=True, use_ma3=True, use_weather=False, omit_hubei=True):
    '''
    Train XGBoost on every day except the last one and predict each region's
    newly confirmed cases for the last day (last_date).

    last_date    last day of data to load (passed to CoronavirusAnalyzer)
    use_move     whether to add the population-flow frame to the features
    use_ma3      whether to add the mean of the previous 3 days' (current day excluded)
                 new confirmed cases to the features
    use_weather  whether to add the weather frame to the features
    omit_hubei   whether to exclude the region '湖北' from training and evaluation

    Prints the best hyper-parameters found and the resulting error metrics.
    Returns a DataFrame indexed by region with the columns
    '预测', '实际', '预测差', '预测误差百分比', '权重', '加权预测误差百分比',
    sorted by '实际' in descending order.

    Raises ValueError when no region is left after applying omit_hubei.

    NOTE: hyper-parameters are selected by scoring on the same last-day rows that the
    final error is reported on, so the reported error is an optimistic estimate.
    '''
    omitted_regions = ['湖北'] if omit_hubei else []

    analyzer = CoronavirusAnalyzer(last_date)
    df_virus_daily_inc_injured = analyzer.del_city_special_regions(analyzer.df_virus_daily_inc_injured)
    df_move_in_injured = analyzer.del_city_regions(analyzer.df_move_in_injured)
    df_weather_ma = analyzer.del_city_special_regions(analyzer.df_weather_ma)
    # Not used as a feature below; kept so the analyzer calls made by this function are unchanged.
    df_virus_daily_injured = analyzer.del_city_special_regions(analyzer.df_virus_daily_injured)
    df_virus_7_days = analyzer.del_city_special_regions(analyzer.df_virus_7_days_inc_injured)
    # NOTE(review): unlike the frames above, this one is built from the unfiltered analyzer frame
    # (no del_city_special_regions) - confirm that this is intended.
    df_daily_inc_ma3 = analyzer.moving_avg(analyzer.df_virus_daily_inc_injured, window=3,
                                           shift=1, keep_shape=True).fillna(0)
    # Promote single-level column indexes to two levels (region, feature name) so that
    # df_trait[region] later selects every feature of a region.
    if not isinstance(df_move_in_injured.columns, pd.MultiIndex):
        df_move_in_injured.columns = pd.MultiIndex.from_product(
            [df_move_in_injured.columns, ['人流风险系数']])
        df_virus_daily_injured.columns = pd.MultiIndex.from_product(
            [df_virus_daily_injured.columns, ['累计确诊']])
        df_daily_inc_ma3.columns = pd.MultiIndex.from_product(
            [df_daily_inc_ma3.columns, ['3日新增均值']])
    # Align every feature frame on the population-flow frame's index, then merge them column-wise.
    index = df_move_in_injured.index
    dfs = []

    # Per the original author's notes: each region's daily new confirmed counts from
    # 1 day ago up to 7 days ago; always used.
    use_data = [df_virus_7_days]
    if use_weather:
        # Per the original author's notes: weighted moving average of weather data from
        # 14 days ago to 3 days ago, with weights 1,2,3,4,5,6,7,8,9,10,9,8.
        use_data.append(df_weather_ma)
    if use_move:
        # Per the original author's notes: confirmed cases within the previous 7 days
        # (current day excluded) used as weights * scale of the inbound population flow.
        use_data.append(df_move_in_injured)
    if use_ma3:
        # Mean of each region's new confirmed cases over the previous 3 days.
        use_data.append(df_daily_inc_ma3)
    for df in use_data:
        for region in omitted_regions:
            try:
                del df[region]
            except KeyError:
                # The region is simply absent from this frame; nothing to remove.
                pass
        if df.shape[0] != index.size:
            df = df.reindex(index)
        dfs.append(df)
    df_trait = pd.concat(dfs, axis=1, sort=False)
    df_trait = df_trait.sort_index(axis=1)

    # Every row but the last is training data; the last row is the day being predicted.
    df_X_train = df_trait.iloc[:-1]
    df_y_train = df_virus_daily_inc_injured.iloc[:-1]
    df_X_test = df_trait.iloc[-1:]
    df_y_test = df_virus_daily_inc_injured.iloc[-1:]

    # Pool all regions into one training set: collect per-region arrays and stack them once.
    regions = []
    X_train_parts, y_train_parts, X_test_parts, y_test_parts = [], [], [], []
    for region in df_y_train.columns:
        if region in omitted_regions:
            continue
        regions.append(region)
        X_train_parts.append(df_X_train[region].values)
        y_train_parts.append(df_y_train[region].values)
        X_test_parts.append(df_X_test[region].values)
        y_test_parts.append(df_y_test[region].values)
    if not regions:
        raise ValueError('no regions left to train on after omitting {}'.format(omitted_regions))
    X_train = np.vstack(X_train_parts)
    y_train = np.hstack(y_train_parts)
    X_test = np.vstack(X_test_parts)
    y_test = np.hstack(y_test_parts)

    # Grid search. Start from +inf so the first finite score is always accepted; a finite
    # sentinel would leave every best_* as None whenever no configuration scored below it.
    min_err_rate = np.inf
    min_err_sum = np.inf
    best_objective = best_max_depth = best_n_estimators = best_learning_rate = None
    objectives = ['reg:gamma']  # 'reg:squarederror', 'count:poisson',
    for objective in objectives:
        for max_depth in range(5, 15):
            for n_estimators in range(80, 210, 10):
                for learning_rate_pct in range(10, 11):
                    learning_rate = learning_rate_pct / 100
                    model = xgb.XGBRegressor(
                        max_depth=max_depth,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        objective=objective)
                    model.fit(X_train, y_train)

                    _, err_rate, err_sum = _score_predictions(model, X_test, y_test, regions)
                    # The configuration with the lowest weighted error rate wins; err_sum is
                    # only recorded alongside it for reporting.
                    if err_rate < min_err_rate:
                        min_err_rate = err_rate
                        min_err_sum = err_sum
                        best_max_depth = max_depth
                        best_n_estimators = n_estimators
                        best_learning_rate = learning_rate
                        best_objective = objective
    print('best_objective：{}，best_max_depth：{}，best_n_estimators：{}，best_learning_rate：{}，'\
          '加权预测误差绝对值的均值：{}，最小误差绝对值总和为：{}。'
          .format(best_objective, best_max_depth, best_n_estimators, best_learning_rate, min_err_rate, min_err_sum))

    print(best_max_depth, best_learning_rate, best_n_estimators, best_objective)
    # Refit with the winning configuration and build the table that is returned.
    model = xgb.XGBRegressor(
        max_depth=best_max_depth,
        learning_rate=best_learning_rate,
        n_estimators=best_n_estimators,
        objective=best_objective)
    model.fit(X_train, y_train)
    df_predict, err_rate, err_sum = _score_predictions(model, X_test, y_test, regions)
    print('预测：{}，预测差绝对值的总和：{}，加权预测误差绝对值的均值：{}'.format(last_date, err_sum, err_rate))
    df_predict = df_predict.sort_values('实际', ascending=False)
    return df_predict

In [2]:
import datetime
# Run predict() for every day from 2020-01-31 up to (but not including) today,
# storing each day's result table under its ISO-formatted date string.
dfs_predict = {}
date = datetime.date(2020, 1, 31)
while date < datetime.date.today():
    str_date = date.isoformat()
    dfs_predict[str_date] = predict(str_date)
    date = date + datetime.timedelta(days=1)

2020-02-03 23:25:33,757 - numexpr.utils - INFO - NumExpr defaulting to 4 threads.


best_objective：reg:gamma，best_max_depth：7，best_n_estimators：80，best_learning_rate：0.1，加权预测误差绝对值的均值：0.3072940796691506，最小误差绝对值总和为：236.6759940981865。
7 0.1 80 reg:gamma
预测：2020-01-31，预测差绝对值的总和：236.6759940981865，加权预测误差绝对值的均值：0.3072940796691506




best_objective：reg:gamma，best_max_depth：6，best_n_estimators：90，best_learning_rate：0.1，加权预测误差绝对值的均值：0.22412647961866888，最小误差绝对值总和为：152.04469604045153。
6 0.1 90 reg:gamma
预测：2020-02-01，预测差绝对值的总和：152.04469604045153，加权预测误差绝对值的均值：0.22412647961866888
best_objective：reg:gamma，best_max_depth：11，best_n_estimators：110，best_learning_rate：0.1，加权预测误差绝对值的均值：0.1571839054152267，最小误差绝对值总和为：116.30051600933075。
11 0.1 110 reg:gamma
预测：2020-02-02，预测差绝对值的总和：116.30051600933075，加权预测误差绝对值的均值：0.1571839054152267


随着训练数据量的增加，各地预测差绝对值的总和（236.68 → 152.04 → 116.30）与加权预测误差绝对值的均值（0.307 → 0.224 → 0.157）都呈下降趋势。

In [3]:
# Display the prediction table stored for 2020-01-31.
dfs_predict['2020-01-31']

Unnamed: 0,预测,实际,预测差,预测误差百分比,权重,加权预测误差百分比
广东,55.856834,127.0,-71.143166,-0.560182,4.97389,-2.786286
河南,76.614006,70.0,6.614006,0.094486,2.741514,0.259034
浙江,61.363144,62.0,-0.636856,-0.010272,2.428198,-0.024942
安徽,62.703529,60.0,2.703529,0.045059,2.349869,0.105882
湖南,55.049698,57.0,-1.950302,-0.034216,2.232376,-0.076383
江西,63.935398,46.0,17.935398,0.3899,1.801567,0.702431
北京,12.292266,35.0,-22.707734,-0.648792,1.370757,-0.889337
江苏,48.448635,34.0,14.448635,0.42496,1.331593,0.565873
重庆,40.439854,32.0,8.439854,0.263745,1.253264,0.330543
四川,34.889519,30.0,4.889519,0.162984,1.174935,0.191496


In [4]:
# Display the prediction table stored for 2020-02-01.
dfs_predict['2020-02-01']

Unnamed: 0,预测,实际,预测差,预测误差百分比,权重,加权预测误差百分比
广东,80.700531,84.0,-3.299469,-0.039279,3.716814,-0.145994
湖南,72.540108,74.0,-1.459892,-0.019728,3.274336,-0.064597
河南,62.119904,71.0,-8.880096,-0.125072,3.141593,-0.392925
浙江,65.194359,62.0,3.194359,0.051522,2.743363,0.141343
江西,53.547897,47.0,6.547897,0.139317,2.079646,0.28973
安徽,72.283012,43.0,29.283012,0.681,1.902655,1.295709
江苏,39.922009,34.0,5.922009,0.174177,1.504425,0.262036
北京,27.608055,27.0,0.608055,0.022521,1.19469,0.026905
重庆,32.278042,24.0,8.278042,0.344918,1.061947,0.366285
四川,35.749939,24.0,11.749939,0.489581,1.061947,0.519909


In [5]:
# Display the prediction table stored for 2020-02-02.
dfs_predict['2020-02-02']

Unnamed: 0,预测,实际,预测差,预测误差百分比,权重,加权预测误差百分比
广东,78.614174,79.0,-0.385826,-0.004884,3.259972,-0.015921
河南,71.541008,73.0,-1.458992,-0.019986,3.01238,-0.060206
安徽,73.296074,68.0,5.296074,0.077883,2.806052,0.218545
浙江,63.887245,63.0,0.887245,0.014083,2.599725,0.036613
湖南,72.685951,58.0,14.685951,0.253206,2.393398,0.606023
江西,61.494251,58.0,3.494251,0.060246,2.393398,0.144192
重庆,26.577255,38.0,-11.422745,-0.300599,1.568088,-0.471365
江苏,33.280811,35.0,-1.719189,-0.04912,1.444292,-0.070943
北京,26.188904,29.0,-2.811096,-0.096934,1.196699,-0.116001
黑龙江,14.789755,23.0,-8.210245,-0.356967,0.949106,-0.3388
