## 数据转换

In [178]:
import pandas as pd
import numpy as np

In [179]:
# Load the raw inputs: training features, training labels and test features.
# Paths are relative to the notebook's working directory.
train_feature = pd.read_csv('data/train_feature.csv')
train_label = pd.read_csv('data/train_label.csv')
test_feature = pd.read_csv('data/test_feature.csv')

In [180]:
# Display the column names of the training feature table.
train_feature.columns

Index(['日期', '时刻', '辐照度', '风速', '风向', '温度', '湿度', '气压'], dtype='object')

In [181]:
def data_process(data, slots_per_day=8):
    """Pivot a long table (several rows per date) into one wide row per date.

    Rows are consumed positionally in consecutive groups of ``slots_per_day``:
    the row at offset ``i`` inside each group is treated as slot ``i``. For
    every slot the ``时刻`` column is dropped, every remaining column except
    ``日期`` gets the suffix ``_<slot>``, and the per-slot tables are
    inner-joined on ``日期``.

    Parameters
    ----------
    data : pandas.DataFrame
        Long table containing at least the ``日期`` and ``时刻`` columns.
        NOTE(review): the positional grouping presumably relies on the rows
        being ordered date by date with exactly ``slots_per_day`` rows per
        date -- confirm against the input files.
    slots_per_day : int, default 8
        Number of consecutive rows that make up one group.

    Returns
    -------
    pandas.DataFrame
        One row per ``日期`` value shared by all slots, with columns
        ``日期`` followed by ``<column>_<slot>`` for each slot.

    Raises
    ------
    ValueError
        If ``slots_per_day`` is smaller than 1.
    """
    if slots_per_day < 1:
        raise ValueError('slots_per_day must be >= 1, got %r' % (slots_per_day,))

    key_col, slot_col = '日期', '时刻'

    # Trailing rows that do not fill a complete group are ignored.
    usable_rows = (data.shape[0] // slots_per_day) * slots_per_day

    wide = None
    for slot in range(slots_per_day):
        # A strided slice selects rows slot, slot + n, slot + 2n, ... in a
        # single step; this replaces the quadratic row-by-row
        # DataFrame.append, which pandas 2.0 removed.
        slot_rows = (
            data.iloc[slot:usable_rows:slots_per_day]
            .drop([slot_col], axis=1)
            .reset_index(drop=True)
        )
        slot_rows = slot_rows.rename(
            columns={
                col: '%s_%d' % (col, slot)
                for col in slot_rows.columns
                if col != key_col
            }
        )
        wide = slot_rows if wide is None else pd.merge(wide, slot_rows, on=key_col)
    return wide

In [182]:
# Reshape both feature tables with data_process (one wide row per date).
test_feature_all = data_process(test_feature)
train_feature_all = data_process(train_feature)

In [183]:
# Tag every row with its origin (1 = test, 0 = train) so that split_data can
# separate the two sets again after they have been stacked into one table.
test_feature_all['belong'] = 1
train_feature_all['belong'] = 0

In [184]:
# Attach the labels to the training features by joining on the date column.
train_all = pd.merge(train_feature_all,train_label,on='日期')

In [185]:
# Test rows have no known target; fill the target column with the placeholder -1.
test_feature_all['电场实际太阳辐射指数'] = -1

In [186]:
# Stack the training and test rows into one table. pd.concat replaces
# DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0;
# the original row indices are kept, exactly as append did.
all_data = pd.concat([train_all, test_feature_all])

In [187]:
# Persist the combined table; the row index is not written to the file.
all_data.to_csv('data/all_data.csv',index = False)

## 模型

In [188]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import random
import os
import shutil
import warnings
warnings.filterwarnings('ignore')

In [189]:
# Reload the combined train/test table from disk.
all_data = pd.read_csv('data/all_data.csv')

In [190]:
def split_data(data):
    """Split a combined table into training rows, training target and test rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``belong`` column (0 marks training rows, 1 marks test
        rows) and the target column ``电场实际太阳辐射指数``.

    Returns
    -------
    tuple
        ``(train_all, train_y, test_all)``: the rows with ``belong == 0``
        (all columns, including the target), the target column of those rows,
        and the rows with ``belong == 1``.
    """
    # Operate on the argument instead of the notebook-global `all_data`, so
    # the function returns the split of whatever table it is given.
    train_all = data.loc[data.belong == 0]
    train_y = train_all['电场实际太阳辐射指数']
    test_all = data.loc[data.belong == 1]
    return train_all, train_y, test_all

In [191]:
def lightgbm(all_data):
    """Train a LightGBM regressor with 5-fold cross-validation and write test predictions.

    Side effects (all relative to the current working directory):
    the ``featurescore`` and ``preds`` directories are deleted if present and
    recreated empty; one ``preds/lgb<fold>.csv`` file is written per fold;
    the fold-averaged prediction is written to ``preds/avg_preds.csv`` with
    the columns ``time`` and ``prediction``; progress is printed.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Combined train/test table as accepted by ``split_data``. Every column
        other than ``日期``, ``电场实际太阳辐射指数`` and ``belong`` is used as
        a feature.

    Returns
    -------
    float
        Mean of the per-fold validation MAE values.
    """
    # Start from empty output directories so files from earlier runs cannot
    # be mixed with the results of this run.
    for out_dir in ('featurescore', 'preds'):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        os.mkdir(out_dir)

    train_all, train_y, test_all = split_data(all_data)
    date = test_all['日期'].values

    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
    non_features = ['日期', '电场实际太阳辐射指数', 'belong']
    feats = [feature for feature in train_all.columns.values if feature not in non_features]

    train_x = train_all[feats]
    test_x = test_all[feats]

    print('train_shape',train_x.shape)
    print('test_shape',test_x.shape)

    # Convert once; these arrays do not change between folds.
    train_matrix = np.array(train_x)
    target = np.array(train_y)
    # The splitter is fed a constant dummy label (a single class), so the
    # stratification is driven by these zeros and not by the regression target.
    dummy_labels = np.zeros(train_x.shape[0])

    mae_score = []
    fold_preds = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_x, dummy_labels)):
        x_train, y_train = train_matrix[train_idx], target[train_idx]
        x_valid, y_valid = train_matrix[valid_idx], target[valid_idx]
        print('###################### train!!! ################################')
        gbm = lgb.LGBMRegressor(num_leaves=25,
                                learning_rate=0.014,
                                n_estimators=10000,
                                max_depth=5,                # cap on tree depth; helps against overfitting on small data
                               #min_data_in_leaf=24,         # minimum number of samples in a leaf; can be used against overfitting
                               #min_child_weight=1,         # minimum hessian sum in a leaf; similar to min_data_in_leaf
                               #feature_fraction=0.8,       # below 1.0, LightGBM picks a random subset of features per iteration,
                               #           # e.g. 0.8 selects 80% of the features before each tree is trained
                               #bagging_fraction=0.8,       # like feature_fraction, but samples rows without resampling
                               #           # Note: bagging is only enabled when bagging_freq is non-zero
                               #bagging_freq=8             # bagging frequency; 0 disables it, k bags every k iterations
                               )

        # NOTE(review): LightGBM 4.0 removed the `early_stopping_rounds` and
        # `verbose` fit arguments in favour of callbacks -- confirm the
        # installed version before upgrading.
        gbm.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                eval_metric='mae',
                early_stopping_rounds=100,
                verbose = 100)

        print('###################### valid!!! ##################################')
        y_pred_val = gbm.predict(x_valid)
        mae = mean_absolute_error(y_valid,y_pred_val)
        print('The mae_score is',mae)
        mae_score.append(mae)
        print('###################### predict!!! ################################')
        test_pred_y = gbm.predict(test_x)
        fold_preds.append(test_pred_y)
        test_result = pd.DataFrame({'日期': date, '电场实际太阳辐射指数': test_pred_y})
        test_result.to_csv("./preds/lgb{0}.csv".format(fold),index=False,encoding='utf-8')

    mean_mae = np.mean(mae_score)
    print(mean_mae)

    # Average the fold predictions in memory. Re-reading the per-fold CSVs
    # through os.listdir would depend on the directory contents and on a
    # float -> text -> float round trip.
    avg_pred = np.mean(fold_preds, axis=0)
    pred_new = pd.DataFrame({'time': date, 'prediction': avg_pred})
    pred_new.to_csv('preds/avg_preds.csv',index=False,encoding='utf-8')
    return mean_mae


In [192]:
lightgbm(all_data)

train_shape (2126, 48)
test_shape (915, 48)
###################### train!!! ################################
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.0477051	valid_0's l1: 0.168063
[200]	valid_0's l2: 0.0480502	valid_0's l1: 0.169028
Early stopping, best iteration is:
[138]	valid_0's l2: 0.0474327	valid_0's l1: 0.167456
###################### valid!!! ##################################
The mae_score is 0.16745585309975167
###################### predict!!! ################################
###################### train!!! ################################
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.0517361	valid_0's l1: 0.179366
[200]	valid_0's l2: 0.0509767	valid_0's l1: 0.17655
[300]	valid_0's l2: 0.051111	valid_0's l1: 0.17645
Early stopping, best iteration is:
[200]	valid_0's l2: 0.0509767	valid_0's l1: 0.17655
###################### valid!!! ##################################
The mae_score is 0.176550

0.1702196956636791

In [131]:
from sklearn.model_selection import GridSearchCV
estimator = lgb.LGBMRegressor()
# Each list holds the values to search; the commented-out lines keep the wider
# ranges that were explored before a parameter was pinned.
param_grid = {
    'num_leaves' : [27],
    #'learning_rate' : [i/1000 for i in range(5,15)],
    'learning_rate' : [0.014],
    #'max_depth' : [i for i in range(3,7)],
    'max_depth' : [5],
    #'min_data_in_leaf' :[i for i in range(10,30)],
    'min_data_in_leaf' :[24],
    #'min_child_weight' : [i for i in range(1,4)],
    'min_child_weight' : [3],  # hard to tune, overfits easily
    'feature_fraction' :[0.8],
    'bagging_fraction':  [0.8],
    'bagging_freq' : [8]
}
# split_data returns the full training/test rows, which still contain the
# target column, the `belong` flag and the date. Fitting on them directly
# leaks the target into the features, so the non-feature columns are dropped.
train_rows, train_y, test_rows = split_data(all_data)
non_feature_cols = ['日期', '电场实际太阳辐射指数', 'belong']
train_x = train_rows.drop(columns=non_feature_cols)
test_x = test_rows.drop(columns=non_feature_cols)
gbm = GridSearchCV(estimator, param_grid, cv=5,return_train_score=True)

gbm.fit(train_x, train_y)

GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'num_leaves': [27], 'learning_rate': [0.014], 'max_depth': [5], 'min_data_in_leaf': [24], 'min_child_weight': [3], 'feature_fraction': [0.8], 'bagging_fraction': [0.8], 'bagging_freq': [1, 2, 3, 4, 5, 6, 7, 8, 9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [193]:
# Display the parameter combination selected by the grid search.
gbm.best_params_

{'bagging_fraction': 0.8,
 'bagging_freq': 8,
 'feature_fraction': 0.8,
 'learning_rate': 0.014,
 'max_depth': 5,
 'min_child_weight': 3,
 'min_data_in_leaf': 24,
 'num_leaves': 27}