In [1]:
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PowerTransformer
import math
import warnings
warnings.filterwarnings('ignore')

# Data-handling packages
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold
from scipy.stats import skew
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from mlxtend.regressor import StackingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR

In [2]:
from imbdpack import skew_pro,lr_rmse_ave,lr_rmse_ave_fea,laso_rmse_ave,ElasticNet_rmse_ave,xgb_ave,svr_rmse_ave,rand_ave
from imbdpack import voting_ave,stack_ave,ave

In [3]:
# Number of top-correlated features to keep for the models below.
fea_number=4
# Load the input table (first CSV column becomes the index) and discard the
# 'Number' column.  Change this path to point at your copy of the data.
data=pd.read_csv('./BayesianRidge_Pre_0/Input_A2_024.csv',index_col=0).drop(['Number'], axis=1)
# Target column, read from the frame as loaded (i.e. before skew_pro runs).
y=data.Predict
# skew_pro is a project-local helper from imbdpack.
# NOTE(review): presumably it applies a skewness correction -- confirm there.
skew_data=skew_pro(data)
# Pairwise correlations over the non-object columns only.
num=skew_data.select_dtypes(exclude='object')
numcorr=num.corr()
# Rank features by |correlation with Predict| and keep the top `fea_number`.
# The target's own entry is removed by label instead of assuming it sorts
# first: a feature with |corr| == 1 would tie with it, and the default sort
# gives no ordering guarantee among ties.
title=(
    numcorr['Predict']
    .drop('Predict')
    .abs()
    .sort_values(ascending=False)
    .head(fea_number)
    .index.to_numpy()
)
# Feature matrix: every column of the skew_pro output except the target.
skew_x=skew_data.drop(['Predict'], axis=1)
title

array(['Input_A2_022', 'Input_A2_023', 'Input_A3_024', 'Input_A3_022'],
      dtype=object)

## Linear Regression Baseline

In [4]:
# Linear-regression baseline, evaluated by the imbdpack helper on the frame
# returned by skew_pro.
# NOTE(review): the second argument was the literal 4, which equals
# fea_number; it is presumably the number of features the helper selects --
# confirm in imbdpack.  Using the shared constant keeps the baseline in step
# with the feature selection above.
lr_rmse_ave_fea(skew_data,fea_number)

train_rmse: 0.003489222998819841
test_rmse: 0.003417772033754501
test_r2: 0.9344141176272135


## Lasso

In [5]:
# Evaluate Lasso on the selected feature columns via the imbdpack helper.
# The last argument equals the alpha given to Lasso in the ensemble cell.
# NOTE(review): presumably it is the regularisation strength -- confirm.
laso_rmse_ave(
    skew_data[title],
    y,
    9e-06,
)

test_rmse_ave: 0.003512979588446882
[0.003065687255470588, 0.003840134616679227, 0.0036056611113197906, 0.003302259540611516, 0.004082868698577859, 0.002855172819991618, 0.004039278998293442, 0.0031952380903410224, 0.003991219327767872, 0.0031522754254158874]


test_r2_ave: 0.9323147289206691
[0.9483080616152897, 0.9155957178090541, 0.9383013258659071, 0.9307848464242967, 0.9256150602238853, 0.9493149879750126, 0.9180951781091415, 0.9269575447153832, 0.925347957689839, 0.9448266087788821]


## ElasticNet

In [6]:
# Evaluate ElasticNet on the selected feature columns via the imbdpack helper.
# NOTE(review): the third argument is passed as a one-element list and the
# fourth as a scalar; presumably alpha candidates and l1_ratio -- confirm the
# helper's signature in imbdpack.
ElasticNet_rmse_ave(
    skew_data[title],
    y,
    [0.005],
    0.01,
)

test_rmse_ave: 0.003848843260393845
[0.003285500750275861, 0.003975328275404609, 0.0037687809934287275, 0.003882227465814085, 0.004265407321435517, 0.003431537304458486, 0.004950556204171342, 0.00294131832076943, 0.004712132370766483, 0.0032756435974139104]


test_r2_ave: 0.9184153072310712
[0.9406295606888428, 0.9095481240550942, 0.9325925618745713, 0.9043377183005517, 0.9188151096193398, 0.9267863259329248, 0.8769703585900718, 0.9381053701648998, 0.8959443972358797, 0.9404235458485373]


## SVR

In [7]:
# Linear-kernel support vector regressor.  `gamma` is intentionally not
# passed: scikit-learn only uses it for the rbf/poly/sigmoid kernels, so the
# previous gamma=1e-07 had no effect on this model.
model=SVR(kernel='linear', C=1.3, epsilon=0)
# NOTE(review): the second argument was the literal 4, which equals
# fea_number; it is presumably the number of features the helper selects --
# confirm in imbdpack.
svr_rmse_ave(skew_data,fea_number,model)

train_rmse: 0.0037200815263964634
test_rmse: 0.0038325352436332483
test_r2: 0.9172259700493901


## XGB

In [8]:
# Hyper-parameters of the XGBoost regressor, collected in one mapping so they
# can be read (and compared with the ensemble cell) at a glance.
xgb_params = dict(
    objective='reg:squarederror',
    learning_rate=0.1,
    booster='dart',
    n_estimators=160,
    max_depth=3,
    min_child_weight=2,
    seed=42,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.9,
    reg_alpha=0,
    reg_lambda=1,
)
best_xgb_model = XGBRegressor(**xgb_params)
# Evaluate on the selected feature columns via the imbdpack helper.
xgb_ave(skew_data[title], y, best_xgb_model)

test_rmse_ave: 0.002610116594444569
[0.0020382978556536995, 0.0033939927210324184, 0.002543914919394047, 0.0027353326101223485, 0.001879888575991442, 0.002163334986991561, 0.0035740546294240664, 0.0021585374608674305, 0.00312541664734467, 0.002488395537624009]


test_r2_ave: 0.9610531636955469
[0.9771491470120371, 0.9340684332518828, 0.9692877818170784, 0.9525103393896454, 0.9842304824834702, 0.9709020562071885, 0.9358754781055965, 0.966665949112508, 0.9542231033133646, 0.9656188662626983]


## RandomForest

In [9]:
# Shallow random forest (50 trees, depth <= 4, at most 8 leaves per tree).
# `criterion` is deliberately left at its default: the default is mean
# squared error in every scikit-learn release, whereas the literal 'mse' was
# renamed to 'squared_error' in 1.0 and rejected from 1.2 onwards.
rand_model = RandomForestRegressor(
    n_estimators=50,
    max_depth=4,
    min_samples_split=3,
    max_leaf_nodes=8,
    min_samples_leaf=1,
    random_state=42,
)
# Evaluate on the selected feature columns via the imbdpack helper.
rand_ave(skew_x[title], y, rand_model)

test_rmse_ave: 0.002786821328318193
[0.0020348403991184327, 0.0032690581491539206, 0.0026473960863721453, 0.0028936434983259924, 0.0024657709055810953, 0.0023529392579429874, 0.0039783915667526565, 0.0022150893975997755, 0.0035232157973223415, 0.00248786822501258]


test_r2_ave: 0.956100391487311
[0.9772266026440297, 0.9388330431088736, 0.9667383442257832, 0.9468542147571489, 0.9728693851396321, 0.9655779933476849, 0.920545807451898, 0.9648964183196689, 0.9418286698383145, 0.9656334360400753]


In [10]:
# Candidate base learners for the ensembles.  The hyper-parameters repeat the
# values used in the single-model cells above.
lr = LinearRegression()
lasso_mod=Lasso(alpha=9e-06)
# ElasticNet's alpha must be a scalar; it was previously passed as the
# one-element list [0.005], which would fail as soon as the model is fitted.
elastic_mod=ElasticNet(alpha=0.005, l1_ratio=0.01)
# `gamma` is not passed: scikit-learn ignores it for the linear kernel.
svr_mod=SVR(kernel='linear', C=1.3, epsilon=0)

xgb_mod =  XGBRegressor(
                objective ='reg:squarederror',
              learning_rate = 0.1,
              booster = 'dart', 
              n_estimators = 160, 
              max_depth = 3, 
              min_child_weight = 2,
              seed = 42,
              gamma = 0,
              subsample = 0.7,
              colsample_bytree = 0.9,
              reg_alpha =  0,
              reg_lambda = 1)

# `criterion` is left at its default (mean squared error in every
# scikit-learn release); the literal 'mse' is rejected from 1.2 onwards.
random_mod = RandomForestRegressor( 
            n_estimators = 50,
            max_depth = 4,
            min_samples_split = 3,
            max_leaf_nodes = 8,
            min_samples_leaf =1,
            random_state = 42)  
# Voting ensemble over Lasso, linear regression, XGBoost and random forest.
# svr_mod and elastic_mod are defined above but are not part of this ensemble.
vote_mod = VotingRegressor([ ('Lasso', lasso_mod),('Linear', lr),  
                            ('XGBRegressor', xgb_mod),('RandomForest', random_mod)])

# Evaluate on the selected feature columns via the imbdpack helper.
voting_ave(skew_data[title], y,vote_mod)


test_rmse_ave: 0.002704185938555188
[0.002403256029862572, 0.0031465461103375434, 0.0027604054750586227, 0.0025236863276772836, 0.0030222629042488025, 0.0022757347704848345, 0.0034929978993235586, 0.0017121457242294537, 0.003400568026729827, 0.0023042561175993822]


test_r2_ave: 0.9596125554043088
[0.9682336583557672, 0.9433317522719851, 0.9638380551138066, 0.959575046457933, 0.9592414552356323, 0.9677998390389431, 0.9387510826852514, 0.9790274921785862, 0.9458082170699653, 0.9705189556352165]


In [11]:

# Stacked ensemble: Lasso, the random forest and the voting ensemble feed an
# XGBoost meta-regressor.  use_features_in_secondary=True is mlxtend's switch
# for giving the meta-regressor the original features as well as the base
# predictions.
base_regressors = [lasso_mod, random_mod, vote_mod]
stack_mod = StackingRegressor(
    regressors=base_regressors,
    meta_regressor=xgb_mod,
    use_features_in_secondary=True,
)

# Evaluate on the selected feature columns via the imbdpack helper.
stack_ave(skew_data[title], y, stack_mod)

test_rmse_ave: 0.002062440518825733
[0.0019272740524785895, 0.002201285424402552, 0.0019408354246244596, 0.0024018026784924586, 0.0012008198404014916, 0.0016561831779749998, 0.0034569198409398565, 0.0009680251503574542, 0.0027353257542928216, 0.0021359338442926436]


test_r2_ave: 0.9746767651103913
[0.97957067253401, 0.9722652472299184, 0.982123461004515, 0.9633854728997068, 0.9935655727013941, 0.9829457945398603, 0.9400097895146503, 0.9932958735735868, 0.9649370369048602, 0.9746687302014115]


In [12]:
# Final blend of the stacked, voting and XGBoost models via the imbdpack
# helper.  Each number is paired with the model that follows it.
# NOTE(review): the numbers are presumably blend weights (they sum to 1.0) --
# confirm in imbdpack.ave.
ave(
    skew_data[title], y,
    0.7, stack_mod,
    0.15, vote_mod,
    0.15, xgb_mod,
)

test_rmse_ave: 0.002128845528386522
[0.001925993213121663, 0.0023464606050558226, 0.002033955155151941, 0.002339150175661986, 0.001520240053397548, 0.0017761784715235236, 0.003388466601896847, 0.0010359320576917143, 0.0028203277451966328, 0.0021017512051675416]


test_r2_ave: 0.9736675452888643
[0.9795978176010164, 0.9684863928065801, 0.9803669050764854, 0.9652707832032804, 0.98968715406814, 0.9803850160928209, 0.9423620951033349, 0.9923222943233295, 0.9627239689244481, 0.9754730256892082]
