In [4]:
library(tidyverse)

# Here we train our final model using the parameters from before.
grid_search_result <- read.csv("../outputs/B_outputs/B11_lgb_grid_kyoto3.csv")

best_logloss <- grid_search_result[which(grid_search_result$binary_logloss == min(grid_search_result$binary_logloss)), ]

best_auc <- grid_search_result[which(grid_search_result$auc == max(grid_search_result$auc)), ]

best_berror <- grid_search_result[which(grid_search_result$binary_error == min(grid_search_result$binary_error)), ]

best_params <- rbind(best_logloss, best_auc, best_berror)
best_params


── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.3.6      [32m✔[39m [34mpurrr  [39m 0.3.4 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.1      [32m✔[39m [34mstringr[39m 1.4.1 
[32m✔[39m [34mreadr  [39m 2.1.2      [32m✔[39m [34mforcats[39m 0.5.2 
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Unnamed: 0_level_0,boostings,learning_rates,max_bins,num_leaves,max_depth,iteration,binary_logloss,auc,binary_error
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>
105,dart,0.1,25,15,10,187,0.347,0.917,0.153
35,dart,0.01,25,15,-1,996,0.351,0.922,0.152
63,dart,0.1,15,20,-1,57,0.367,0.905,0.151


In [5]:
# params

boosting <- as.character(best_params[2, "boostings"])
learning_rate <- as.numeric(best_params[2, "learning_rate"])
max_bin <- as.numeric(best_params[2, "max_bins"])
num_leaves <- as.numeric(best_params[2, "num_leaves"])
max_depth <- as.numeric(best_params[2, "max_depth"])

seed <- 42

# load data
train_val_set <- read.csv("../outputs/B_outputs/B11_japan_train_val.csv")
test_set <- read.csv("../outputs/B_outputs/B11_japan_test.csv")

feature_names <- c("tmax", "tmin", "prcp", "month", "day", "daily_Cd", "daily_Ca", "Cd_cumsum", "Ca_cumsum", "lat", "long", "alt")
target_col <- "is_bloom"

In [29]:
library(lightgbm)

# num_boosting_rounds <- 2000L

    dtrain <- lgb.Dataset(
        data = data.matrix(train_val_set[, feature_names])
        , label = train_val_set[[target_col]]
        , params = list(
            min_data_in_bin = 1L
            , max_bin = max_bin
            )
    )

    dtest <- lgb.Dataset(
        data = data.matrix(test_set[, feature_names])
        , label = test_set[[target_col]]
        
    )
    

params <- list(
            objective = "binary"
            , metric = c("binary_logloss", "auc", "binary_error")
            , is_enable_sparse = TRUE
            , min_data_in_leaf = 2L
            , learning_rate = learning_rate
            , boosting = boosting
            , num_leaves = num_leaves
            , max_depth = max_depth
            
    )
valids <- list(test = dtest)
lgb_final <- lgb.train(params = params, data = dtrain, valids = valids, nrounds = 1000L, verbose = -1)

saveRDS.lgb.Booster(lgb_final, file = "../outputs/B_outputs/B21_lgb_final.rds")

[LightGBM] [Info] Number of positive: 261, number of negative: 379
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 238
[LightGBM] [Info] Number of data points in the train set: 640, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.407813 -> initscore=-0.373016
[LightGBM] [Info] Start training from score -0.373016
[1] "[1]:  test's binary_logloss:0.61532  test's auc:0.835429  test's binary_error:0.344444"
[1] "[2]:  test's binary_logloss:0.586297  test's auc:0.833516  test's binary_error:0.344444"
[1] "[3]:  test's binary_logloss:0.558999  test's auc:0.84199  test's binary_error:0.266667"
[1] "[4]:  test's binary_logloss:0.540583  test's auc:0.843084  test's binary_error:0.244444"
[1] "[5]:  test's binary_logloss:0.52904  test's auc:0.835429  test's binary_error:0.266667"


In [50]:
lgb_load <- readRDS.lgb.Booster('../outputs/B_outputs/B21_lgb_final.rds')

pred <- predict(lgb_load, as.matrix(test_set[, feature_names]))
test_set$predicted <- ifelse(pred > 0.5, 1, 0)

library(caret)
confusionMatrix(factor(test_set$predicted), factor(test_set$is_bloom))

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:purrr’:

    lift




Confusion Matrix and Statistics

          Reference
Prediction  0  1
         0 48 13
         1 11 18
                                          
               Accuracy : 0.7333          
                 95% CI : (0.6297, 0.8211)
    No Information Rate : 0.6556          
    P-Value [Acc > NIR] : 0.0726          
                                          
                  Kappa : 0.4003          
                                          
 Mcnemar's Test P-Value : 0.8383          
                                          
            Sensitivity : 0.8136          
            Specificity : 0.5806          
         Pos Pred Value : 0.7869          
         Neg Pred Value : 0.6207          
             Prevalence : 0.6556          
         Detection Rate : 0.5333          
   Detection Prevalence : 0.6778          
      Balanced Accuracy : 0.6971          
                                          
       'Positive' Class : 0               
                                    

In [57]:
# lgb.save(booster = lgb_final, filename = "../B_outputs/B21_lgb_final2.txt", num_iteration = NULL)
# aa <- lgb.load(filename = "../B_outputs/B21_lgb_final2.rds")
# pred2 <- predict(aa, as.matrix(test_set[, feature_names]))
# pred2

ERROR: Error in booster$save_model(filename = filename, num_iteration = num_iteration): Model file ../B_outputs/B21_lgb_final2.txt is not available for writes

