# R: Porównanie bibliotek
# Zbiór danych z 1.000.000 obserwacji
---
## 1. Konstrukcja modeli i mierników parametrów
### 1.1 Biblioteki i edycja danych
#### 1.1.1 Biblioteki

In [1]:
library(dplyr)
library(data.table)
library(lme4)
library(nlme)
library(mgcv)
library(profmem)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union


Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

Loading required package: Matrix

Attaching package: 'nlme'

The following object is masked from 'package:lme4':

    lmList

The following object is masked from 'package:dplyr':

    collapse

This is mgcv 1.8-28. For overview type 'help("mgcv-package")'.


#### 1.1.2 Wczytanie i edycja danych

In [2]:
data_set <- fread("BIG_Sim_1e6.csv")

head(data_set)

y,x0,fac
11.56401,0.31963601,1
21.02727,0.01881216,2
19.56818,0.83151838,3
19.49194,0.15227013,4
12.10726,0.77815483,1
16.6358,0.77547383,2


### 1.2 Funkcje
#### 1.2.1 Funkcja tworząca model

In [3]:
LMM <- function(model) {
  if(model == "lme4") {
    LMM_lme4 <- lmer(data = data_set, y ~ x0 + (1|fac))
    return(LMM_lme4)
  }
  else if(model == "nlme") {
    LMM_nlme <- lme(data = data_set, y ~ x0, random = ~ 1|fac)
    return(LMM_nlme)
  }
  else if(model == "bam") {
    LMM_bam <- bam(data = data_set, y ~ x0 + s(fac, bs = "re"))
    return(LMM_bam)
  }
}

#### 1.2.2 Funkcja sprawdzająca czas

In [4]:
check_time <- function(model, n) {
  times <- c()

  for(i in 1:n) {
    start_time <- Sys.time()
    LMM(model)
    end_time <- Sys.time()
    times <- c(times, round(end_time - start_time, 4))
    }
    return(times)
  }

#### 1.2.3 Funkcja mierząca ilość zużytego RAMu

In [5]:
check_RAM <- function(model) {
  prof_mem <- profmem({
    LMM(model)
      })
  sum_in_MB <- round(sum(prof_mem$bytes[!is.na(prof_mem$bytes)]) * 1e-6, 4)
  return(sum_in_MB)
}

## 2. Symulacje
### 2.1 Podsumowanie modelu
#### 2.1.1 "lme4"

In [6]:
 summary(LMM("lme4"))

Linear mixed model fit by REML ['lmerMod']
Formula: y ~ x0 + (1 | fac)
   Data: data_set

REML criterion at convergence: 5552548

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-3.6007 -0.7211 -0.0664  0.6711  4.3146 

Random effects:
 Groups   Name        Variance Std.Dev.
 fac      (Intercept) 15.03    3.877   
 Residual             15.10    3.886   
Number of obs: 1000000, groups:  fac, 4

Fixed effects:
            Estimate Std. Error t value
(Intercept) 15.39033    1.93838   7.940
x0          -0.04068    0.01346  -3.022

Correlation of Fixed Effects:
   (Intr)
x0 -0.003

#### 2.1.2 "nlme"

In [7]:
summary(LMM("nlme"))

Linear mixed-effects model fit by REML
 Data: data_set 
      AIC     BIC   logLik
  5552556 5552603 -2776274

Random effects:
 Formula: ~1 | fac
        (Intercept) Residual
StdDev:    3.876102 3.885725

Fixed effects: y ~ x0 
                Value Std.Error     DF   t-value p-value
(Intercept) 15.390336 1.9380667 999995  7.941077  0.0000
x0          -0.040677 0.0134595 999995 -3.022182  0.0025
 Correlation: 
   (Intr)
x0 -0.003

Standardized Within-Group Residuals:
        Min          Q1         Med          Q3         Max 
-3.60069458 -0.72111192 -0.06643291  0.67105573  4.31456525 

Number of Observations: 1000000
Number of Groups: 4 

#### 2.1.3 "bam"

In [8]:
summary(LMM("bam"))


Family: gaussian 
Link function: identity 

Formula:
y ~ x0 + s(fac, bs = "re")

Parametric coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.88429    0.01166  675.93  < 2e-16 ***
x0          -0.04064    0.01346   -3.02  0.00253 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Approximate significance of smooth terms:
       edf Ref.df      F p-value    
s(fac)   1      1 746288  <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

R-sq.(adj) =  0.427   Deviance explained = 42.7%
fREML = 2.7763e+06  Scale est. = 15.099    n = 1000000

### 2.2 Sprawdzenie czasu konstrukcji modelu
#### 2.2.1 100 razy z użyciem funkcji "check_time(model, n)"

In [9]:
LMM_lme4_times <- check_time("lme4", 100)
LMM_nlme_times <- check_time("nlme", 100)
LMM_bam_times <- check_time("bam", 100)

data.frame(LMM_lme4_times, LMM_nlme_times, LMM_bam_times) %>%
    summarise(lme4_mean = round(mean(LMM_lme4_times), 4),
              lme4_sd = round(sd(LMM_lme4_times), 4),
              nlme_mean = round(mean(LMM_nlme_times), 4),
              nlme_sd = round(sd(LMM_nlme_times), 4),
              bam_mean = round(mean(LMM_bam_times), 4),
              bam_sd = round(sd(LMM_bam_times), 4))

lme4_mean,lme4_sd,nlme_mean,nlme_sd,bam_mean,bam_sd
9.7396,0.5314,11.4261,0.621,3.9247,5.0389


### 2.3 Sprawdzenie zużytej pamięci RAM
#### 2.3.1 Z użyciem funkcji "check_RAM(model)"

In [10]:
lme4_RAM <- check_RAM("lme4")
nlme_RAM <- check_RAM("nlme")
bam_RAM <- check_RAM("bam")

data.frame(lme4_RAM, nlme_RAM, bam_RAM)

lme4_RAM,nlme_RAM,bam_RAM
1356.555,1425.31,1006.423


#### 2.3.2 Z użyciem wbudowanego w R narzędzia profilowania

https://rpubs.com/kamilpytlak/LMM_1e6