# <center>Python: Porównanie algorytmów</center>
---
## 1. Konstrukcja modeli i mierników parametrów
### 1.1 Biblioteki i edycja danych
#### 1.1.1 Biblioteki

In [1]:
import pandas as pd
from statistics import mean, stdev
from scipy.stats import wilcoxon
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices
import time
%load_ext line_profiler
%load_ext memory_profiler

#### 1.1.2 Wczytanie i edycja danych

In [2]:
data_set = pd.read_csv("BIG_Sim_1e6.csv")

print(data_set.head())

# Formuła
formula = "y ~ x0"

# Macierz
y, X = dmatrices(formula, data=data_set, return_type="dataframe")
groups = data_set["fac"]

           y        x0  fac
0  11.564012  0.319636    1
1  21.027267  0.018812    2
2  19.568180  0.831518    3
3  19.491935  0.152270    4
4  12.107259  0.778155    1
           y        x0  fac
0  11.564012  0.319636    1
1  21.027267  0.018812    2
2  19.568180  0.831518    3
3  19.491935  0.152270    4
4  12.107259  0.778155    1


### 1.2 Funkcje
#### 1.2.1 Funkcja tworząca modele

In [3]:
def LMM(model):
    if model == "formula":
        LMM_formula = smf.mixedlm(formula, data_set, groups=groups)
        LMMF_formula = LMM_formula.fit()
        return LMMF_formula

    elif model == "matrix":
        LMM_matrix = sm.MixedLM(y, X, groups=groups)
        LMMF_matrix = LMM_matrix.fit()
        return LMMF_matrix

#### 1.2.2 Funkcja sprawdzająca czas

In [4]:
def check_time(model, n):
    times = []

    for i in range(n):
        start_time = time.time()
        LMM(model)
        end_time = time.time()
        times.append(round(end_time - start_time, 4))
    return times

## 2. Symulacje
### 2.1 Podsumowanie modelu
#### 2.1.1 statsmodel "formula"

In [5]:
LMM("formula").summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,y
No. Observations:,1000000,Method:,REML
No. Groups:,4,Scale:,15.0989
Min. group size:,250000,Log-Likelihood:,-2776273.8789
Max. group size:,250000,Converged:,Yes
Mean group size:,250000.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,15.390,1.943,7.921,0.000,11.582,19.198
x0,-0.041,0.013,-3.022,0.003,-0.067,-0.014
Group Var,15.099,3.189,,,,


0,1,2,3
Model:,MixedLM,Dependent Variable:,y
No. Observations:,1000000,Method:,REML
No. Groups:,4,Scale:,15.0989
Min. group size:,250000,Log-Likelihood:,-2776273.8789
Max. group size:,250000,Converged:,Yes
Mean group size:,250000.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,15.390,1.943,7.921,0.000,11.582,19.198
x0,-0.041,0.013,-3.022,0.003,-0.067,-0.014
Group Var,15.099,3.189,,,,


#### 2.1.2 statsmodel "matrix"

In [6]:
LMM("matrix").summary()

0,1,2,3
Model:,MixedLM,Dependent Variable:,y
No. Observations:,1000000,Method:,REML
No. Groups:,4,Scale:,15.0989
Min. group size:,250000,Log-Likelihood:,-2776273.8789
Max. group size:,250000,Converged:,Yes
Mean group size:,250000.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,15.390,1.943,7.921,0.000,11.582,19.198
x0,-0.041,0.013,-3.022,0.003,-0.067,-0.014
Group Var,15.099,3.189,,,,


0,1,2,3
Model:,MixedLM,Dependent Variable:,y
No. Observations:,1000000,Method:,REML
No. Groups:,4,Scale:,15.0989
Min. group size:,250000,Log-Likelihood:,-2776273.8789
Max. group size:,250000,Converged:,Yes
Mean group size:,250000.0,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,15.390,1.943,7.921,0.000,11.582,19.198
x0,-0.041,0.013,-3.022,0.003,-0.067,-0.014
Group Var,15.099,3.189,,,,


### 2.2 Sprawdzenie czasu konstrukcji modelu
#### 2.2.1 formula - 100 razy

In [8]:
formula_times = check_time("formula", 100)

pd.DataFrame.to_clipboard(pd.DataFrame(formula_times))

print(round(mean(formula_times), 4),
      round(stdev(formula_times), 4))

1.8467 0.2006
1.8467 0.2006


In [9]:
%timeit LMM("formula")

1.78 s ± 92.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.78 s ± 92.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


1.86 s ± 93.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [10]:
%lprun -f LMM LMM("formula")

Timer unit: 1e-07 s

Total time: 2.93819 s
File: <ipython-input-3-55d36bd81f50>
Function: LMM at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def LMM(model):
     2         1        109.0    109.0      0.0      if model == "formula":
     3         1   21009131.0 21009131.0     71.5          LMM_formula = smf.mixedlm(formula, data_set, groups=groups)
     4         1    8372628.0 8372628.0     28.5          LMMF_formula = LMM_formula.fit()
     5         1         17.0     17.0      0.0          return LMMF_formula
     6                                           
     7                                               elif model == "matrix":
     8                                                   LMM_matrix = sm.MixedLM(y, X, groups=groups)
     9                                                   LMMF_matrix = LMM_matrix.fit()
    10                                                   return LMMF_matrix

#### 2.2.2 matrix - 100 razy

In [11]:
matrix_times = check_time("matrix", 100)

pd.DataFrame.to_clipboard(pd.DataFrame(matrix_times))

print(round(mean(matrix_times), 4),
      round(stdev(matrix_times), 4))

1.6672 0.2507
1.6672 0.2507


In [12]:
%timeit LMM("matrix")

1.62 s ± 73.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.62 s ± 73.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


1.63 s ± 74.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [13]:
%lprun -f LMM LMM("matrix")

Timer unit: 1e-07 s

Total time: 2.51752 s
File: <ipython-input-3-55d36bd81f50>
Function: LMM at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
     1                                           def LMM(model):
     2         1         60.0     60.0      0.0      if model == "formula":
     3                                                   LMM_formula = smf.mixedlm(formula, data_set, groups=groups)
     4                                                   LMMF_formula = LMM_formula.fit()
     5                                                   return LMMF_formula
     6                                           
     7         1        228.0    228.0      0.0      elif model == "matrix":
     8         1   17742568.0 17742568.0     70.5          LMM_matrix = sm.MixedLM(y, X, groups=groups)
     9         1    7432297.0 7432297.0     29.5          LMMF_matrix = LMM_matrix.fit()
    10         1         19.0     19.0      0.0          return LMMF_matrix

### 2.3 Sprawdzenie zużywanej pamięci RAM
#### > Z użyciem "IPython Magic Commands"
##### >> statsmodel "formula"

In [14]:
%memit LMM("formula")

peak memory: 344.95 MiB, increment: 131.51 MiB
peak memory: 344.95 MiB, increment: 131.51 MiB


Line #    Mem usage    Increment   Line Contents
================================================
    22    150.7 MiB    150.7 MiB   @profile
    23                             def LMM(model):
    24    150.7 MiB      0.0 MiB       if model == "formula":
    25    267.0 MiB    116.3 MiB           LMM_formula = smf.mixedlm(formula, data_set, groups=groups)
    26    293.9 MiB     26.9 MiB           LMMF_formula = LMM_formula.fit()
    27    293.9 MiB      0.0 MiB           return LMMF_formula
    28                             
    29                                 elif model == "matrix":
    30                                     LMM_matrix = sm.MixedLM(y, X, groups=groups)
    31                                     LMMF_matrix = LMM_matrix.fit()
    32                                     return LMMF_matrix

##### >> statsmodel "matrix"

In [15]:
%memit LMM("matrix")

peak memory: 323.17 MiB, increment: 108.24 MiB
peak memory: 323.17 MiB, increment: 108.24 MiB


Line #    Mem usage    Increment   Line Contents
================================================
    22    150.7 MiB    150.7 MiB   @profile
    23                             def LMM(model):
    24    150.7 MiB      0.0 MiB       if model == "formula":
    25                                     LMM_formula = smf.mixedlm(formula, data_set, groups=groups)
    26                                     LMMF_formula = LMM_formula.fit()
    27                                     return LMMF_formula
    28                             
    29    150.7 MiB      0.0 MiB       elif model == "matrix":
    30    229.0 MiB     78.2 MiB           LMM_matrix = sm.MixedLM(y, X, groups=groups)
    31    255.8 MiB     26.9 MiB           LMMF_matrix = LMM_matrix.fit()
    32    255.8 MiB      0.0 MiB           return LMMF_matrix

#### > Z użyciem biblioteki "memory_profiler" w programie PyCharm
##### >> statsmodel "formula
![RAM_formula](https://i.imgur.com/VmMhmfB.png)

##### > statsmodel "matrix"

![RAM_matrix](https://i.imgur.com/c8irBbo.png)