# Lược giảng lập trình hàm thống kê:
# Bài 2: Trích xuất hàm tính toán từ hành động

**BS. Lê Ngọc Khả Nhi**

In [1]:
from __future__ import annotations

from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices

In [2]:
# Load the semicolon-separated CSV straight from its raw GitHub URL.
df = pd.read_csv('https://raw.githubusercontent.com/kinokoberuji/R-Tutorials/master/aerodim.csv', sep = ';')

# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Poids,Taille,BMI,Age,Sexe,Hb,Diagnostic,GST,Volume,Surface,DmCO,Thickness,DLCO,FVC
0,53,165,19.467401,54,F,13.4,E,1.284048,7.008,19.647869,450.315872,0.142701,6.767,2.3
1,92,170,31.83391,75,H,17.1,E,1.291861,3.775,10.51971,72.271882,0.47606,19.146,3.04
2,69,186,19.944502,41,H,14.6,E,1.211214,7.7695,23.092705,372.142895,0.202952,29.047,5.72
3,60,160,23.4375,75,F,13.5,E,1.43979,4.3295,10.825326,89.206177,0.396893,14.943,2.57
4,72,172,24.33748,60,H,14.6,E,1.388885,5.561,14.414152,180.83303,0.260699,13.888,4.36


In [5]:
# Generate design matrix from formula
def generate_input(formula: str, data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Build the response and design matrices described by a formula.

    Parameters
    ----------
    formula : str
        Formula string handed to ``dmatrices`` (e.g. ``'y ~ a + b'``).
    data : pd.DataFrame
        Data passed to ``dmatrices`` together with the formula.

    Returns
    -------
    tuple
        The two objects returned by ``dmatrices`` when called with
        ``return_type='dataframe'``: the left-hand side (``y``) first,
        then the right-hand side (``x``).
    """
    matrices = dmatrices(formula, data, return_type='dataframe')
    response, design = matrices
    return response, design

# Generate bootstrap samples from endog, exog dataframes
def generate_bootstrap_samples(endog: pd.DataFrame, exog: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Draw one bootstrap resample of paired response / predictor rows.

    Row positions are sampled with replacement using the global NumPy
    random state (``np.random.randint``); the same positions are applied
    to both frames so each response row stays paired with its predictors.

    Parameters
    ----------
    endog : pd.DataFrame
        Response frame; its row count fixes the resample size.
    exog : pd.DataFrame
        Predictor frame, indexed positionally with the same draws.

    Returns
    -------
    tuple
        ``(endog_resampled, exog_resampled)``, each with as many rows as
        ``endog``.
    """
    n_rows = len(endog)
    # One draw per original row, each uniformly in [0, n_rows).
    picks = np.random.randint(0, n_rows, size=n_rows)
    resampled_endog = endog.iloc[picks]
    resampled_exog = exog.iloc[picks]
    return resampled_endog, resampled_exog

# Fit generalized linear model
def fit_glm(endog: pd.DataFrame, exog: pd.DataFrame, **kwargs) -> sm.GLM:
    """
    Construct a ``sm.GLM`` from the given matrices and fit it.

    Parameters
    ----------
    endog : pd.DataFrame
        Response passed as the first argument to ``sm.GLM``.
    exog : pd.DataFrame
        Predictors passed as the second argument to ``sm.GLM``.
    **kwargs
        Forwarded unchanged to the ``sm.GLM`` constructor.

    Returns
    -------
    The object returned by calling ``.fit()`` on the constructed model
    (i.e. the fit result, not the unfitted model the annotation names).
    """
    model = sm.GLM(endog, exog, **kwargs)
    fitted = model.fit()
    return fitted

# Initialize bootstrap result dictionary from parameter names
def init_output(para_names: List[str], length: int) -> Dict[str, np.array]:
    """
    Allocate the container that collects bootstrap estimates.

    Parameters
    ----------
    para_names : List[str]
        Names used as dictionary keys, in iteration order.
    length : int
        Size of the zero-filled array created for every name.

    Returns
    -------
    Dict[str, np.array]
        One independent ``np.zeros(length)`` array per name.
    """
    store = {}
    for name in para_names:
        # A fresh array per key so later writes never alias each other.
        store[name] = np.zeros(length)
    return store

# Updating bootstrap result dictionary from GLM results
def update_output(output: Dict[str, np.array], glm: sm.GLM, idx: int) -> Dict[str, np.array]:
    """
    Store one fit's parameter values at position ``idx`` of every array.

    Parameters
    ----------
    output : Dict[str, np.array]
        Mapping of parameter name to result array; modified in place.
    glm : sm.GLM
        Object whose ``params`` attribute is looked up by each key of
        ``output``.
    idx : int
        Position in each array that receives the value.

    Returns
    -------
    Dict[str, np.array]
        The same ``output`` object that was passed in.
    """
    for name, draws in output.items():
        draws[idx] = glm.params[name]
    return output

# Calculate bootstrap 95% confidence interval
def calc_ci(output: Dict[str, np.ndarray], confidence: float = 95.0) -> Dict[str, np.ndarray]:
    """
    Compute percentile confidence intervals from bootstrap draws.

    Parameters
    ----------
    output : Dict[str, np.ndarray]
        Mapping of parameter name to its array of bootstrap estimates.
    confidence : float, optional
        Two-sided confidence level in percent, strictly between 0 and 100.
        The default of ``95.0`` yields the 2.5th and 97.5th percentiles.

    Returns
    -------
    Dict[str, np.ndarray]
        For every key of ``output``, a length-2 array
        ``[lower, upper]`` as returned by ``np.percentile``.

    Raises
    ------
    ValueError
        If ``confidence`` is not strictly between 0 and 100.
    """
    if not 0.0 < confidence < 100.0:
        raise ValueError(
            f"confidence must be strictly between 0 and 100, got {confidence!r}"
        )

    # Split the excluded mass evenly between the two tails.
    lower = (100.0 - confidence) / 2.0
    bounds = [lower, 100.0 - lower]

    return {name: np.percentile(draws, bounds) for name, draws in output.items()}

# Main function

def bootstrap_glm(formula: str, data: pd.DataFrame, n_samples: int,  **kwargs) -> Tuple[Dict[str, np.ndarray], Dict[str, np.ndarray]]:
    """
    Bootstrap the coefficients of a GLM and summarise them as intervals.

    The formula is expanded into response / design matrices once; each
    iteration then resamples the rows, refits the model and records the
    fitted parameters.

    Parameters
    ----------
    formula : str
        Model formula passed to ``generate_input``.
    data : pd.DataFrame
        Data the formula is evaluated against.
    n_samples : int
        Number of bootstrap refits; must be at least 1.
    **kwargs
        Forwarded to ``fit_glm`` (and from there to the GLM constructor),
        e.g. ``family=...``.

    Returns
    -------
    tuple
        ``(output, ci)`` where ``output`` maps each design-matrix column
        name to its array of ``n_samples`` bootstrap estimates and ``ci``
        maps the same names to the interval computed by ``calc_ci``.

    Raises
    ------
    ValueError
        If ``n_samples`` is smaller than 1.
    """
    # Without at least one refit there is nothing to take percentiles of.
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples!r}")

    y, x = generate_input(formula, data)
    # One result array per design-matrix column (includes the intercept
    # column when the formula produces one).
    output = init_output(x.columns, n_samples)

    for i in range(n_samples):
        endog, exog = generate_bootstrap_samples(y, x)
        fitted = fit_glm(endog, exog, **kwargs)
        output = update_output(output, fitted, i)

    ci = calc_ci(output)

    return output, ci

In [6]:
# Bootstrap a Gamma-family GLM with a log link for DmCO on Surface and
# Thickness, using 1000 resamples; `res` holds the raw draws, `ci` the intervals.
# NOTE(review): no random seed is set in the visible cells, so the intervals
# will differ slightly between runs.
# NOTE(review): newer statsmodels releases appear to prefer the capitalised
# `links.Log()` spelling over `links.log()` — confirm against the installed version.
res, ci = bootstrap_glm(formula = 'DmCO ~ Surface + Thickness', 
                    data = df, 
                    n_samples = 1000,
                    family=sm.families.Gamma(sm.families.links.log()))

In [8]:
# Display the per-parameter [lower, upper] bootstrap interval bounds.
ci

{'Intercept': array([5.05670702, 5.58003033]),
 'Surface': array([0.02262514, 0.03531417]),
 'Thickness': array([-2.25821635, -1.46797055])}