### **Code : Microorganisms - Hierarchical Bayesian**
##### Writer : Donghyeon Kim
##### Update : 2025.01.02.

---

#### **0. Package Reference**
```
conda create -c conda-forge -n pymc_env "pymc>=5"
conda activate pymc_env

pip install statsmodels

pip install -U scikit-learn
```

---

#### **1. Prior Settings**

In [1]:
import os

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.formula.api import ols # Regression Model
import pymc as pm # Bayesian Analysis
import arviz as az
from sklearn.cluster import KMeans # K-means Clustering

##### Data : 미생물 본설문 OLS

In [2]:
DATA = pd.read_csv('C:/Users/mazy4/Dropbox/6. C&S Lab/7. 2024 Project/4. 야생생물 경제가치_생물자원관/2. 분석/2. Data/2. 본설문/Microorganisms_Main_Profile(56).csv')
DATA.head(10)

Unnamed: 0,INDEX,CARD,CLA1,CLA2,SCA1,SCA2,SCA3,PUR1,PUR2,PUR3,...,CUL2,CUL3,USA1,USA2,USA3,ADD1,ADD2,ADD3,PRICE,Y
0,24,1,1,0,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,150000,6
1,24,2,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,200000,9
2,24,3,1,0,0,0,1,0,1,0,...,0,1,1,0,0,0,0,1,100000,1
3,24,4,1,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,1,200000,5
4,24,5,0,1,0,1,0,1,0,0,...,0,1,1,0,0,0,1,0,150000,1
5,24,6,1,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,200000,1
6,24,7,1,0,1,0,0,0,1,0,...,0,1,0,0,1,1,0,0,150000,0
7,24,8,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,150000,4
8,24,9,1,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,150000,6
9,24,10,0,1,0,0,1,0,0,1,...,0,1,0,0,1,1,0,0,200000,2


In [3]:
# PRICE category = (1, 2, 3)
DATA.loc[(DATA['PRICE'] == 100000),'PRICE'] = 1
DATA.loc[(DATA['PRICE'] == 150000),'PRICE'] = 2
DATA.loc[(DATA['PRICE'] == 200000),'PRICE'] = 3
DATA.head(20)

Unnamed: 0,INDEX,CARD,CLA1,CLA2,SCA1,SCA2,SCA3,PUR1,PUR2,PUR3,...,CUL2,CUL3,USA1,USA2,USA3,ADD1,ADD2,ADD3,PRICE,Y
0,24,1,1,0,0,1,0,0,1,0,...,1,0,0,1,0,1,0,0,2,6
1,24,2,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0,3,9
2,24,3,1,0,0,0,1,0,1,0,...,0,1,1,0,0,0,0,1,1,1
3,24,4,1,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,1,3,5
4,24,5,0,1,0,1,0,1,0,0,...,0,1,1,0,0,0,1,0,2,1
5,24,6,1,0,1,0,0,1,0,0,...,0,1,0,1,0,0,1,0,3,1
6,24,7,1,0,1,0,0,0,1,0,...,0,1,0,0,1,1,0,0,2,0
7,24,8,0,1,1,0,0,0,0,1,...,0,0,0,1,0,0,0,1,2,4
8,24,9,1,0,0,0,1,1,0,0,...,0,0,0,0,1,0,0,1,2,6
9,24,10,0,1,0,0,1,0,0,1,...,0,1,0,0,1,1,0,0,3,2


---

#### **2. 전체 응답자 OLS**

In [4]:
def get_ols_params_table(data_df):
    ols_model = ols("Y ~ CLA2 + SCA2 + SCA3 + PUR2 + PUR3 + CUL2 + CUL3 + USA2 + USA3 + ADD2 + ADD3 + PRICE", data=data_df)
    ols_result = ols_model.fit()
    ols_params = pd.DataFrame(ols_result.params, columns=['Coefficient'])
    ols_params['Standard error'] = ols_result.bse
    ols_params['P-value'] = ols_result.pvalues.round(6)
    
    ols_params_2 = ols_params[['Coefficient', 'Standard error']].copy()
    add_params = [
        -ols_params.loc['CLA2',['Coefficient', 'Standard error']],
        -ols_params.loc['SCA2',['Coefficient', 'Standard error']] -ols_params.loc['SCA3',['Coefficient', 'Standard error']],
        -ols_params.loc['PUR2',['Coefficient', 'Standard error']] -ols_params.loc['PUR3',['Coefficient', 'Standard error']],
        -ols_params.loc['CUL2',['Coefficient', 'Standard error']] -ols_params.loc['CUL3',['Coefficient', 'Standard error']],
        -ols_params.loc['USA2',['Coefficient', 'Standard error']] -ols_params.loc['USA3',['Coefficient', 'Standard error']],
        -ols_params.loc['ADD2',['Coefficient', 'Standard error']] -ols_params.loc['ADD3',['Coefficient', 'Standard error']],
        2*ols_params.loc['PRICE',['Coefficient', 'Standard error']],
        3*ols_params.loc['PRICE',['Coefficient', 'Standard error']]
        ]
    add_params = pd.DataFrame(add_params, columns=['Coefficient', 'Standard error'],
                              index=['CLA1','SCA1','PUR1','CUL1','USA1','ADD1','PRICE_15','PRICE_20'])
    ols_params_2 = pd.concat([ols_params_2, add_params], axis=0)
    ols_params_2.rename(index={'PRICE':'PRICE_10'}, inplace=True)
    ols_params_2 = ols_params_2.loc[['CLA1','CLA2',
                                     'SCA1','SCA2','SCA3',
                                     'PUR1','PUR2','PUR3',
                                     'CUL1','CUL2','CUL3',
                                     'USA1','USA2','USA3',
                                     'ADD1','ADD2','ADD3',
                                     'PRICE_10','PRICE_15','PRICE_20',
                                     'Intercept'],:]
    
    return ols_params, ols_params_2

In [5]:
total_ols, total_ols_2 = get_ols_params_table(DATA)

# Directory
folder_root = 'C:/Users/mazy4/Dropbox/6. C&S Lab/7. 2024 Project/4. 야생생물 경제가치_생물자원관/2. 분석/5. Result/2. 본설문_Result/1. 미생물/HB'
if not os.path.isdir(folder_root):
    os.makedirs(folder_root)

# Data Frame -> csv file
total_ols_file_name = folder_root + '/' + '1_Microorganisms_TOTAL_OLS.csv'
total_ols.to_csv(total_ols_file_name, mode='w')

# Result Check
total_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,3.905754,0.328922,0.0
CLA2,-0.014881,0.166794,0.928927
SCA2,0.235119,0.192597,0.222457
SCA3,0.767857,0.192597,7.2e-05
PUR2,0.169643,0.192597,0.378629
PUR3,-0.33631,0.192597,0.081088
CUL2,-0.642857,0.192597,0.000876
CUL3,-1.08631,0.192597,0.0
USA2,1.008929,0.192597,0.0
USA3,1.199405,0.192597,0.0


---

#### **3. 개별 응답자의 부분 가치**

In [6]:
X = DATA[['CLA2',
          'SCA2','SCA3',
          'PUR2','PUR3',
          'CUL2','CUL3',
          'USA2','USA3',
          'ADD2','ADD3',
          'PRICE']].to_numpy().astype(np.float64)
y = DATA['Y'].to_numpy().astype(np.float64)

print(X.shape)
print(y.shape)

(1008, 12)
(1008,)


In [7]:
K = len(DATA['INDEX'].unique()) # 그룹 개수 : 개별 응답자 수
G = np.array([i for i in range(56) for _ in range(18)])

print(K)
print(G)

56
[ 0  0  0 ... 55 55 55]


In [8]:
with pm.Model() as HLM:
    
    # Prior probability (proposal distribution) - level2
    mu_a = pm.Normal('mu_1', mu=0, sigma=100)
    sigma_a = pm.HalfCauchy('sigma_1', 5)
    mu_b = pm.Normal('mu_2', mu=0, sigma=100)
    sigma_b = pm.HalfCauchy('sigma_2', 5)

    # Prior probability (proposal distribution)
    a = [pm.Normal(f'theta_Q{i}', mu=mu_a, sigma=sigma_a, shape=K) for i in range(1, X.shape[1]+1)]
    b = pm.Normal('intercept', mu=mu_b, sigma=sigma_b, shape=K)
    eps = pm.HalfCauchy('eps', 5)

    # Model
    y_est = b[G]
    for i in range(len(a)):
        y_est = y_est + a[i][G]*X[:,i]
    likelihood = pm.Normal('y', mu=y_est, sigma=eps, observed=y)

In [15]:
with HLM:
    HLM_trace = pm.sample(10000, progressbar=True, chains=10, cores=12, random_seed=123, target_accept=0.999)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (10 chains in 10 jobs)
NUTS: [mu_1, sigma_1, mu_2, sigma_2, theta_Q1, theta_Q2, theta_Q3, theta_Q4, theta_Q5, theta_Q6, theta_Q7, theta_Q8, theta_Q9, theta_Q10, theta_Q11, theta_Q12, intercept, eps]


Output()

Sampling 10 chains for 1_000 tune and 10_000 draw iterations (10_000 + 100_000 draws total) took 793 seconds.


In [16]:
person_index = DATA['INDEX'].unique()
person_index.sort()

personal_params = az.summary(HLM_trace).iloc[1:,[0]]
personal_params['params'] = personal_params.index.str.replace('\[[0-9]{,3}\]', '', regex=True)

# mu, sigma 제거
personal_params = personal_params[~personal_params['params'].isin(['mu_1', 'mu_2', 'sigma_1', 'sigma_2'])].copy()
personal_params['index'] = [
    i for param in personal_params['params'].unique() 
    if param not in ['mu_1', 'mu_2', 'sigma_1', 'sigma_2']
    for i in person_index
]

personal_params = personal_params.pivot(columns='params', index='index', values='mean').reset_index()
personal_params.columns = ['INDEX',
                           'Intercept',
                           'CLA2',
                           'SCA2','SCA3',
                           'PUR2','PUR3',
                           'CUL2','CUL3',
                           'USA2','USA3',
                           'ADD2','ADD3',
                           'PRICE']

# Output: csv file 저장
personal_params_file_name = folder_root + '/' + '2_Microorganisms_HBM_Personal_params.csv'
personal_params.to_csv(personal_params_file_name, mode='w', index=False)

# Result 확인
personal_params

Unnamed: 0,INDEX,Intercept,CLA2,SCA2,SCA3,PUR2,PUR3,CUL2,CUL3,USA2,USA3,ADD2,ADD3,PRICE
0,24,3.762,0.032,0.093,0.043,0.071,0.118,0.269,0.191,-0.058,0.32,-0.827,0.319,0.07
1,27,3.27,0.351,0.531,-0.37,0.05,-0.017,-0.267,0.658,-0.241,0.244,0.043,1.109,-0.239
2,31,4.425,0.11,0.192,0.192,0.057,0.131,0.183,0.296,0.147,0.14,0.241,0.084,0.234
3,33,3.824,0.15,0.068,-0.181,-0.06,0.155,0.056,-0.098,0.304,0.204,0.005,0.082,0.382
4,34,3.022,0.151,0.149,-0.002,-0.125,0.028,-0.072,0.058,0.156,0.05,0.101,0.215,0.064
5,36,3.447,0.126,0.177,-0.021,-0.269,0.13,0.028,0.022,0.07,-0.034,0.064,0.079,0.079
6,38,3.83,0.11,0.06,0.062,0.163,0.125,0.125,-0.14,0.262,0.19,0.189,0.111,0.01
7,40,4.271,0.066,0.214,0.313,0.092,0.05,0.349,0.134,0.134,0.192,0.142,0.013,0.514
8,51,4.276,-0.01,0.149,0.4,-0.058,0.158,0.457,0.607,-0.44,0.251,-0.149,0.021,0.72
9,52,3.625,-0.073,0.294,0.243,-0.046,-0.009,0.543,0.148,1.348,-0.114,0.135,0.194,-0.107


---

#### **4. Clustering**

In [17]:
kmeans = KMeans(n_clusters=2, random_state=456)
clusters = kmeans.fit(personal_params.iloc[:,1:])

personal_params['Cluster'] = clusters.labels_
DATA_cluster = pd.merge(left=DATA,
                        right=personal_params[['INDEX','Cluster']],
                        how='left',
                        on='INDEX')

# Output : csv file
data_cluster_file_name = folder_root + '/' + '3_Microorganisms_OLS_Data_Cluster.csv'
DATA_cluster.to_csv(data_cluster_file_name, mode='w', index=False)

# Result Check
DATA_cluster

Unnamed: 0,INDEX,CARD,CLA1,CLA2,SCA1,SCA2,SCA3,PUR1,PUR2,PUR3,...,CUL3,USA1,USA2,USA3,ADD1,ADD2,ADD3,PRICE,Y,Cluster
0,24,1,1,0,0,1,0,0,1,0,...,0,0,1,0,1,0,0,2,6,0
1,24,2,1,0,0,0,1,0,1,0,...,0,0,1,0,0,1,0,3,9,0
2,24,3,1,0,0,0,1,0,1,0,...,1,1,0,0,0,0,1,1,1,0
3,24,4,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,1,3,5,0
4,24,5,0,1,0,1,0,1,0,0,...,1,1,0,0,0,1,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1003,196,14,1,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,2,4,0
1004,196,15,0,1,0,1,0,0,1,0,...,0,0,0,1,0,1,0,1,5,0
1005,196,16,1,0,1,0,0,1,0,0,...,0,1,0,0,1,0,0,1,4,0
1006,196,17,0,1,1,0,0,0,1,0,...,0,1,0,0,0,0,1,3,4,0


##### 1st Group

In [18]:
idx_1 = (DATA_cluster['Cluster'] == 0)
group1_ols, group1_ols_2 = get_ols_params_table(DATA_cluster.loc[idx_1,:])
group1_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,3.036458,0.39748,0.0
CLA2,-0.03125,0.201559,0.876844
SCA2,0.276042,0.23274,0.236103
SCA3,0.833333,0.23274,0.000373
PUR2,0.078125,0.23274,0.737241
PUR3,-0.46875,0.23274,0.04448
CUL2,-0.713542,0.23274,0.002275
CUL3,-1.005208,0.23274,1.9e-05
USA2,0.895833,0.23274,0.000132
USA3,1.182292,0.23274,1e-06


##### 2nd Group

In [19]:
idx_2 = (DATA_cluster['Cluster'] == 1)
group2_ols, group2_ols_2 = get_ols_params_table(DATA_cluster.loc[idx_2,:])
group2_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,5.064815,0.423956,0.0
CLA2,0.006944,0.214985,0.974247
SCA2,0.180556,0.248243,0.467428
SCA3,0.680556,0.248243,0.006378
PUR2,0.291667,0.248243,0.240692
PUR3,-0.159722,0.248243,0.52031
CUL2,-0.548611,0.248243,0.027647
CUL3,-1.194444,0.248243,2e-06
USA2,1.159722,0.248243,4e-06
USA3,1.222222,0.248243,1e-06


---

#### **5. Result Clean-Up**

In [20]:
summary_coef = pd.concat([total_ols_2, group1_ols_2, group2_ols_2], axis=1)
summary_coef.columns = ['Full sample', 'Full sample SE', 'Cluster 1', 'Cluster 1 SE', 'Cluster 2', 'Cluster 2 SE']
add_lines = pd.DataFrame([[len(DATA['INDEX'].unique()),
                           len(DATA_cluster.loc[idx_1,'INDEX'].unique()),
                           len(DATA_cluster.loc[idx_2,'INDEX'].unique())]],
                         columns=['Full sample', 'Cluster 1', 'Cluster 2'], index=['Number of cases'])
summary_coef = pd.concat([summary_coef, add_lines], axis=0)

# Output : csv file
summary_coef_file_name = folder_root + '/' + '4_Microorganisms_Summary_Coef.csv'
summary_coef.to_csv(summary_coef_file_name, mode='w')

# Result Check
summary_coef

Unnamed: 0,Full sample,Full sample SE,Cluster 1,Cluster 1 SE,Cluster 2,Cluster 2 SE
CLA1,0.014881,-0.166794,0.03125,-0.201559,-0.006944,-0.214985
CLA2,-0.014881,0.166794,-0.03125,0.201559,0.006944,0.214985
SCA1,-1.002976,-0.385194,-1.109375,-0.465481,-0.861111,-0.496486
SCA2,0.235119,0.192597,0.276042,0.23274,0.180556,0.248243
SCA3,0.767857,0.192597,0.833333,0.23274,0.680556,0.248243
PUR1,0.166667,-0.385194,0.390625,-0.465481,-0.131944,-0.496486
PUR2,0.169643,0.192597,0.078125,0.23274,0.291667,0.248243
PUR3,-0.33631,0.192597,-0.46875,0.23274,-0.159722,0.248243
CUL1,1.729167,-0.385194,1.71875,-0.465481,1.743056,-0.496486
CUL2,-0.642857,0.192597,-0.713542,0.23274,-0.548611,0.248243
