### **Code : 천연물 본설문 - Hierarchical Bayesian**
##### Writer : Donghyeon Kim
##### Update : 2023.10.20.

#### **0. Package Reference**
```
conda create -c conda-forge -n pymc_env "pymc>=5"
conda activate pymc_env

pip install statsmodels

pip install -U scikit-learn
```

#### **1. Prior Settings**

In [1]:
import os

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsmodels.formula.api import ols # Regression Model
import pymc as pm # Bayesian Analysis
import arviz as az
from sklearn.cluster import KMeans # K-means Clustering

##### Data : 천연물 본설문 OLS

In [2]:
DATA = pd.read_csv('C:/Users/cgpar/Downloads/천연물 HB/천연물_본설문_OLS_Data.csv')
DATA.head(10)

Unnamed: 0,INDEX,CARD,OAU1,OAU2,OAU3,MTY1,MTY2,EIN1,EIN2,EIN3,...,SIN2,SIN3,EFI1,EFI2,EFI3,AFI1,AFI2,AFI3,PRICE,Y
0,1,1,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,2000000,5
1,1,2,0,1,0,0,1,0,0,1,...,0,1,1,0,0,0,1,0,30000,5
2,1,3,0,1,0,0,1,0,1,0,...,1,0,0,0,1,1,0,0,300000,4
3,1,4,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,30000,7
4,1,5,0,0,1,1,0,0,0,1,...,1,0,1,0,0,0,1,0,300000,4
5,1,6,0,0,1,1,0,1,0,0,...,0,0,0,0,1,0,1,0,300000,7
6,1,7,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,30000,6
7,1,8,0,1,0,1,0,1,0,0,...,0,1,0,1,0,0,0,1,300000,5
8,1,9,0,0,1,0,1,0,1,0,...,0,0,1,0,0,0,0,1,2000000,7
9,1,10,0,1,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,2000000,7


In [3]:
# PRICE category = (1, 2, 3)
DATA.loc[(DATA['PRICE'] == 30000),'PRICE'] = 1
DATA.loc[(DATA['PRICE'] == 300000),'PRICE'] = 2
DATA.loc[(DATA['PRICE'] == 2000000),'PRICE'] = 3
DATA.head(20)

Unnamed: 0,INDEX,CARD,OAU1,OAU2,OAU3,MTY1,MTY2,EIN1,EIN2,EIN3,...,SIN2,SIN3,EFI1,EFI2,EFI3,AFI1,AFI2,AFI3,PRICE,Y
0,1,1,1,0,0,1,0,0,1,0,...,1,0,0,1,0,0,1,0,3,5
1,1,2,0,1,0,0,1,0,0,1,...,0,1,1,0,0,0,1,0,1,5
2,1,3,0,1,0,0,1,0,1,0,...,1,0,0,0,1,1,0,0,2,4
3,1,4,1,0,0,1,0,1,0,0,...,0,0,1,0,0,1,0,0,1,7
4,1,5,0,0,1,1,0,0,0,1,...,1,0,1,0,0,0,1,0,2,4
5,1,6,0,0,1,1,0,1,0,0,...,0,0,0,0,1,0,1,0,2,7
6,1,7,0,1,0,1,0,0,1,0,...,0,0,0,1,0,0,1,0,1,6
7,1,8,0,1,0,1,0,1,0,0,...,0,1,0,1,0,0,0,1,2,5
8,1,9,0,0,1,0,1,0,1,0,...,0,0,1,0,0,0,0,1,3,7
9,1,10,0,1,0,1,0,0,0,1,...,0,0,0,0,1,0,0,1,3,7


#### **2. 전체 응답자 OLS**

In [4]:
def get_ols_params_table(data_df):
    ols_model = ols("Y ~ OAU2 + OAU3 + MTY2 + EIN1 + EIN3 + SIN2 + SIN3 + EFI1 + EFI3 + AFI1 + AFI3 + PRICE", data=data_df)
    ols_result = ols_model.fit()
    ols_params = pd.DataFrame(ols_result.params, columns=['Coefficient'])
    ols_params['Standard error'] = ols_result.bse
    ols_params['P-value'] = ols_result.pvalues.round(6)
    
    ols_params_2 = ols_params[['Coefficient']].copy()
    add_params = [
        -ols_params.loc['OAU2','Coefficient'] -ols_params.loc['OAU3','Coefficient'],
        -ols_params.loc['MTY2','Coefficient'],
        -ols_params.loc['EIN1','Coefficient'] -ols_params.loc['EIN3','Coefficient'],
        -ols_params.loc['SIN2','Coefficient'] -ols_params.loc['SIN3','Coefficient'],
        -ols_params.loc['EFI1','Coefficient'] -ols_params.loc['EFI3','Coefficient'],
        -ols_params.loc['AFI1','Coefficient'] -ols_params.loc['AFI3','Coefficient'],
        2*ols_params.loc['PRICE','Coefficient'],
        3*ols_params.loc['PRICE','Coefficient']
        ]
    add_params = pd.DataFrame(add_params, columns=['Coefficient'],
                              index=['OAU1','MTY1','EIN2','SIN1','EFI2','AFI2','PRICE_30','PRICE_200'])
    ols_params_2 = pd.concat([ols_params_2, add_params], axis=0)
    ols_params_2.rename(index={'PRICE':'PRICE_03'}, inplace=True)
    ols_params_2 = ols_params_2.loc[['OAU1','OAU2','OAU3',
                                     'MTY1','MTY2',
                                     'EIN1','EIN2','EIN3',
                                     'SIN1','SIN2','SIN3',
                                     'EFI1','EFI2','EFI3',
                                     'AFI1','AFI2','AFI3',
                                     'PRICE_03','PRICE_30','PRICE_200',
                                     'Intercept'],:]
    
    return ols_params, ols_params_2

In [5]:
total_ols, total_ols_2 = get_ols_params_table(DATA)

# Directory
folder_root = 'C:/Users/cgpar/Downloads/천연물 HB/Result'
if not os.path.isdir(folder_root):
    os.makedirs(folder_root)

# Data Frame -> csv file
total_ols_file_name = folder_root + '/' + '천연물_TOTAL_OLS.csv'
total_ols.to_csv(total_ols_file_name, mode='w')

# Result Check
total_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,4.97321,0.163555,0.0
OAU2,0.88274,0.095768,0.0
OAU3,1.001318,0.095768,0.0
MTY2,1.605402,0.082937,0.0
EIN1,-0.305007,0.095768,0.001458
EIN3,0.141634,0.095768,0.139229
SIN2,0.671937,0.095768,0.0
SIN3,0.613307,0.095768,0.0
EFI1,-0.237813,0.095768,0.013056
EFI3,0.106061,0.095768,0.268147


#### **3. 개별 응답자의 부분 가치**

In [6]:
X = DATA[['OAU2','OAU3',
          'MTY2',
          'EIN1','EIN3',
          'SIN2','SIN3',
          'EFI1','EFI3',
          'AFI2','AFI3',
          'PRICE']].to_numpy().astype(np.float64)
y = DATA['Y'].to_numpy().astype(np.float64)

print(X.shape)
print(y.shape)

(4554, 12)
(4554,)


In [7]:
K = len(DATA['INDEX'].unique()) # 그룹 개수 : 개별 응답자 수
G = np.array([i for i in range(253) for _ in range(18)])

print(K)
print(G)

253
[  0   0   0 ... 252 252 252]


In [8]:
with pm.Model() as HLM:
    
    # Prior probability (proposal distribution) - level2
    mu_a = pm.Normal('mu_1', mu=0, sigma=100)
    sigma_a = pm.HalfCauchy('sigma_1', 5)
    mu_b = pm.Normal('mu_2', mu=0, sigma=100)
    sigma_b = pm.HalfCauchy('sigma_2', 5)

    # Prior probability (proposal distribution)
    a = [pm.Normal(f'theta_Q{i}', mu=mu_a, sigma=sigma_a, shape=K) for i in range(1, X.shape[1]+1)]
    b = pm.Normal('intercept', mu=mu_b, sigma=sigma_b, shape=K)
    eps = pm.HalfCauchy('eps', 5)

    # Model
    y_est = b[G]
    for i in range(len(a)):
        y_est = y_est + a[i][G]*X[:,i]
    likelihood = pm.Normal('y', mu=y_est, sigma=eps, observed=y)

In [9]:
with HLM:
    HLM_trace = pm.sample(1000, progressbar=True, chains=2, cores=12, random_seed=123)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 12 jobs)
NUTS: [mu_1, sigma_1, mu_2, sigma_2, theta_Q1, theta_Q2, theta_Q3, theta_Q4, theta_Q5, theta_Q6, theta_Q7, theta_Q8, theta_Q9, theta_Q10, theta_Q11, theta_Q12, intercept, eps]


Sampling 2 chains for 1_000 tune and 1_000 draw iterations (2_000 + 2_000 draws total) took 91 seconds.
We recommend running at least 4 chains for robust computation of convergence diagnostics


In [10]:
person_index = DATA['INDEX'].unique()
person_index.sort()

personal_params = az.summary(HLM_trace).iloc[2:-3,[0]]
personal_params['params'] = personal_params.index.str.replace('\[[0-9]{,3}\]', '', regex=True)
personal_params['index'] = [i for _ in personal_params['params'].unique() for i in person_index]

personal_params = personal_params.pivot(columns='params', index='index', values='mean').reset_index()
personal_params.columns = ['INDEX',
                           'Intercept',
                           'OAU2','OAU3',
                           'MTY2',
                           'EIN1','EIN3',
                           'SIN2','SIN3',
                           'EFI1','EFI3',
                           'AFI2','AFI3',
                           'PRICE']

# Output : csv file
personal_params_file_name = folder_root + '/' + '천연물_HBM_Personal_params.csv'
personal_params.to_csv(personal_params_file_name, mode='w', index=False)

# Result Check
personal_params

Unnamed: 0,INDEX,Intercept,OAU2,OAU3,MTY2,EIN1,EIN3,SIN2,SIN3,EFI1,EFI3,AFI2,AFI3,PRICE
0,1,6.111,-0.138,-0.357,0.573,-0.144,0.358,-0.086,0.218,-0.033,-0.497,-0.381,0.279,0.165
1,2,9.464,0.225,0.153,-0.772,-3.041,-1.071,-0.675,-0.961,0.575,-0.657,0.516,1.245,0.783
2,3,3.706,1.404,-0.209,0.291,-0.759,1.085,0.791,-0.206,0.744,0.658,1.599,-0.088,0.607
3,5,7.470,0.968,0.948,-0.123,-2.381,-1.020,0.009,0.613,0.249,-1.433,0.075,0.793,-1.049
4,6,4.991,0.408,-0.288,-0.289,0.259,0.059,-0.501,0.199,-0.596,0.061,-0.176,-0.020,0.113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248,492,6.259,0.652,0.891,1.810,-1.576,2.047,0.014,-0.098,-0.458,-0.187,0.724,-1.398,1.871
249,493,5.452,1.887,0.970,0.889,-1.759,0.828,1.332,-0.906,0.130,1.120,0.765,0.111,0.018
250,495,4.130,0.719,0.597,0.719,-0.182,0.608,0.226,-0.737,0.299,0.089,1.008,-0.238,0.911
251,498,3.413,0.340,-0.084,0.052,-0.635,-0.133,0.216,-1.177,0.475,0.200,0.429,0.163,0.720


#### **4. Clustering**

In [11]:
kmeans = KMeans(n_clusters=2, random_state=456)
clusters = kmeans.fit(personal_params.iloc[:,1:])

personal_params['Cluster'] = clusters.labels_
DATA_cluster = pd.merge(left=DATA,
                        right=personal_params[['INDEX','Cluster']],
                        how='left',
                        on='INDEX')

# Output : csv file
data_cluster_file_name = folder_root + '/' + '천연물_본설문_OLS_Data_Cluster.csv'
DATA_cluster.to_csv(data_cluster_file_name, mode='w', index=False)

# Result Check
DATA_cluster

Unnamed: 0,INDEX,CARD,OAU1,OAU2,OAU3,MTY1,MTY2,EIN1,EIN2,EIN3,...,SIN3,EFI1,EFI2,EFI3,AFI1,AFI2,AFI3,PRICE,Y,Cluster
0,1,1,1,0,0,1,0,0,1,0,...,0,0,1,0,0,1,0,3,5,1
1,1,2,0,1,0,0,1,0,0,1,...,1,1,0,0,0,1,0,1,5,1
2,1,3,0,1,0,0,1,0,1,0,...,0,0,0,1,1,0,0,2,4,1
3,1,4,1,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,1,7,1
4,1,5,0,0,1,1,0,0,0,1,...,0,1,0,0,0,1,0,2,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4549,499,14,0,0,1,1,0,0,0,1,...,1,0,1,0,1,0,0,3,8,0
4550,499,15,1,0,0,1,0,0,1,0,...,1,1,0,0,0,0,1,2,3,0
4551,499,16,1,0,0,0,1,0,0,1,...,0,0,1,0,1,0,0,2,4,0
4552,499,17,0,0,1,1,0,0,1,0,...,1,0,0,1,1,0,0,1,9,0


##### 1st Group

In [12]:
idx_1 = (DATA_cluster['Cluster'] == 0)
group1_ols, group1_ols_2 = get_ols_params_table(DATA_cluster.loc[idx_1,:])
group1_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,2.725017,0.185343,0.0
OAU2,0.92348,0.108526,0.0
OAU3,1.106918,0.108526,0.0
MTY2,2.212788,0.093986,0.0
EIN1,-0.265199,0.108526,0.0146
EIN3,0.176101,0.108526,0.104773
SIN2,0.812369,0.108526,0.0
SIN3,0.642558,0.108526,0.0
EFI1,-0.238994,0.108526,0.027732
EFI3,0.102725,0.108526,0.343947


##### 2nd Group

In [13]:
idx_2 = (DATA_cluster['Cluster'] == 1)
group2_ols, group2_ols_2 = get_ols_params_table(DATA_cluster.loc[idx_2,:])
group2_ols

Unnamed: 0,Coefficient,Standard error,P-value
Intercept,8.776005,0.247146,0.0
OAU2,0.81383,0.144714,0.0
OAU3,0.822695,0.144714,0.0
MTY2,0.578014,0.125326,4e-06
EIN1,-0.37234,0.144714,0.010169
EIN3,0.083333,0.144714,0.564795
SIN2,0.434397,0.144714,0.002724
SIN3,0.56383,0.144714,0.000102
EFI1,-0.235816,0.144714,0.103389
EFI3,0.111702,0.144714,0.440293


#### **5. Result Clean-Up**

In [14]:
summary_coef = pd.concat([total_ols_2, group1_ols_2, group2_ols_2], axis=1)
summary_coef.columns = ['Full sample', 'Cluster 1', 'Cluster 2']
add_lines = pd.DataFrame([[len(DATA['INDEX'].unique()),
                           len(DATA_cluster.loc[idx_1,'INDEX'].unique()),
                           len(DATA_cluster.loc[idx_2,'INDEX'].unique())]],
                         columns=['Full sample', 'Cluster 1', 'Cluster 2'], index=['Number of cases'])
summary_coef = pd.concat([summary_coef, add_lines], axis=0)

# Output : csv file
summary_coef_file_name = folder_root + '/' + '천연물_Summary_Coef.csv'
summary_coef.to_csv(summary_coef_file_name, mode='w')

# Result Check
summary_coef

Unnamed: 0,Full sample,Cluster 1,Cluster 2
OAU1,-1.884058,-2.030398,-1.636525
OAU2,0.88274,0.92348,0.81383
OAU3,1.001318,1.106918,0.822695
MTY1,-1.605402,-2.212788,-0.578014
MTY2,1.605402,2.212788,0.578014
EIN1,-0.305007,-0.265199,-0.37234
EIN2,0.163373,0.089099,0.289007
EIN3,0.141634,0.176101,0.083333
SIN1,-1.285244,-1.454927,-0.998227
SIN2,0.671937,0.812369,0.434397
