<a href="https://colab.research.google.com/github/joseandresv/Bayes-MCMC-wEmbeddings/blob/main/MCMC%2BEmbedings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import arviz as az
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pymc as pm

In [4]:
# Mount Google Drive inside the Colab runtime at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [6]:
# Read the wine-judging scores (comma-separated file on the mounted Drive)
# and preview the first rows.
data = pd.read_csv(
    "/content/gdrive/MyDrive/MCC/Métodos analíticos /wines_2012.csv",
    sep=',',
)
data.head()

Unnamed: 0,judge,flight,wine,score,wine.amer,judge.amer
0,Jean-M Cardebat,white,A1,10.0,1,0
1,Jean-M Cardebat,white,B1,13.0,1,0
2,Jean-M Cardebat,white,C1,14.0,0,0
3,Jean-M Cardebat,white,D1,15.0,0,0
4,Jean-M Cardebat,white,E1,8.0,1,0


In [7]:
# Integer identifiers for judges and wines: factorize numbers the labels
# from 0 in order of first appearance, and the +1 makes them 1-based.
for label_col, id_col in (('judge', 'judge_num'), ('wine', 'wine_num')):
    data[id_col] = pd.factorize(data[label_col])[0] + 1

# Standardize the score: subtract the sample mean, divide by the sample
# standard deviation.
mean_score = data['score'].mean()
std_score = data['score'].std()
data['score_est'] = data['score'].sub(mean_score).div(std_score)

# Preview the frame with the three new columns.
data.head()

Unnamed: 0,judge,flight,wine,score,wine.amer,judge.amer,judge_num,wine_num,score_est
0,Jean-M Cardebat,white,A1,10.0,1,0,1,1,-1.576604
1,Jean-M Cardebat,white,B1,13.0,1,0,1,2,-0.450458
2,Jean-M Cardebat,white,C1,14.0,0,0,1,3,-0.075076
3,Jean-M Cardebat,white,D1,15.0,0,0,1,4,0.300306
4,Jean-M Cardebat,white,E1,8.0,1,0,1,5,-2.327368


In [9]:
# Dataset dimensions: distinct judges, distinct wines, number of score rows.
n_judges = len(pd.unique(data['judge']))
n_wines = len(pd.unique(data['wine']))
n_scores = data.shape[0]

# Collect the model inputs in one dictionary, starting with the sizes.
wine_data_list = dict(n_judges=n_judges, n_wines=n_wines, N=n_scores)

# Printed before the arrays are added so only the three sizes are shown.
print(wine_data_list)

# Per-observation arrays: standardized score, wine id and judge id.
wine_data_list.update(
    S=data['score_est'].values,
    wine=data['wine_num'].values,
    judge=data['judge_num'].values,
)

{'n_judges': 9, 'n_wines': 20, 'N': 180}


In [10]:
# Pull the model inputs out of the data dictionary into plain variables.
n_judges, n_wines, N = (wine_data_list[key] for key in ('n_judges', 'n_wines', 'N'))
S, wine, judge = (wine_data_list[key] for key in ('S', 'wine', 'judge'))

# Model 1: one quality parameter per wine; each observed score is Normal
# around the quality of the wine being scored.
with pm.Model() as wine_model_1:
    # Priors
    Q = pm.Normal('Q', mu=0, sigma=1, shape=n_wines)
    sigma = pm.Exponential('sigma', lam=1)

    # Expected score per observation (shift the wine ids to 0-based positions)
    media_score = Q[wine - 1]

    # Likelihood
    S_observed = pm.Normal('S', mu=media_score, sigma=sigma, observed=S)

In [11]:
# Draw posterior samples for model 1.
# A fixed random_seed makes the draws reproducible across kernel restarts,
# so the summaries and plots below do not change from run to run.
with wine_model_1:
    trace = pm.sample(
        draws=2000,        # Number of sampling iterations kept per chain
        tune=1000,         # Number of warmup (tuning) iterations per chain
        chains=4,          # Number of chains
        cores=4,           # Number of chains run in parallel
        target_accept=0.9, # Adjust acceptance rate to affect step size
        random_seed=42,    # Fixed seed for reproducible sampling
    )

  numba_fn = numba.jit(**self.kwargs)(self.function)


In [None]:
# Posterior summary table for the wine qualities Q and the noise scale sigma.
az.summary(trace, var_names=["Q", "sigma"])

In [None]:
# Compact summary for model 1: drop any row whose label contains "lp__",
# keep the core statistics, shorten three column names (the HDI bounds
# become q3 / q97, r_hat becomes rhat) and round to 5 decimals.
summary_df = (
    az.summary(trace, var_names=["Q", "sigma"])
    .loc[lambda tbl: ~tbl.index.str.contains("lp__")]
    .loc[:, ['mean', 'sd', 'hdi_3%', 'hdi_97%', 'r_hat', 'ess_bulk', 'ess_tail']]
    .rename(columns={'hdi_3%': 'q3', 'hdi_97%': 'q97', 'r_hat': 'rhat'})
    .round(5)
)
print(summary_df)

In [None]:
# Trace plot for the wine-quality parameters Q.
# Note: despite its name, Q_chain1 is the whole posterior variable 'Q';
# no single chain is selected here.
Q_chain1 = trace.posterior['Q']
az.plot_trace(Q_chain1)
plt.show()

In [None]:
# Origin codes for the origin effect O.
# The later models index the effect as O[origin-1], i.e. they expect 1-based
# codes just like wine_num and judge_num. The raw 'wine.amer' flag is 0/1, so
# using it directly produced indices -1/0, which only worked through
# negative-index wrap-around and swapped the labels (flag 0 landed on O[1]).
# Factorizing with sort=True and adding 1 gives codes 1..n_origins in
# ascending flag order, so O[0] belongs to wine.amer == 0 and O[1] to
# wine.amer == 1.
origin_codes, origin_levels = pd.factorize(data['wine.amer'], sort=True)

wine_data_list['n_origins'] = len(origin_levels)
wine_data_list['origins'] = origin_codes + 1

n_origins = wine_data_list['n_origins']
origin = wine_data_list['origins']

In [None]:
# Model 2: wine quality plus an origin effect.
with pm.Model() as wine_model_2:
    # Priors
    Q = pm.Normal('Q', mu=0, sigma=1, shape=n_wines)
    O = pm.Normal('O', mu=0, sigma=1, shape=n_origins)
    sigma = pm.Exponential('sigma', lam=1)

    # Expected score per observation (ids shifted to 0-based positions).
    # NOTE(review): `origin-1` assumes `origin` holds 1-based codes like
    # `wine`; confirm against the cell that builds `origin`.
    media_score = Q[wine-1] + O[origin-1]

    # Likelihood
    S_observed = pm.Normal('S', mu=media_score, sigma=sigma, observed=S)

    # Generated quantity: difference between the two origin effects.
    # It has to be registered BEFORE sampling; a Deterministic added after
    # pm.sample() has returned is not recorded in the trace.
    dif_origen = pm.Deterministic('dif_origen', O[0] - O[1])

    # Sampling
    trace_2 = pm.sample(
        draws=2000,        # Number of sampling iterations kept per chain
        tune=1000,         # Number of warmup (tuning) iterations per chain
        chains=4,          # Number of chains
        cores=4,           # Number of chains run in parallel
        target_accept=0.9, # Adjust acceptance rate to affect step size
        random_seed=42,    # Fixed seed for reproducible sampling
    )

In [None]:
# Posterior summary table for model 2: origin effects O, wine qualities Q
# and the noise scale sigma.
az.summary(trace_2, var_names=["O", "Q", "sigma"])

In [None]:
# Compact summary for model 2: drop any row whose label contains "lp__",
# keep the core statistics, shorten three column names (the HDI bounds
# become q3 / q97, r_hat becomes rhat) and round to 5 decimals.
summary_wm2 = (
    az.summary(trace_2, var_names=["O", "Q", "sigma"])
    .loc[lambda tbl: ~tbl.index.str.contains("lp__")]
    .loc[:, ['mean', 'sd', 'hdi_3%', 'hdi_97%', 'r_hat', 'ess_bulk', 'ess_tail']]
    .rename(columns={'hdi_3%': 'q3', 'hdi_97%': 'q97', 'r_hat': 'rhat'})
    .round(5)
)
print(summary_wm2)

In [None]:
# Forest plot of the model 2 posteriors for Q, O and sigma, with all chains
# combined into a single interval per parameter.
az.plot_forest(trace_2, var_names=("Q", "O", "sigma"), combined=True)
plt.show()

In [None]:
# Model 3: wine quality and origin effect, adjusted per judge by an additive
# term H (subtracted) and a positive multiplicative term D.
with pm.Model() as wine_model_3:
    # Priors
    Q = pm.Normal('Q', mu=0, sigma=1, shape=n_wines)
    O = pm.Normal('O', mu=0, sigma=1, shape=n_origins)
    H = pm.Normal('H', mu=0, sigma=1, shape=n_judges)
    D = pm.HalfNormal('D', sigma=1, shape=n_judges)
    sigma = pm.Exponential('sigma', lam=1)

    # Expected score per observation (ids shifted to 0-based positions).
    # NOTE(review): `origin-1` assumes `origin` holds 1-based codes like
    # `wine` and `judge`; confirm against the cell that builds `origin`.
    media_score = (Q[wine-1] + O[origin-1] - H[judge-1]) * D[judge-1]

    # Likelihood
    S_observed = pm.Normal('S', mu=media_score, sigma=sigma, observed=S)

    # Generated quantity, registered before sampling so it is stored in the trace
    dif_origen = pm.Deterministic('dif_origen', O[0] - O[1])

    # Sampling; the fixed random_seed makes the draws reproducible
    trace_3 = pm.sample(
        draws=2000,
        tune=1000,
        chains=4,
        cores=4,
        target_accept=0.9,
        random_seed=42,
    )

In [None]:
# Posterior summary table for model 3: origin effects O, wine qualities Q,
# judge terms H and D, and the noise scale sigma.
az.summary(trace_3, var_names=["O", "Q", "H", "D", "sigma"])

In [None]:
# Compact summary for model 3: drop any row whose label contains "lp__",
# keep the core statistics, shorten three column names (the HDI bounds
# become q3 / q97, r_hat becomes rhat) and round to 5 decimals.
summary_wm3 = (
    az.summary(trace_3, var_names=["O", "Q", "H", "D", "sigma"])
    .loc[lambda tbl: ~tbl.index.str.contains("lp__")]
    .loc[:, ['mean', 'sd', 'hdi_3%', 'hdi_97%', 'r_hat', 'ess_bulk', 'ess_tail']]
    .rename(columns={'hdi_3%': 'q3', 'hdi_97%': 'q97', 'r_hat': 'rhat'})
    .round(5)
)
print(summary_wm3)

https://huggingface.co/datasets/james-burton/wine_reviews_all_text

https://huggingface.co/datasets/alfredodeza/wine-ratings