# Formación de sintagmas

*********************************************

Variables:

$w=[w_1, ..., w_n] \rightarrow$ El sintagma

* $e_t \in \{0,1\} \rightarrow$ Ocurrencia del evento en el momento $t$
* $s_t \in \{0,1\} \rightarrow$ Existencia de la situación en el momento $t$
* $s_{t-1} \rightarrow$ Situación en el momento $t-1$
* $w_t^i \rightarrow$ El evento del sintagma siendo formado en el momento $t$:
    * $i=0$: hipótesis de que el sintagma no se ha formado aún
    * $i=1$: hipótesis de que el sintagma se ha formado
* $\pi_t \rightarrow$ Observación


*********************************************

* $\alpha_j^i(t)=P[w_t=i|s_t=j]$

* $\beta_{kh}^j(t)=P[s_t=j|s_{t-1}=k,e_t=h]$

* $\nu^k(t)=P[s_{t-1}=k]$

* $\gamma^h(t)=P[e_t=h]$

* $\tau_i^{jkh}(t)=P[s_t=j,s_{t-1}=k,e_t=h|w_t=i]$

$i, j, k, h \in \{0,1\}$

*********************************************

Consideraciones previas:

- La situación no se ha dado en $t=0$
    - $\nu^0(t)=1$
    - $\nu^1(t)=0$

- $\psi$: lista que, para cada t, contiene la probabilidad de que el evento que causa el sintagma se genere

- $\alpha, \beta, \nu, \gamma$ se representan como listas anidadas:

    - el nivel exterior se indexa con el subíndice (una sublista por cada valor del subíndice):
        - cada sublista tiene una posición por cada valor del superíndice.



- $\alpha$: [[ , ], [ , ]]
- $\beta$: [[ [ , ] , [ , ] ], [ [ , ] , [ , ] ]]
- $\nu$: una lista de dos posiciones
- $\gamma$: una lista de dos posiciones

- $\tau$: ocho listas de dos posiciones

In [None]:
import json
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import random

# Extension appended to the month number when building a data file path.
EXT = ".json"

# Spanish month names, January first.
MONTHS = ["Enero", "Febrero", "Marzo", "Abril", "Mayo", "Junio", "Julio", "Agosto", "Septiembre", "Octubre", "Noviembre", "Diciembre"]
# Month numbers 1..12.
ALL_MONTHS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
# Every other month number, starting at 1.
HALF_MONTHS = [1, 3, 5, 7, 9, 11]
# NOTE(review): named "decade" but holds five years (2019-2023) — confirm intent.
LAST_DECADE = [2019, 2020, 2021, 2022, 2023]

# Prefix prepended to "<year>/<month><EXT>" when building a data file path.
DATA_PATH = "data/"

# Keys of the relevant elements of the articles
RESPONSE = "response"
DOCS = "docs"
DATA = "data"

ABSTRACT = "abstract"
SNIPPET = "snippet"
LEAD_PAR = "lead_paragraph"
PUB_DATE = "pub_date"

# Keys and labels used for plotting
YEAR = "year"
MONTH = "month"
FREQ_ALL_SYNTAGM = "Complete Syntagm"
FREQ_SUM_SYNTAGMS = "Divided Syntagm"
FREQ = "frequency"
DATE = "date"

# The two values every binary index (i, j, k, h) iterates over.
ZERO_ONE = (0, 1)

In [None]:
def plotOnlyCompleteSyntagm(data, syntagm, interval, label):
    """Plot the complete-syntagm frequency series against time.

    Args:
        data: mapping holding at least the YEAR, MONTH and FREQ_ALL_SYNTAGM
            entries; it is turned into a DataFrame.
        syntagm: text used as the figure title.
        interval: spacing, in months, between major ticks on the x axis.
        label: legend label of the plotted line.

    Prints an error message and returns without plotting when
    FREQ_ALL_SYNTAGM is not a key of ``data``.
    """
    if FREQ_ALL_SYNTAGM not in data:
        print(f"[ERROR plotOnlyCompleteSyntagm] Missing some flag {FREQ_ALL_SYNTAGM}")
        return

    # One timestamp per row (day fixed to 1), used as the index of the frame.
    frame = pd.DataFrame(data)
    frame[PUB_DATE] = pd.to_datetime(frame[[YEAR, MONTH]].assign(day=1))
    frame = frame.set_index(PUB_DATE)

    plt.figure(figsize=(10, 6))
    plt.plot(frame.index, frame[FREQ_ALL_SYNTAGM], marker='o', label=label)

    # Year-month tick labels, one major tick every `interval` months.
    mdates = plt.matplotlib.dates
    x_axis = plt.gca().xaxis
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    x_axis.set_major_locator(mdates.MonthLocator(interval=interval))

    # Tilt the tick labels so they stay readable.
    plt.gcf().autofmt_xdate(rotation=45)

    plt.xlabel(DATE)
    plt.ylabel(FREQ)
    plt.title(syntagm)

    plt.legend()
    plt.grid(True)
    plt.show()


def plotAllData(data, syntagm, interval):
    """Plot the complete-syntagm and divided-syntagm series on one figure.

    Args:
        data: mapping holding the YEAR, MONTH, FREQ_ALL_SYNTAGM and
            FREQ_SUM_SYNTAGMS entries; it is turned into a DataFrame.
        syntagm: text used as the figure title.
        interval: spacing, in months, between major ticks on the x axis.

    Prints an error message and returns without plotting when either of the
    two frequency series is missing from ``data``.
    """
    # Both series are plotted below, so either one missing is an error
    # (the check must be `or`; with `and` a single missing key slipped through).
    if FREQ_ALL_SYNTAGM not in data or FREQ_SUM_SYNTAGMS not in data:
        print("[ERROR plotAllData] Missing some flag")
        return

    df = pd.DataFrame(data)

    # One timestamp per row (day fixed to 1), used as the index of the frame.
    df[PUB_DATE] = pd.to_datetime(df[[YEAR, MONTH]].assign(day=1))
    df.set_index(PUB_DATE, inplace=True)

    plt.figure(figsize=(10, 6))
    for column in (FREQ_ALL_SYNTAGM, FREQ_SUM_SYNTAGMS):
        plt.plot(df.index, df[column], marker='o', label=column)

    # Year-month tick labels, one major tick every `interval` months.
    x_axis = plt.gca().xaxis
    x_axis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m'))
    x_axis.set_major_locator(plt.matplotlib.dates.MonthLocator(interval=interval))

    # Tilt the tick labels so they stay readable.
    plt.gcf().autofmt_xdate(rotation=45)

    plt.xlabel(DATE)
    plt.ylabel(FREQ)
    plt.title(syntagm)

    plt.legend()
    plt.grid(True)
    plt.show()


def plotAllDataTwoFigs(data, syntagm, interval):
    """Plot the complete-syntagm and divided-syntagm series side by side.

    Args:
        data: mapping holding the YEAR, MONTH, FREQ_ALL_SYNTAGM and
            FREQ_SUM_SYNTAGMS entries; it is turned into a DataFrame.
        syntagm: text used as the title of both subplots.
        interval: spacing, in months, between major ticks on the x axes.

    Prints an error message and returns without plotting when either of the
    two frequency series is missing from ``data``.
    """
    # Both series are plotted below, so either one missing is an error
    # (the check must be `or`; with `and` a single missing key slipped through).
    if FREQ_ALL_SYNTAGM not in data or FREQ_SUM_SYNTAGMS not in data:
        print("[ERROR plotAllDataTwoFigs] Missing some flag")
        return

    df = pd.DataFrame(data)

    # One timestamp per row (day fixed to 1), used as the index of the frame.
    df[PUB_DATE] = pd.to_datetime(df[[YEAR, MONTH]].assign(day=1))
    df.set_index(PUB_DATE, inplace=True)

    figure, axes = plt.subplots(1, 2, figsize=(14, 6))
    mdates = plt.matplotlib.dates

    # Left subplot: complete syntagm; right subplot: divided syntagm.
    series = ((FREQ_ALL_SYNTAGM, 'blue'), (FREQ_SUM_SYNTAGMS, 'orange'))
    for ax, (column, color) in zip(axes, series):
        ax.plot(df.index, df[column], marker='o', label=column, color=color)
        ax.set_xlabel(DATE)
        ax.set_ylabel(FREQ)
        ax.set_title(syntagm)
        ax.legend()

        # Year-month tick labels, one major tick every `interval` months.
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
        ax.xaxis.set_major_locator(mdates.MonthLocator(interval=interval))

    # Tilt the tick labels so they stay readable.
    figure.autofmt_xdate(rotation=45)

    plt.show()

In [None]:

# Normaliza pi en un instante t
def normPi(pi: np.ndarray):
    """Scale ``pi`` so its entries add up to one.

    When the entries add up to zero there is nothing to scale and a
    two-element zero array is returned instead.
    """
    total = sum(pi)
    return pi / total if total != 0 else np.array([0, 0])


def computePi(year: int, month: int, syntagm: str):
    """Count a syntagm and its individual words in one month of articles.

    The articles are read from ``DATA_PATH + "<year>/<month>" + EXT``; the
    abstract, snippet and lead paragraph of every document under the DATA key
    are searched case-insensitively for whole-word matches.

    Args:
        year: year component of the data file path.
        month: month component of the data file path.
        syntagm: literal text to look for; words are separated by whitespace.

    Returns:
        ``np.array([complete, divided])`` where ``complete`` is the number of
        occurrences of the whole syntagm and ``divided`` is the summed number
        of occurrences of each distinct word. For a single-word syntagm both
        entries equal ``complete``.

    Raises:
        Whatever ``open`` / ``json.load`` raise for a missing or malformed
        file, and ``KeyError`` when an expected key is absent.
    """
    # Split on any run of whitespace: splitting on a single " " produced empty
    # "words" for repeated spaces, whose pattern \b\b matches every boundary.
    words = syntagm.split()
    multi_word = len(words) > 1

    # The syntagm is literal text, so escape it before embedding it in a
    # pattern; compile once here instead of once per document.
    syntagm_re = re.compile(fr"\b{re.escape(syntagm)}\b", re.IGNORECASE)
    word_res = {}
    if multi_word:
        word_res = {
            word: re.compile(fr"\b{re.escape(word)}\b", re.IGNORECASE)
            for word in words
        }

    # Counter of whole-syntagm matches and per-word counters.
    count_syntagm = 0
    count_individual_words = dict.fromkeys(word_res, 0)

    # Path to the JSON data file.
    path = DATA_PATH + str(year) + "/" + str(month) + EXT

    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        docs = json.load(f)[DATA]

    for doc in docs:
        texts = (doc[ABSTRACT], doc[SNIPPET], doc[LEAD_PAR])

        count_syntagm += sum(len(syntagm_re.findall(text)) for text in texts)

        for word, pattern in word_res.items():
            count_individual_words[word] += sum(
                len(pattern.findall(text)) for text in texts
            )

    if multi_word:
        return np.array([count_syntagm, sum(count_individual_words.values())])
    return np.array([count_syntagm, count_syntagm])


In [None]:
# Posibles sintagmas que se me van ocurriendo:
#   bitcoin, cryptocurrency, brexit, internet

def computeData(syntagm, years, months):
    """Collect the monthly counts of ``syntagm`` for every year/month pair.

    ``computePi`` is called once per (year, month), iterating years in the
    outer loop and months in the inner one.

    Returns:
        A dict with four parallel lists stored under YEAR, MONTH,
        FREQ_ALL_SYNTAGM (first entry returned by computePi) and
        FREQ_SUM_SYNTAGMS (second entry).
    """
    data = {
        YEAR: [],
        MONTH: [],
        FREQ_ALL_SYNTAGM: [],
        FREQ_SUM_SYNTAGMS: [],
    }

    for year in years:
        for month in months:
            counts = computePi(year, month, syntagm)

            data[YEAR].append(year)
            data[MONTH].append(month)
            data[FREQ_ALL_SYNTAGM].append(counts[0])
            data[FREQ_SUM_SYNTAGMS].append(counts[1])

    return data

# Period and syntagm to analyse.
years = [2019, 2020, 2021, 2022, 2023]
months = ALL_MONTHS
syntagm = "lockdown"

# Gather the monthly counts (reads one data file per year/month pair).
data = computeData(syntagm, years, months)
print(data)

# Spacing, in months, between x-axis ticks.
interval = 3
plotOnlyCompleteSyntagm(data, syntagm, interval, FREQ_ALL_SYNTAGM)
# plotAllData(data, syntagm, interval)
# plotAllDataTwoFigs(data, syntagm, interval)

In [None]:

def normAlpha(alpha: np.ndarray):
    """Return a 2x2 array with each row of ``alpha`` divided by its row sum.

    The divisor is floored at 0.01, so rows whose sum is below that value are
    divided by 0.01 instead (this avoids dividing by zero).
    """
    rows = []
    for j in (0, 1):
        divisor = max(0.01, alpha[j].sum())
        rows.append([alpha[j][i] / divisor for i in (0, 1)])
    return np.array(rows)


def computeAlpha(alpha: np.ndarray, tau: np.ndarray, pi: np.ndarray, t: int):
    """Overwrite ``alpha`` in place from ``tau`` and the observation at ``t``.

    Each entry becomes ``alpha[j][i] = pi[i][t] * (sum of all entries of
    tau[i][j])``. The result is not normalised here.

    Returns:
        The same ``alpha`` object that was passed in.
    """
    for j in ZERO_ONE:
        for i in ZERO_ONE:
            tau_mass = sum(tau[i][j].flatten())
            alpha[j][i] = pi[i][t] * tau_mass
    return alpha


def normBeta(beta: np.ndarray):
    """Return a 2x2x2 array with every ``beta[h][k]`` vector divided by its sum.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    planes = []
    for h in ZERO_ONE:
        plane = []
        for k in ZERO_ONE:
            divisor = max(0.01, beta[h][k].sum())
            plane.append([beta[h][k][j] / divisor for j in ZERO_ONE])
        planes.append(plane)
    return np.array(planes)


def computeBeta(beta: np.ndarray, tau: np.ndarray, pi: np.ndarray, t: int):
    """Overwrite ``beta`` in place from ``tau`` and the observation at ``t``.

    Each entry ``beta[h][k][j]`` becomes the sum of a ``pi[0][t]`` term and a
    ``pi[1][t]`` term. The result is not normalised here.

    Returns:
        The same ``beta`` object that was passed in.
    """
    for h in ZERO_ONE:
        for k in ZERO_ONE:
            for j in ZERO_ONE:
                # NOTE(review): the second term reads tau[1][j][h][k], i.e.
                # with h and k swapped relative to the first term — confirm
                # this is intended.
                not_formed = pi[0][t] * tau[0][j][k][h]
                formed = pi[1][t] * tau[1][j][h][k]
                beta[h][k][j] = not_formed + formed
    return beta


def normGamma(gamma: np.ndarray):
    """Return a two-element array with ``gamma`` divided by its total.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    scaled = []
    for i in ZERO_ONE:
        scaled.append(gamma[i] / max(0.01, gamma.sum()))
    return np.array(scaled)


def computeGamma(gamma: np.ndarray, tau: np.ndarray, pi: np.ndarray, t: int):
    """Overwrite ``gamma`` in place from ``tau`` and the observation at ``t``.

    ``gamma[h]`` becomes the sum over i, j, k of ``pi[i][t] * tau[i][j][k][h]``.
    The result is not normalised here.

    Returns:
        The same ``gamma`` object that was passed in.
    """
    for h in ZERO_ONE:
        # Terms are added in i, j, k order.
        gamma[h] = sum(
            pi[i][t] * tau[i][j][k][h]
            for i in ZERO_ONE
            for j in ZERO_ONE
            for k in ZERO_ONE
        )

    return gamma

def normTau(tau: np.ndarray):
    """Return a 2x2x2x2 array with every ``tau[i]`` block divided by its total.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    blocks = []
    for i in ZERO_ONE:
        divisor = max(0.01, tau[i].flatten().sum())
        blocks.append(
            [[[tau[i][j][k][h] / divisor for h in ZERO_ONE]
              for k in ZERO_ONE]
             for j in ZERO_ONE]
        )
    return np.array(blocks)


def computeTau(tau: np.ndarray, alpha: np.ndarray, beta: np.ndarray, nu: np.ndarray, gamma: np.ndarray, t: int):
    """Overwrite ``tau`` in place from the current alpha, beta, nu and gamma.

    Every entry is the product
    ``alpha[j][i] * beta[h][k][j] * nu[k][t-1] * gamma[h]``; ``nu`` is read at
    column ``t-1`` (for ``t == 0`` that is the last column). The result is not
    normalised here.

    Returns:
        The same ``tau`` object that was passed in.
    """
    for i in ZERO_ONE:
        for j in ZERO_ONE:
            for k in ZERO_ONE:
                for h in ZERO_ONE:
                    # NOTE(review): the value is stored at tau[h][k][j][i],
                    # while computeAlpha/computeGamma read tau as
                    # tau[i][j][k][h] — confirm the index order is intended.
                    value = alpha[j][i] * \
                        beta[h][k][j] * nu[k][t-1] * gamma[h]
                    tau[h][k][j][i] = value
    return tau


def computeNu(nu: np.ndarray, beta: np.ndarray, gamma: np.ndarray, t: int):
    """Fill column ``t`` of ``nu`` in place from column ``t-1``.

    ``nu[j][t]`` becomes the sum over h, k of
    ``beta[h][k][j] * nu[k][t-1] * gamma[h]`` for j in {0, 1}.

    Returns:
        The same ``nu`` object that was passed in.
    """
    totals = [0, 0]
    for h in ZERO_ONE:
        for k in ZERO_ONE:
            # Column t-1 is only read here; column t is written after the loops.
            for j in (0, 1):
                totals[j] += beta[h][k][j] * nu[k][t-1] * gamma[h]

    nu[0][t], nu[1][t] = totals

    return nu


def dst(a: np.ndarray, b: np.ndarray):
    """Euclidean distance between two arrays, or 100000.0 if either is None.

    Both arrays are converted with ``tolist()`` and compared element by
    element; extra trailing elements of the longer one are ignored.

    NOTE: a function with this same name is defined again further down in
    this file; whichever definition is executed last is the one in effect.
    """
    if a is None or b is None:
        return 100000.0

    pairs = zip(a.tolist(), b.tolist())
    return math.sqrt(sum((x - y) ** 2 for x, y in pairs))

In [None]:
def _stackParams(*arrays):
    """Flatten the given arrays, in order, into one float vector."""
    flat = np.array([])
    for arr in arrays:
        flat = np.append(flat, arr)
    return flat


def ER(T, pi, Tol, Nmax):
    """Run the iterative estimation for every time step and return the increments of gamma[1].

    For each time step ``t`` the parameters alpha, beta, gamma and tau are
    initialised at random, normalised, and then re-estimated repeatedly
    (tau, alpha, beta, gamma, in that order) until the relative change of the
    stacked parameter vector drops to ``Tol`` or ``Nmax`` iterations have run.
    Afterwards ``nu`` is advanced to column ``t`` and ``gamma[1]`` is recorded.

    Args:
        T: number of time steps.
        pi: observations, indexed as ``pi[i][t]`` with i in {0, 1}.
        Tol: threshold on the relative change between iterations.
        Nmax: maximum number of iterations per time step.

    Returns:
        A list of length ``T``: the first recorded ``gamma[1]`` followed by
        the positive part of each step-to-step increase. An empty list when
        ``T`` is 0 (previously this raised IndexError).
    """
    Psi = []

    # The initial distribution lives in the last column: nu[k][t-1] with
    # t == 0 reads index -1, and that column is never overwritten because
    # computeNu only writes columns 0..T-1.
    nu = np.zeros(shape=(2, T+1))
    nu[0][-1] = 0.95
    nu[1][-1] = 0.05

    for t in range(T):
        prev = None
        n_iter = 0

        # Initialise and normalise alpha, beta, gamma, tau
        # (the order of the random draws is kept: alpha, beta, gamma, tau).
        alpha = normAlpha(np.random.rand(2, 2))
        beta = normBeta(np.random.rand(2, 2, 2))
        gamma = normGamma(np.random.rand(2))
        tau = normTau(np.random.rand(2, 2, 2, 2))

        elst = _stackParams(alpha, beta, gamma, tau)

        # Relative change = distance to the previous vector divided by the
        # norm of the current one (floored at 0.01). On the first pass prev
        # is None, for which dst returns a large constant.
        while n_iter < Nmax and dst(elst, prev)/max(0.01, dst(elst, np.zeros(elst.shape))) > Tol:
            n_iter += 1
            # elst is rebound to a fresh array below, so no copy is needed.
            prev = elst

            tau = computeTau(tau, alpha, beta, nu, gamma, t)
            tau = normTau(tau)

            alpha = computeAlpha(alpha, tau, pi, t)
            alpha = normAlpha(alpha)

            beta = computeBeta(beta, tau, pi, t)
            beta = normBeta(beta)

            gamma = computeGamma(gamma, tau, pi, t)
            gamma = normGamma(gamma)

            elst = _stackParams(alpha, beta, gamma, tau)

        nu = computeNu(nu, beta, gamma, t)

        Psi.append(gamma[1])

    # Nothing was recorded (T == 0): there is no first element to return.
    if not Psi:
        return []

    return [Psi[0]] + [max(Psi[i]-Psi[i-1], 0) for i in range(1, len(Psi))]

In [None]:
def flatten(lst):
    """Return a flat list with the leaves of an arbitrarily nested list.

    Only ``list`` instances (including subclasses) are descended into; any
    other element, tuples and strings included, is kept as a single item.
    """
    res = []
    for ele in lst:
        if isinstance(ele, list):
            res.extend(flatten(ele))
        else:
            res.append(ele)
    return res


#
# Distance between two vectors
#
def dst(a, b):
    """Euclidean distance between two sequences, or 100000.0 if either is None.

    Elements are paired positionally; extra trailing elements of the longer
    sequence are ignored.
    """
    if a is not None and b is not None:
        squared = sum((x - y) ** 2 for x, y in zip(a, b))
        return math.sqrt(squared)
    return 100000.0


#
# normalizes alpha, beta, tau
#
def alphanorm(alpha):
    """Return a 2x2 nested list with each row of ``alpha`` divided by its sum.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    normalized = []
    for j in ZERO_ONE:
        divisor = max(0.01, sum(alpha[j]))
        normalized.append([alpha[j][i] / divisor for i in ZERO_ONE])
    return normalized


def betanorm(beta):
    """Return a 2x2x2 nested list with every ``beta[h][k]`` divided by its sum.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    normalized = []
    for h in ZERO_ONE:
        plane = []
        for k in ZERO_ONE:
            divisor = max(0.01, sum(beta[h][k]))
            plane.append([beta[h][k][j] / divisor for j in ZERO_ONE])
        normalized.append(plane)
    return normalized


def taunorm(tau):
    """Return a 2x2x2x2 nested list with every ``tau[i]`` block divided by its total.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    normalized = []
    for i in ZERO_ONE:
        divisor = max(0.01, sum(flatten(tau[i])))
        normalized.append(
            [[[tau[i][j][k][h] / divisor for h in ZERO_ONE]
              for k in ZERO_ONE]
             for j in ZERO_ONE]
        )
    return normalized

def gammanorm(gamma):
    """Return a two-element list with ``gamma`` divided by its total.

    The divisor is floored at 0.01 to avoid dividing by zero.
    """
    divisor = max(0.01, sum(gamma))
    scaled = []
    for h in ZERO_ONE:
        scaled.append(gamma[h] / divisor)
    return scaled

def ER_santini(T, Pi, Tol, Nmax):
    """Plain-list version of the iterative estimation; returns the increments of gamma[1].

    For each time step ``t`` the parameters alpha, beta, gamma and tau are
    initialised at random, normalised, and then re-estimated repeatedly
    (tau, alpha, beta, gamma, in that order) until the relative change of the
    flattened parameter list drops to ``Tol`` or ``Nmax`` iterations have run.
    Afterwards ``Nu`` is advanced to column ``t`` and ``gamma[1]`` is recorded.

    Args:
        T: number of time steps.
        Pi: observations, indexed as ``Pi[i][t]`` with i in {0, 1}.
        Tol: threshold on the relative change between iterations.
        Nmax: maximum number of iterations per time step.

    Returns:
        A list of length ``T``: the first recorded ``gamma[1]`` followed by
        the positive part of each step-to-step increase. An empty list when
        ``T`` is 0 (previously this raised IndexError).
    """
    Psi = []
    #
    # Nu[i][t]: Prob(s(t)=i)
    #
    # The initial distribution lives in the last column: Nu[k][t-1] with
    # t == 0 reads index -1, and that column is never overwritten because
    # only columns 0..T-1 are written below.
    Nu = [[0 for _ in range(T+1)] for _ in range(2)]
    Nu[0][-1] = 0.95
    Nu[1][-1] = 0.05
    for t in range(T):
        prev = None
        n_iter = 0

        # Random initialisation (draw order: alpha, beta, gamma, tau).
        alpha = [[random.uniform(0, 1) for i in ZERO_ONE] for j in ZERO_ONE]
        alpha = alphanorm(alpha)

        beta = [[[random.uniform(0, 1) for j in ZERO_ONE] for k in ZERO_ONE] for h in ZERO_ONE]
        beta = betanorm(beta)

        gamma = [random.uniform(0, 1) for h in ZERO_ONE]
        gamma = [gamma[h]/max(0.01, sum(gamma)) for h in ZERO_ONE]

        tau = [[[[random.uniform(0, 1) for h in ZERO_ONE] for k in ZERO_ONE] for j in ZERO_ONE] for i in ZERO_ONE]
        tau = taunorm(tau)

        elst = flatten(alpha) + flatten(beta) + flatten(gamma) + flatten(tau)

        # Relative change = distance to the previous vector divided by the
        # norm of the current one (floored at 0.01). On the first pass prev
        # is None, for which dst returns a large constant.
        while n_iter < Nmax and dst(elst, prev)/max(0.01, dst(elst, len(elst)*[0.0])) > Tol:
            n_iter += 1
            prev = elst[:]

            # NOTE(review): this builds tau indexed as [h][k][j][i], while the
            # updates below read it as tau[i][j][k][h] — confirm the index
            # order is intended.
            tau = [[[[alpha[j][i]*beta[h][k][j]*Nu[k][t-1]*gamma[h] for i in ZERO_ONE] for j in ZERO_ONE] for k in ZERO_ONE] for h in ZERO_ONE]
            tau = taunorm(tau)

            alpha = [[Pi[i][t]*sum(flatten(tau[i][j])) for i in ZERO_ONE] for j in ZERO_ONE]
            alpha = alphanorm(alpha)

            # beta is overwritten in place, then replaced by its normalised copy.
            for j in ZERO_ONE:
                for k in ZERO_ONE:
                    for h in ZERO_ONE:
                        # NOTE(review): the second term reads tau[1][j][h][k]
                        # (h and k swapped) — confirm this is intended.
                        beta[h][k][j] = Pi[0][t]*tau[0][j][k][h]+Pi[1][t]*tau[1][j][h][k]
            beta = betanorm(beta)

            for h in ZERO_ONE:
                s = 0
                for i in ZERO_ONE:
                    for j in ZERO_ONE:
                        for k in ZERO_ONE:
                            s += Pi[i][t]*tau[i][j][k][h]
                gamma[h] = s
            gamma = [gamma[h]/max(0.01, sum(gamma)) for h in ZERO_ONE]
            elst = flatten(alpha) + flatten(beta) + flatten(gamma) + flatten(tau)

        # Advance Nu: column t is computed from column t-1.
        s = [0, 0]
        for h in ZERO_ONE:
            for k in ZERO_ONE:
                s[0] += beta[h][k][0]*Nu[k][t-1]*gamma[h]
                s[1] += beta[h][k][1]*Nu[k][t-1]*gamma[h]
        Nu[0][t] = s[0]
        Nu[1][t] = s[1]
        Psi.append(gamma[1])

    # Nothing was recorded (T == 0): there is no first element to return.
    if not Psi:
        return []

    return [Psi[0]] + [max(Psi[i]-Psi[i-1], 0) for i in range(1, len(Psi))]



In [None]:
# years = [2015, 2016, 2017, 2018, 2019]
years = [2019, 2020, 2021, 2022, 2023]
months = ALL_MONTHS

# One time step per (year, month) pair.
T = len(years) * len(months)

# syntagm = "fake news"
syntagm = "lockdown"

data = computeData(syntagm, years, months)

# Row 0: summed counts of the individual words; row 1: counts of the complete syntagm.
pi = np.array([data[FREQ_SUM_SYNTAGMS], data[FREQ_ALL_SYNTAGM]])

print(pi)
print()

res = ER(T, pi, 0.01, 20)

# Print every time step and remember the first one with the largest value.
maximo = 0
ite = 0
for t, r in enumerate(res):
    if r > maximo:
        maximo = r
        ite = t
    print(t, "-->", "%5.24f" % r)

# Time steps are laid out year by year, so the quotient is the position in
# `years` and the remainder the position in `months`.
year, month = divmod(ite, len(months))
# `month` is a 0-based position in `months`; look up the actual month number
# there before indexing the (1-based) month names. Using MONTHS[month - 1]
# directly reported the previous month.
print(f"Mayor probabilidad en {years[year]}/{MONTHS[months[month] - 1]} con {maximo}")

# # res_s = ER_santini(T, pi, 0.01, 20)

# for r, rs in zip(res, res_s):
#     print("%5.3f" % r == "%5.3f" % rs, "%5.3f" % r, "%5.3f" % rs)

