In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertModel, BertTokenizer


# Carga de modelo pre-entrenado BETO.

In [2]:
tokenizer_español = BertTokenizer.from_pretrained("../pytorch/", do_lower_case=False)
model = BertModel.from_pretrained("../pytorch")

Some weights of the model checkpoint at ../pytorch were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Ejemplo del uso de BERT

In [3]:
enunciado = "no se respeto las fechas de devolución.[PAD]"
print(f"Original: {enunciado}")
print(f"Tokenizado: {tokenizer_español.tokenize(enunciado)}")
print(f"IDs: {tokenizer_español.convert_tokens_to_ids(tokenizer_español.tokenize(enunciado))}")

Original: no se respeto las fechas de devolución.[PAD]
Tokenizado: ['no', 'se', 'respeto', 'las', 'fechas', 'de', 'devolución', '.', '[PAD]']
IDs: [1084, 1062, 5096, 1089, 10286, 1008, 17364, 1009, 1]


In [4]:
encoded = tokenizer_español("hola como estan. ¿Estan muy bien muchachos?")
encoded

{'input_ids': [4, 9050, 1184, 6974, 1009, 1067, 20987, 1456, 1311, 7523, 1064, 5], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
print(tokenizer_español.decode(encoded['input_ids']))

[CLS] hola como estan. ¿ Estan muy bien muchachos? [SEP]


# Repesentación de texto formato BERT.

In [6]:
df = pd.read_csv("../data/processed/FalabellaAyuda_2022-12-27.csv")
df.head()

Unnamed: 0,user_id,user_location,created_at,tweet_text,@usernames_in_tweet,hashtags_in_tweet,sentiment,score
0,228976699,"Santiago, Metropolitana de Santiago",2022-12-27 15:31:58+00:00,todos los dias me responden lo mismos la ha de...,['FalabellaAyuda'],,anger,0.783499
1,821375359102971904,,2022-12-27 15:17:15+00:00,no puedo creer segunda vez en el mes y me canc...,"['FalabellaAyuda', 'SERNAC']",,anger,0.539787
2,91680205,Chile,2022-12-27 15:07:16+00:00,acabo de hacer una compra y me la cancelaron. ...,['FalabellaAyuda'],,others,0.322136
3,1015335612373663746,"Puerto Varas, Los Lagos",2022-12-27 15:02:40+00:00,hola hay intermitencias con la app del banco f...,['FalabellaAyuda'],,others,0.525371
4,1463545921682083844,,2022-12-27 14:17:15+00:00,para que envian cupones de descuento si no res...,['FalabellaAyuda'],,anger,0.553174


In [7]:
texts = df["tweet_text"].head(15)
data = []
for text in texts:
    data.append(text)
data

['todos los dias me responden lo mismos la ha de incompetentes denme solucion',
 'no puedo creer segunda vez en el mes y me cancelaron la compra quiero mi dinero ya no quiero mas 5 dias por que espere dos veces por un producto que no llego y por tus mensajes internos tampoco dan solucion seguire con ellos si no me dan solucion',
 'acabo de hacer una compra y me la cancelaron. donde esta mi plata????',
 'hola hay intermitencias con la app del banco falabella? hace rato que necesito hacer una transferencia y no puedo. agradezco respuesta',
 'para que envian cupones de descuento si no respetan su uso, ahora me indican que solo es posible si se compra en una tienda, cuando en el cupon solo dice que debe ser en falabella, tottus o sodimac, pero aun asi no lo respetan, compre en falabella y sodimac.',
 'la pagina para revisar el saldo de gift cards falabella esta hace varios dias caida. solicito por favor indicar otra forma de revisar el saldo',
 'les recomiendo no comprar, atienden mal, y s

In [8]:
def bert_encoder(texts, tokenizer, max_length_text=140):
    """ Genera las entradas para el modelo BERT.
    Args:
        texts:
            List: lista de textos a tranformar.
        tokenizer:
            BertTokenizer.from_pretrained(): Bertokenizer cargado con BETO.
        max_lengh_text:
            Int: Cantidad maxima de tokens que se generaran,
            se truncara en 140 en caso que sobrepase.
    Returns:
        output:
            Dict: 
    Raises:
    """
    output = {}
    for text in texts:
        tokens = tokenizer.tokenize(text)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]

        pads = max_length_text - len(tokens)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_ids += [1] * pads
        
        token_type_ids = [0] * max_length_text

        attention_mask = [1] * len(tokens) + [0] * pads

        if output:
            output["input_ids"] = torch.vstack((output["input_ids"], torch.tensor([input_ids])))
            output["token_type_ids"] = torch.vstack((output["token_type_ids"], torch.tensor([token_type_ids])))
            output["attention_mask"] = torch.vstack((output["attention_mask"], torch.tensor([attention_mask])))
        else:
            output["input_ids"] = torch.tensor([input_ids])
            output["token_type_ids"] = torch.tensor([token_type_ids])
            output["attention_mask"] = torch.tensor([attention_mask])
    return output

inputs = bert_encoder(enunciado, tokenizer_español)
inputs

{'input_ids': tensor([[   4, 1043,    5,  ...,    1,    1,    1],
         [   4, 1074,    5,  ...,    1,    1,    1],
         [   4,    5,    1,  ...,    1,    1,    1],
         ...,
         [   4, 1070,    5,  ...,    1,    1,    1],
         [   4, 1090,    5,  ...,    1,    1,    1],
         [   4,    3,    5,  ...,    1,    1,    1]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 0,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [9]:
outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state
last_hidden_states.shape

torch.Size([44, 140, 768])

# Extracción de características.

In [10]:
def max_polling(tensor):
    output = []
    for vector in tensor:
        features_extracted=[]
        n = len(vector[:])
        for k in range(0, 768):
            max_element = float("-inf")
            for i in range(0, n):
                if (vector[i][k] > max_element):
                    max_element = float(vector[i][k])
            features_extracted.append(max_element)
        output.append(np.array(features_extracted))
    return output
max_polling(last_hidden_states)

[array([-3.07531306e-03,  8.09227824e-01, -3.59586179e-01, -1.08292378e-01,
         9.66239989e-01,  1.66928828e-01,  5.99957667e-02,  4.24479693e-01,
         5.13806045e-01,  3.60637248e-01,  1.20685488e-01,  4.03549880e-01,
         1.35719609e+00,  2.93487161e-01,  1.00186992e+00, -7.72532523e-02,
         2.17831922e+00,  2.89798882e-02, -2.75285810e-01,  4.01720405e-01,
        -2.22278535e-02,  3.54050040e-01,  9.84355032e-01,  6.80680275e-01,
        -1.82278380e-01,  2.20357805e-01,  4.08908993e-01,  6.14010215e-01,
         4.82327133e-01,  6.76137924e-01,  6.43648982e-01,  2.65975326e-01,
         8.54717121e-02,  1.69397786e-01,  7.60307088e-02, -3.00385300e-02,
         5.17450273e-01, -1.64378926e-01,  3.25992048e-01,  7.83719242e-01,
         6.37696326e-01,  4.17610615e-01,  6.52475417e-01,  7.57561088e-01,
         2.65219122e-01,  1.26983619e+00,  8.80463064e-01,  7.79301524e-01,
         2.08759919e-01,  4.48056340e-01,  4.45330113e-01, -5.76443076e-02,
         2.0

In [11]:
def mean_polling(tensor):
    output = []
    for vector in tensor:
        features_extracted=[]
        n=len(vector[:])
        for k in range(0, 768):
            sum_elements=0
            for i in range(0, n):
                sum_elements += float(vector[i][k])
            h=sum_elements/n
            features_extracted.append(h)
        output.append(np.array(features_extracted))
    return output
mean_polling(last_hidden_states)

[array([-3.33874477e-01,  2.48214543e-01, -5.88373512e-01, -8.09131802e-01,
         5.82715046e-01, -7.72061429e-03, -1.32188211e-01,  1.79348995e-01,
         1.15042395e-01,  1.98674013e-01, -5.29603571e-02, -1.79936109e-02,
         6.29468560e-01, -1.08552009e-01,  6.11755851e-01, -5.05616960e-01,
         6.14381923e-01, -3.93833158e-01, -6.05767303e-01,  1.94319382e-01,
        -3.74505785e-01,  1.54465921e-01,  6.13274768e-01,  1.77656305e-01,
        -5.20792358e-01, -7.19520596e-03, -3.53169006e-02,  1.44420531e-01,
         4.08247787e-02, -8.00466144e-02,  3.95040673e-02, -4.72778470e-02,
        -2.22377846e-01, -1.13077056e-01, -1.26467128e-01, -1.86433867e-01,
         1.54182867e-01, -3.17365137e-01,  2.00355541e-01, -3.76238764e-01,
        -4.29604040e-02,  2.81268215e-01,  3.18689579e-01,  5.72036215e-01,
         1.49138567e-01,  1.09090919e-01,  7.42154463e-01, -3.74922771e-02,
        -1.07189898e+00,  2.04759978e-01,  1.64786630e-01, -2.32038929e-01,
         6.4

# Normalización de caracteristicas.

In [12]:
def std_normalization(list):
    output = []
    for vector in list:
        norm1_of_vector = vector.__abs__().sum()
        vector_normalized = vector / norm1_of_vector
        output.append(vector_normalized)
    return output
std_normalization(max_polling(last_hidden_states))

[array([-8.17115869e-06,  2.15013199e-03, -9.55426547e-04, -2.87734677e-04,
         2.56731596e-03,  4.43532713e-04,  1.59409765e-04,  1.12784971e-03,
         1.36519133e-03,  9.58219253e-04,  3.20663378e-04,  1.07223884e-03,
         3.60609292e-03,  7.79800341e-04,  2.66198528e-03, -2.05263195e-04,
         5.78783092e-03,  7.70000520e-05, -7.31439044e-04,  1.06737789e-03,
        -5.90597820e-05,  9.40716932e-04,  2.61544793e-03,  1.80857898e-03,
        -4.84316731e-04,  5.85494408e-04,  1.08647810e-03,  1.63143550e-03,
         1.28155133e-03,  1.79650988e-03,  1.71018621e-03,  7.06700931e-04,
         2.27099781e-04,  4.50092777e-04,  2.02014877e-04, -7.98128812e-05,
         1.37487411e-03, -4.36757581e-04,  8.66166373e-04,  2.08235525e-03,
         1.69436990e-03,  1.10959845e-03,  1.73363819e-03,  2.01285259e-03,
         7.04691684e-04,  3.37397618e-03,  2.33940522e-03,  2.07061730e-03,
         5.54678627e-04,  1.19049326e-03,  1.18324963e-03, -1.53161898e-04,
         5.4

In [13]:
# tiene otro nombre z-score normalization (plis regularizar esto xd)
def z_score_normalization(list):
    output = []
    for vector in list:
        vector_normalized = (vector - vector.mean()) / vector.std()
        output.append(vector_normalized)
    return output
z_score_normalization(max_polling(last_hidden_states))

[array([-7.71608985e-01,  7.44628536e-01, -1.43706887e+00, -9.68006186e-01,
         1.03770599e+00, -4.54280829e-01, -6.53881091e-01,  2.64612338e-02,
         1.93196976e-01, -9.27064846e-02, -5.40598219e-01, -1.26061622e-02,
         1.76746104e+00, -2.18048214e-01,  1.10421248e+00, -9.10068841e-01,
         3.30016186e+00, -7.11775043e-01, -1.27971458e+00, -1.60210437e-02,
        -8.07358940e-01, -1.05002105e-01,  1.07151936e+00,  5.04682870e-01,
        -1.10610777e+00, -3.54550793e-01, -2.60289109e-03,  3.80237152e-01,
         1.34438731e-01,  4.96204160e-01,  4.35560601e-01, -2.69401551e-01,
        -6.06327928e-01, -4.49672294e-01, -6.23950417e-01, -8.21938277e-01,
         1.99999258e-01, -1.07269682e+00, -1.57374894e-01,  6.97014452e-01,
         4.24449428e-01,  1.36394769e-02,  4.52035942e-01,  6.48187883e-01,
        -2.70813075e-01,  1.60439585e+00,  8.77595575e-01,  6.88768381e-01,
        -3.76199301e-01,  7.04691851e-02,  6.53804349e-02, -8.73466966e-01,
         3.0

In [14]:
def max_min_normalization(list, new_min=0.0, new_max=1.0):
    output = []
    for vector in list:
        vector_normalized = (((vector - vector.min()) / (vector.max() - vector.min())) * (new_max - new_min)) + new_min
        output.append(vector_normalized)
    return output
max_min_normalization(max_polling(last_hidden_states))

[array([0.407337  , 0.47938944, 0.37571398, 0.3980041 , 0.49331665,
        0.42241661, 0.41293149, 0.44526174, 0.45318511, 0.43959882,
        0.41831476, 0.44340523, 0.52799501, 0.43364251, 0.49647707,
        0.40075731, 0.6008298 , 0.41018034, 0.38319154, 0.44324296,
        0.40563814, 0.43901453, 0.49492348, 0.46798709, 0.39144143,
        0.42715584, 0.44388059, 0.46207336, 0.45039289, 0.46758418,
        0.46470236, 0.43120217, 0.41519125, 0.42263561, 0.41435382,
        0.40494532, 0.45350836, 0.39302914, 0.43652574, 0.4771268 ,
        0.46417436, 0.44465244, 0.46548528, 0.47480653, 0.4311351 ,
        0.52024606, 0.48570811, 0.47673494, 0.42612709, 0.44735302,
        0.4471112 , 0.40249665, 0.59041812, 0.38377886, 0.41746505,
        0.43596217, 0.42261651, 0.4795086 , 0.44542892, 0.52845243,
        0.38095485, 0.47098031, 0.42636501, 0.43566204, 0.45502286,
        0.38858816, 0.46388229, 0.47752081, 0.42066225, 0.45798161,
        0.43283022, 0.38822005, 0.35663428, 0.47

In [15]:
X = max_min_normalization(max_polling(last_hidden_states))

# Implementación de algoritmos de agrupamiento.

In [16]:
from sklearn.cluster import KMeans
for seed in range(5):
    kmeans = KMeans(
        n_clusters=5,
        init="random",
        max_iter=100,
        n_init=1,
        random_state=seed,
    ).fit(X)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements asigned to each cluster: {cluster_sizes}")
    print(f"labels:{kmeans.labels_}")

Number of elements asigned to each cluster: [ 9  5 10 16  4]
labels:[1 0 2 0 3 2 3 3 0 1 3 2 0 2 0 3 0 2 3 3 4 1 3 0 2 4 3 2 4 3 3 0 0 3 4 3 3
 1 3 2 1 3 2 2]
Number of elements asigned to each cluster: [ 4 10 16  5  9]
labels:[3 4 1 4 2 1 2 2 4 3 2 1 4 1 4 2 4 1 2 2 0 3 2 4 1 0 2 1 0 2 2 4 4 2 0 2 2
 3 2 1 3 2 1 1]
Number of elements asigned to each cluster: [16  9  4  5 10]
labels:[3 1 4 1 0 4 0 0 1 3 0 4 1 4 1 0 1 4 0 0 2 3 0 1 4 2 0 4 2 0 0 1 1 0 2 0 0
 3 0 4 3 0 4 4]
Number of elements asigned to each cluster: [ 9  6 12  4 13]
labels:[4 0 2 0 4 2 4 4 0 2 4 2 0 2 0 1 0 2 4 4 3 4 1 0 2 3 4 2 3 4 4 0 0 1 3 1 4
 4 1 2 2 1 2 2]
Number of elements asigned to each cluster: [ 9 10 15  6  4]
labels:[2 0 1 0 2 1 2 2 0 2 2 1 0 1 0 3 0 1 2 2 4 2 3 0 1 4 2 1 4 2 2 0 0 3 4 3 2
 2 3 1 2 3 1 1]


In [17]:
# K-means++
for seed in range(5):
    kmeans = KMeans(
        n_clusters=5,
        init="k-means++",
        max_iter=100,
        n_init=1,
        random_state=seed,
    ).fit(X)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    print(f"Number of elements asigned to each cluster: {cluster_sizes}")
    print(f"labels:{kmeans.labels_}")

Number of elements asigned to each cluster: [ 5  4  9 10 16]
labels:[0 2 3 2 4 3 4 4 2 0 4 3 2 3 2 4 2 3 4 4 1 0 4 2 3 1 4 3 1 4 4 2 2 4 1 4 4
 0 4 3 0 4 3 3]
Number of elements asigned to each cluster: [ 5  4  9 10 16]
labels:[0 2 3 2 4 3 4 4 2 0 4 3 2 3 2 4 2 3 4 4 1 0 4 2 3 1 4 3 1 4 4 2 2 4 1 4 4
 0 4 3 0 4 3 3]
Number of elements asigned to each cluster: [13  4  7 18  2]
labels:[3 2 0 2 3 0 3 3 2 0 3 0 2 0 4 3 2 0 3 3 1 0 3 2 0 1 3 0 1 3 3 2 4 3 1 3 3
 3 3 0 0 3 0 0]
Number of elements asigned to each cluster: [12  8  4 18  2]
labels:[3 1 0 1 3 0 3 3 1 0 3 0 1 0 4 3 1 0 3 3 2 3 3 1 0 2 3 0 2 3 3 1 4 3 2 1 3
 3 3 0 0 3 0 0]
Number of elements asigned to each cluster: [13  2 18  4  7]
labels:[2 4 0 4 2 0 2 2 4 0 2 0 4 0 1 2 4 0 2 2 3 0 2 4 0 3 2 0 3 2 2 4 1 2 3 2 2
 2 2 0 0 2 0 0]


# extracción de ventos.