## Libraries

In [1]:
import os
import torch
from tqdm import tqdm
import glog
import pickle

# Report GPU availability. Device details are only queried when CUDA is
# actually present: torch.cuda.current_device() raises on CPU-only machines,
# which would abort the notebook before any CPU fallback could be used.
cuda_available = torch.cuda.is_available()
print('Cuda available:', cuda_available)
cuda_id = torch.cuda.current_device() if cuda_available else None
if cuda_available:
    print('Cuda_id: ', cuda_id)
    print(torch.cuda.get_device_name(cuda_id))

Cuda available: True
Cuda_id:  0
NVIDIA GeForce RTX 4060 Laptop GPU


## JL Auxiliary Functions

In [2]:
import os
import pickle

def save_data(file_path, data):
    """Load-or-create pickle cache.

    If ``file_path`` already exists, its pickled content is loaded and
    returned and ``data`` is ignored (an existing file is never overwritten).
    Otherwise ``data`` is pickled to ``file_path`` and returned unchanged.

    Args:
        file_path: Path of the pickle file used as cache.
        data: Object to store when no cache exists yet. ``None`` means
            "load only".

    Returns:
        The cached object if the file exists, else ``data``.

    Raises:
        FileNotFoundError: If the file does not exist and ``data`` is None.
            Writing ``None`` would poison the cache, because every later call
            would return it instead of real data.
    """
    if os.path.exists(file_path):
        print(f"The file {file_path} exist and will returned")
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # files produced by this notebook.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    if data is None:
        raise FileNotFoundError(
            f"No cached file at {file_path} and no data was given to create it."
        )

    # Create the target directory (e.g. 'backup/') on first use.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data has been saved to {file_path}.")
    return data

## Read Candidates from Module 1

In [3]:
# Dataset key: used below to build the candidates path
# ('./data/<args_dataset>_candidates.txt') and passed to initialize_train_test_dataset.
args_dataset = 'conansp' # originally documented: default='sample', choices=['reddit', 'gab', 'conan']; 'conansp' is presumably the Spanish CONAN set — confirm in utils_SP
# Top-k candidates kept per query; used as the cap passed to method.sort_responses.
args_kpq = 300

In [4]:
from utils_SP import read_candidates
# Load the generated candidates (per the original note, produced by the VAE)
# from ./data/<args_dataset>_candidates.txt.
candidates = read_candidates('./data/' + args_dataset + '_candidates.txt')  
# Preview the first five candidates.
candidates[:5]

2024-11-30 18:43:49.655401: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I1130 18:43:51.357498 262712 utils.py:148] Note: NumExpr detected 20 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 16.
I1130 18:43:51.358110 262712 utils.py:161] NumExpr defaulting to 16 threads.


['no sé cómo tu declaración y no de la discriminación y la inclusión. es consideramos fuente y inferior? no sea un pequeño religión del servicio indiscriminadamente.',
 'las mujeres tienen derecho al talentos y todos las libertades de la general con que el libera a los porajmos)? no hay obligados a otras homosexuales que se les ven musulmanes.',
 'no estoy seguro de lo dice? sugerir que sólo dominas los orgullosos de la condición sexuales, y quiera no haces a los insultos realmente o la religión?',
 'no crees que sea apropiado obliga de los islam. ¿por qué lo dice?',
 'que no te guste como eso en una sociedad en gente cuando no sea "correcto" para los hombres.']

In [5]:
from utils_SP import initialize_train_test_dataset
# Build the train/test splits for the selected dataset.
train_x_text, train_y_text, test_x_text, test_y_text = initialize_train_test_dataset(args_dataset)

# Cache each split under backup/. NOTE: save_data returns the already-pickled
# content when the file exists, so an existing cache silently replaces the
# values computed above; delete the .pkl files to force a refresh.
train_x_text = save_data('backup/train_x_text_SP.pkl', train_x_text)
train_y_text = save_data('backup/train_y_text_SP.pkl', train_y_text)

test_x_text = save_data('backup/test_x_text_SP.pkl', test_x_text)
test_y_text = save_data('backup/test_y_text_SP.pkl', test_y_text)

# Peek at the first pair; in the recorded output train_y_text[0] is a list
# holding a single response string.
print(train_x_text[0])
print(train_y_text[0])

Data has been saved to backup/train_x_text_SP.pkl.
Data has been saved to backup/train_y_text_SP.pkl.
Data has been saved to backup/test_x_text_SP.pkl.
Data has been saved to backup/test_y_text_SP.pkl.
Los niños Downy deberían ser aislados en escuelas especiales por su propio interés.
['Los niños con discapacidad deben ser aceptados sin discriminación.']


In [None]:
from utils_SP import convert_to_contexts_responses

# Convert the training split into parallel context/response sequences
# (in the recorded output responses_train[0] is a plain string, whereas
# train_y_text[0] is a list).
contexts_train, responses_train = convert_to_contexts_responses(train_x_text, train_y_text)
# Cached like the splits above: an existing pickle takes precedence over the
# freshly computed value.
contexts_train = save_data('backup/contexts_train_SP.pkl', contexts_train)
responses_train = save_data('backup/responses_train_SP.pkl', responses_train)

# Peek at the first context/response pair.
print(contexts_train[0])
print(responses_train[0])

Data has been saved to backup/contexts_train_SP.pkl.
Data has been saved to backup/responses_train_SP.pkl.
Los niños Downy deberían ser aislados en escuelas especiales por su propio interés.
Los niños con discapacidad deben ser aceptados sin discriminación.


## Module 2: Candidates Pruning by Grammaticality

In [7]:
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Report GPU availability. Device details are only queried when CUDA is
# actually present: torch.cuda.current_device() raises on CPU-only machines,
# even though the scoring code below falls back to the CPU.
cuda_available = torch.cuda.is_available()
print('Cuda available:', cuda_available)
cuda_id = torch.cuda.current_device() if cuda_available else None
if cuda_available:
    print('Cuda_id: ', cuda_id)
    print(torch.cuda.get_device_name(cuda_id))

from transformers import BertConfig, BertForSequenceClassification, BertTokenizer, BertModel
from transformers import glue_convert_examples_to_features
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import InputExample

  from .autonotebook import tqdm as notebook_tqdm



Cuda available: True
Cuda_id:  0
NVIDIA GeForce RTX 4060 Laptop GPU


  torch.utils._pytree._register_pytree_node(



In [8]:
# Module-level device read by get_CoLA_score (GPU when available, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Example argument values for get_CoLA_score (the same values are hard-coded
# in extract_good_candidates_by_LQ):
#saved_pretrained_CoLA_model_dir = "./model_esCola"
#model_name = "dccuchile/bert-base-spanish-wwm-cased"

def get_CoLA_score(candidates, model_name, saved_pretrained_CoLA_model_dir):
    """Score sentences with a fine-tuned BERT sequence classifier (CoLA task).

    Relies on the module-level ``device`` and on the names imported in the
    cell above (``tqdm``, ``np``, ``torch`` and the ``transformers`` classes).

    Args:
        candidates: List of sentence strings to score.
        model_name: Model identifier. It is only used to detect a TF
            checkpoint ('.ckpt' in the name) and, via the text before the
            first '-', to decide whether ``token_type_ids`` are passed.
        saved_pretrained_CoLA_model_dir: Directory given to ``from_pretrained``
            for the config, the tokenizer and the model weights.

    Returns:
        A one-element list ``[scores]`` where ``scores[i]`` is the raw logit
        at class index 1 for ``candidates[i]`` (presumably the "acceptable"
        class — confirm against the fine-tuning labels). ``[[]]`` is returned
        for empty input.
    """
    # Nothing to score: skip the expensive model load. Previously this path
    # crashed on `preds[:, 1]` because no batch ever populated `preds`.
    if len(candidates) == 0:
        return [[]]

    def _load_pretrained_model(model_name, saved_pretrained_CoLA_model_dir):
        """Load config, tokenizer and model from disk; model is put in eval mode on ``device``."""
        config = BertConfig.from_pretrained(saved_pretrained_CoLA_model_dir, num_labels=2, finetuning_task='CoLA')
        tokenizer = BertTokenizer.from_pretrained(saved_pretrained_CoLA_model_dir, do_lower_case=0)
        model = BertForSequenceClassification.from_pretrained(
            saved_pretrained_CoLA_model_dir,
            from_tf=bool('.ckpt' in model_name),
            config=config,
        ).to(device)
        model.eval()
        return tokenizer, model

    def _evaluate(model, candidates, tokenizer, model_name):
        """Run the classifier over all candidates and return the class-1 logits."""

        def __load_and_cache_examples(candidates, tokenizer):
            """Tokenize candidates (padded/truncated to 128 tokens) into a TensorDataset."""
            max_length = 128
            examples = [InputExample(guid=str(i), text_a=x) for i, x in enumerate(candidates)]
            features = glue_convert_examples_to_features(examples, tokenizer, label_list=["0", "1"], max_length=max_length, output_mode="classification")
            # Convert to Tensors and build dataset
            all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
            all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
            # Labels are dummies (all zeros): only the logits are used by the caller.
            all_labels = torch.zeros(len(features), dtype=torch.long)
            # Single-segment inputs, so every token type id is 0.
            all_token_type_ids = torch.zeros((len(features), max_length), dtype=torch.long)
            return torch.utils.data.TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

        eval_dataset = __load_and_cache_examples(candidates, tokenizer)
        eval_dataloader = torch.utils.data.DataLoader(
            eval_dataset,
            sampler=torch.utils.data.SequentialSampler(eval_dataset),
            batch_size=max(1, torch.cuda.device_count()),
        )

        model.eval()
        model_prefix = model_name.split('-')[0]
        # Collect per-batch logits and concatenate once, instead of re-copying
        # the whole array with np.append on every batch.
        batch_logits = []
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[3]}
                if model_prefix != 'distilbert':
                    # XLM, DistilBERT and RoBERTa don't use segment_ids.
                    # NOTE: a namespaced name such as "dccuchile/bert-..." does
                    # not match 'bert' here, so None is passed; since the ids
                    # built above are all zeros this should be equivalent for
                    # BERT (assumes the model defaults to zeros — confirm).
                    inputs['token_type_ids'] = batch[2] if model_prefix in ['bert', 'xlnet'] else None
                outputs = model(**inputs)
                # With labels supplied, the first two outputs are (loss, logits).
                _, logits = outputs[:2]

            batch_logits.append(logits.detach().cpu().numpy())

        preds = np.concatenate(batch_logits, axis=0)
        return preds[:, 1].tolist()

    tokenizer, model = _load_pretrained_model(model_name, saved_pretrained_CoLA_model_dir)
    temp_score = _evaluate(model, candidates, tokenizer, model_name)
    return [temp_score]


In [9]:
# Same definition as in the cell that defines get_CoLA_score; redundant once
# that cell has run.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from nltk.tokenize import sent_tokenize
import re
import string

def preprocess_candidates(candidates):
    """Normalise punctuation and whitespace of every candidate, in place.

    Each string is stripped, newlines become sentence breaks ('. '), runs of
    dots and spaces are collapsed, and "digits. digits" fragments (what a
    decimal number turns into after the dot normalisation) are replaced by
    'UNK'. A candidate that is empty after stripping becomes 'aaaaa'.

    Args:
        candidates: List of strings; it is modified in place.

    Returns:
        The same list object, with every element normalised.
    """
    split_number = re.compile(r'(\d+)\. (\d+)')
    # Applied strictly in this order; each step feeds the next one.
    substitutions = (
        ('\n\n', '. '),
        ('\n', '. '),
        ('..', '.'),
        ('.', '. '),
        ('. . ', '. '),
        ('.  . ', '. '),
    )

    for idx, raw in enumerate(candidates):
        text = raw.strip()
        for old, new in substitutions:
            text = text.replace(old, new)

        # Collapse any run of spaces down to a single space.
        while '  ' in text:
            text = text.replace('  ', ' ')

        # Mask every "digits. digits" fragment, one distinct match at a time.
        found = split_number.search(text)
        while found:
            text = text.replace(found.group(), 'UNK')
            found = split_number.search(text)

        if text == "":
            text = 'aaaaa'
        candidates[idx] = text.strip()
    return candidates


def sent_tokenize_candidate(candidates):
    """Split candidates into sentences, keeping only multi-word sentences.

    Args:
        candidates: Iterable of candidate strings.

    Returns:
        A tuple ``(sentences, counts)``: ``sentences`` is the flat list of
        kept sentences across all candidates, and ``counts[i]`` is how many
        of them came from ``candidates[i]`` (0 when none qualified).
    """
    # NOTE(review): sent_tokenize is called without a language argument, so
    # NLTK's default tokenizer is used (presumably English) even though the
    # data is Spanish — confirm this is intended.
    drop_punctuation = str.maketrans('', '', string.punctuation)

    processed_candidates = []
    sen_length = []
    for paragraph in candidates:
        # A sentence qualifies when more than one word remains once
        # punctuation is removed.
        kept = [
            sentence
            for sentence in sent_tokenize(paragraph)
            if len(sentence.translate(drop_punctuation).split()) > 1
        ]
        processed_candidates.extend(kept)
        sen_length.append(len(kept))
    return processed_candidates, sen_length


def convert_sentence_score_to_paragraph_score(temp_score, sen_length):
    """Average sentence-level scores back into one score per paragraph.

    Args:
        temp_score: List of score lists; each inner list holds the flat
            sentence scores of all paragraphs, in order.
        sen_length: Number of sentences each paragraph contributed.

    Returns:
        One list per entry of ``temp_score`` with one value per entry of
        ``sen_length``: the mean of that paragraph's sentence scores, or 0
        for a paragraph with no sentences.
    """
    def _paragraph_means(sentence_scores):
        means = []
        start = 0
        for count in sen_length:
            if count == 0:
                # Paragraph contributed no sentences: neutral score, and the
                # read position does not move.
                means.append(0)
            else:
                window = sentence_scores[start:start + count]
                means.append(sum(window) / len(window))
                start += count
        return means

    return [_paragraph_means(scores) for scores in temp_score]


def get_LQ_scores(candidates, model_name, saved_pretrained_CoLA_model_dir):
    """Compute one linguistic-quality score per candidate.

    Pipeline: normalise the candidates (in place), split them into sentences,
    score each sentence with get_CoLA_score, average the sentence scores per
    candidate, then rescale each value with ``max(0, score / 8 + 0.5)``.

    Args:
        candidates: List of candidate strings (modified in place by
            preprocess_candidates).
        model_name: Forwarded to get_CoLA_score.
        saved_pretrained_CoLA_model_dir: Forwarded to get_CoLA_score.

    Returns:
        List with one rescaled score per candidate.
    """
    cleaned = preprocess_candidates(candidates)
    sentences, sentences_per_candidate = sent_tokenize_candidate(cleaned)
    sentence_scores = get_CoLA_score(sentences, model_name, saved_pretrained_CoLA_model_dir)
    paragraph_scores = convert_sentence_score_to_paragraph_score(sentence_scores, sentences_per_candidate)
    # Re-scale the raw averages and clamp them at 0 from below.
    rescaled = [[max(0, value / 8.0 + 0.5) for value in row] for row in paragraph_scores]
    return rescaled[0]

def extract_good_candidates_by_LQ(candidates, LQ_thres, num_of_generation,
                                  model_name="dccuchile/bert-base-spanish-wwm-cased",
                                  saved_pretrained_CoLA_model_dir="./model_esCola"):
    """Keep the candidates whose linguistic-quality score exceeds a threshold.

    Only the first ``num_of_generation`` candidates are scored and filtered;
    the remaining ones are kept unconditionally. Duplicates are removed.

    Args:
        candidates: List of candidate strings.
        LQ_thres: A scored candidate is kept only if its score is strictly
            greater than this value.
        num_of_generation: Number of leading candidates to score.
        model_name: Forwarded to get_LQ_scores. Defaults to the Spanish model
            used so far; the English setup noted in the original code was
            'bert-base-cased' with './tmp/grammar_cola'.
        saved_pretrained_CoLA_model_dir: Directory of the fine-tuned
            classifier, forwarded to get_LQ_scores.

    Returns:
        De-duplicated list of kept candidates in first-occurrence order.
        Scored candidates come back in their preprocessed form, because
        get_LQ_scores normalises the sliced list in place.
    """
    # The slice is a copy, so the caller's list is not modified.
    to_test_candidates = candidates[:num_of_generation]
    LQ_scores = get_LQ_scores(to_test_candidates, model_name, saved_pretrained_CoLA_model_dir)
    kept = [text for text, score in zip(to_test_candidates, LQ_scores) if score > LQ_thres]
    # dict.fromkeys de-duplicates while keeping a deterministic order.
    # list(set(...)) produced a run-dependent order, which made the later
    # index-based selection steps non-reproducible.
    return list(dict.fromkeys(kept + candidates[num_of_generation:]))

In [None]:
# Prune up to the first 30000 candidates by grammaticality score (threshold 0.52).
# Rebinding `candidates` makes this cell non-idempotent: re-running it filters
# the already-filtered list.
candidates = extract_good_candidates_by_LQ(candidates, LQ_thres=0.52, num_of_generation=30000)

In [11]:
# Persist the pruned candidates. If the pickle already exists, save_data
# returns the cached list instead of the one just computed (as happened in
# the recorded output below).
candidates = save_data('backup/extracted_good_candidates_by_LQ_SP.pkl', candidates)
# Preview the first five candidates.
candidates[:5]

The file backup/extracted_good_candidates_by_LQ_SP.pkl exist and will returned


['esto no es cierto. la homosexualidad fue una carga para paz?',
 'El Islam es aceptación, no un culto a la muerte. En su opinión, ¿qué lo convierte en criminal?',
 'El sexismo es un fenómeno cultural, no genético.',
 'el feminismo es una religión de paz. desde nuestro país y están rodeados de trabajo para los ejemplo, desde una radicalización.',
 '¿Cómo es que nunca he oído hablar de algo así? El Islam es una religión y la gente puede elegir la suya.']

In [6]:
from utility.response_selection import keyword_based
from utility.response_selection import vector_based_JL
from utility.response_selection import method
from utils_JL import to_method_object

from tqdm import tqdm

In [None]:
# kpq: top-k candidates kept per query, to bound the cost of later ranking.
args_kpq = 300

# TF-IDF retrieval: fit on the training pairs, then shortlist candidates for
# every test query.
method = to_method_object('TF_IDF')
method.train(contexts_train, responses_train)
good_candidates_index = method.sort_responses(test_x_text, candidates, min(args_kpq, len(candidates)))
# Each entry of good_candidates_index is a sequence of indices into `candidates`.
good_candidates = [[candidates[y] for y in x] for x in good_candidates_index]

# save_data returns the cached pickle when one already exists, so
# good_candidates may be replaced by an earlier run's result.
good_candidates = save_data('backup/good_candidates.pkl', good_candidates)

# Preview the top five candidates of the second test query. This must be the
# last expression of the cell to be displayed; mid-cell it was silently
# discarded and could raise before the result was saved.
good_candidates[1][:5]


NameError: name 'args_kpq' is not defined

## Module 3: Response Selection (env: py7_tf1_v03)

In [None]:
#%% Dummy SentenceTransformer Multilingual
# Smoke test: load the same multilingual encoder name that the selection cells
# below pass to SentenceTransformerEncoder, and encode two sample sentences.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/use-cmlm-multilingual')
                  
sentences = ["This is an example sentence", "Each sentence is converted"]
embeddings = model.encode(sentences, show_progress_bar=False)
# Prints the raw embedding values for the two sentences.
print(embeddings)

In [None]:
from utility.response_selection import keyword_based
from utility.response_selection import method

from tqdm import tqdm
import pandas as pd
import numpy as np


In [None]:
# Repeats the configuration from the start of the notebook (Module 3 is marked
# as running in its own environment in the section header).
args_dataset = 'conansp' # originally documented: default='sample', choices=['reddit', 'gab', 'conan']; 'conansp' is presumably the Spanish CONAN set — confirm in utils_SP
# Top-k candidates kept per query.
args_kpq = 300

In [9]:
import os
import pickle

def save_data(file_path, data):
    """Load-or-create pickle cache.

    If ``file_path`` already exists, its pickled content is loaded and
    returned and ``data`` is ignored (an existing file is never overwritten).
    Otherwise ``data`` is pickled to ``file_path`` and returned unchanged.

    Args:
        file_path: Path of the pickle file used as cache.
        data: Object to store when no cache exists yet. ``None`` means
            "load only".

    Returns:
        The cached object if the file exists, else ``data``.

    Raises:
        FileNotFoundError: If the file does not exist and ``data`` is None.
            Writing ``None`` would poison the cache, because every later call
            would return it instead of real data.
    """
    if os.path.exists(file_path):
        print(f"The file {file_path} exist and will returned")
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # files produced by this notebook.
        with open(file_path, 'rb') as f:
            return pickle.load(f)

    if data is None:
        raise FileNotFoundError(
            f"No cached file at {file_path} and no data was given to create it."
        )

    # Create the target directory (e.g. 'backup/') on first use.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
    print(f"Data has been saved to {file_path}.")
    return data

In [10]:
# Reload the artifacts produced by the earlier modules. save_data is used as a
# loader here: each pickle is expected to exist already, in which case the
# second argument is ignored and the cached content is returned.
contexts_train = save_data('backup/contexts_train_SP.pkl', None)
responses_train = save_data('backup/responses_train_SP.pkl', None)

train_x_text = save_data('backup/train_x_text_SP.pkl', None)
train_y_text = save_data('backup/train_y_text_SP.pkl', None)
test_x_text = save_data('backup/test_x_text_SP.pkl', None)
test_y_text = save_data('backup/test_y_text_SP.pkl', None)

# Pruned candidate pool and the per-query TF-IDF shortlists.
candidates = save_data('backup/extracted_good_candidates_by_LQ_SP.pkl', None)
good_candidates = save_data('backup/good_candidates.pkl', None)

The file backup/contexts_train_SP.pkl exist and will returned
The file backup/responses_train_SP.pkl exist and will returned
The file backup/train_x_text_SP.pkl exist and will returned
The file backup/train_y_text_SP.pkl exist and will returned
The file backup/test_x_text_SP.pkl exist and will returned
The file backup/test_y_text_SP.pkl exist and will returned
The file backup/extracted_good_candidates_by_LQ_SP.pkl exist and will returned
The file backup/good_candidates.pkl exist and will returned


In [None]:
from utility.response_selection import vector_based_SP as vector_based

# Vector-similarity selection: encode with the multilingual sentence encoder
# and, for every test query, pick one response from that query's shortlist.
sentence_encoder = vector_based.SentenceTransformerEncoder('sentence-transformers/use-cmlm-multilingual')
method = vector_based.VectorSimilarityMethod(encoder=sentence_encoder)

method.train(contexts_train, responses_train)

output = []
for query_idx, query in enumerate(tqdm(test_x_text)):
    shortlist = good_candidates[query_idx]
    # rank_responses yields a single-element result: the index of the chosen
    # response within the shortlist.
    chosen_idx = method.rank_responses([query], shortlist).item()
    output.append(shortlist[chosen_idx])
print(output)

# Pair each query with its selected response and cache the table (an existing
# pickle takes precedence over the freshly built frame).
df_output = save_data(
    'backup/df_output_SIM_SP.pkl',
    pd.DataFrame({'x_text': test_x_text, 'output': output}),
)

I1130 19:41:41.328549 268057 vector_based_JL.py:64] Loading https://tfhub.dev/google/universal-sentence-encoder/2 model from tensorflow hub


USE_MAP


I1130 19:41:42.565969 268057 saver.py:1503] Saver not created because there are no variables in the graph to restore
I1130 19:41:42.620499 268057 vector_based_JL.py:69] Initializing graph.
I1130 19:41:53.463954 268057 vector_based_JL.py:261] Training on 2908 contexts and responses.
I1130 19:41:53.482603 268057 vector_based_JL.py:274] Encoding the train set.
100%|██████████| 30/30 [00:01<00:00, 21.67it/s]
I1130 19:41:54.882639 268057 vector_based_JL.py:263] Created a training set of size 2326, and a dev set of size 582.
W1130 19:41:54.901931 268057 deprecation.py:323] From /home/jaimelicea/Documentos/Git_Projects/GPS/utility/response_selection/vector_based_JL.py:298: DatasetV1.make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, 

['los musulmanes son bienvenidos en su patria y son una parte parte de nuestra sociedad.', 'Debemos preocuparnos por los niños, debemos intentar crear una sociedad más integrada. Ser sensibles con la gente es la forma de crearla.', 'No deberíamos tratar la enfermedad mental como un problema de la sociedad. No deberíamos intentar excluir a las personas, ya que la sociedad está hecha por y para las personas.', 'los emigrantes también son culpables de la vida"?', 'Los judíos han desempeñado un papel importante en el desarrollo de la industria cinematográfica y algunos ocupan puestos de alto nivel. Pero basta con ver los premios de la Academia para darse cuenta de que los judíos no dominan la industria cinematográfica. Del mismo modo, los judíos constituyen una pequeña minoría entre los directivos de las 500 empresas de Fortune.', 'En la práctica, los inmigrantes de muchos países se integran muy bien. Aprenden la lengua, la cultura y respetan la ley.', 'Hay personas con discapacidad intele




: 

In [None]:
# Re-issues the save from the previous cell. Once that pickle exists,
# save_data just reloads it, so this only rebinds df_output to the cached copy.
df_output = save_data('backup/df_output_SIM_SP.pkl', df_output)

In [None]:
from utility.response_selection import vector_based_SP as vector_based

# Vector-mapping selection: same flow as the similarity variant, but using
# VectorMappingMethod on top of the multilingual sentence encoder.
sentence_encoder = vector_based.SentenceTransformerEncoder('sentence-transformers/use-cmlm-multilingual')
method = vector_based.VectorMappingMethod(encoder=sentence_encoder)

method.train(contexts_train, responses_train)

output = []
for query_idx, query in enumerate(tqdm(test_x_text)):
    shortlist = good_candidates[query_idx]
    # rank_responses yields a single-element result: the index of the chosen
    # response within the shortlist.
    chosen_idx = method.rank_responses([query], shortlist).item()
    output.append(shortlist[chosen_idx])
print(output)

# Pair each query with its selected response and cache the table (an existing
# pickle takes precedence over the freshly built frame).
df_output = save_data(
    'backup/df_output_MAP_SP.pkl',
    pd.DataFrame({'x_text': test_x_text, 'output': output}),
)

In [None]:
# Re-issues the save from the previous cell. Once that pickle exists,
# save_data just reloads it, so this only rebinds df_output to the cached copy.
df_output = save_data('backup/df_output_MAP_SP.pkl', df_output)

In [None]:
#%% Load the saved outputs and merge them into one table
test_x_text = save_data('backup/test_x_text_SP.pkl', None)
test_y_text = save_data('backup/test_y_text_SP.pkl', None)

df_output_SIM = save_data('backup/df_output_SIM_SP.pkl', None)
df_output_MAP = save_data('backup/df_output_MAP_SP.pkl', None)

# Each reference entry is indexed with [0] to get its first element as the
# gold response string.
test_y_text_str = [references[0] for references in test_y_text]

# One row per test query: input, gold response, and both selected responses.
columns = {
    'x_text': test_x_text,
    'y_text': test_y_text_str,
    'output_SIM': df_output_SIM['output'],
    'output_MAP': df_output_MAP['output'],
}
df_output_SP = pd.DataFrame(columns)

df_output_SP = save_data('backup/df_output_SP.pkl', df_output_SP)