<a href="https://colab.research.google.com/github/josedossantos10/TuningSentenceModels/blob/PROPOR2024/tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PROPOR 2024
#### Link dos resultados --> [Planilha de resultados](https://docs.google.com/spreadsheets/d/19vP3u3dZFDkp8XztXWamhCwpjCk3tMvMW9m8FtnXQZw/edit#gid=1592378179)

Versão particionada

## Preparação

In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, util, SentencesDataset, InputExample, losses, models
from torch.utils.data import DataLoader
from google.colab import drive
import torch
import requests
from itertools import combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import shutil
import os
import re
from torch import nn
# Registry of results shared by the experiment cells:
# run key -> (zero-shot recall, fine-tuned recall).
all_models = dict()


In [None]:
# Create the working directories (relative to the current directory) if missing.
for directory in ('models', 'data'):
    os.makedirs(directory, exist_ok=True)
# drive.mount('/content/drive')

In [None]:
def get_file(url, folder='data'):
  """Download a shared file into ``/content/{folder}`` and unpack it if it is a zip.

  The query string of ``url`` is dropped and replaced by ``?download=1``; the
  local file name is taken from the response's ``Content-Disposition`` header.

  Args:
    url: Share link of the file to download.
    folder: Sub-directory of ``/content`` that receives the file.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    ValueError: If the response carries no usable file name.
  """
  import zipfile  # local import: only this helper needs it

  r = requests.get(url.split('?')[0] + '?download=1')
  # Fail loudly instead of saving an HTML error page as if it were the data.
  r.raise_for_status()

  d = r.headers.get('content-disposition', '')
  # Stop at the closing quote / ';' so trailing parameters are not captured.
  match = re.search(r'filename="?([^";]+)"?', d)
  if match is None:
    raise ValueError(f'No filename in Content-Disposition header for {url!r}: {d!r}')
  # basename() keeps a server-supplied name from escaping the target folder.
  fname = os.path.basename(match.group(1).strip())

  target_dir = f'/content/{folder}'
  os.makedirs(target_dir, exist_ok=True)
  path = f'{target_dir}/{fname}'
  with open(path, 'wb') as f:
    f.write(r.content)

  if fname.lower().endswith('.zip'):
    # zipfile overwrites existing members silently, so re-running the notebook
    # never blocks on an interactive "replace?" prompt.
    with zipfile.ZipFile(path) as archive:
      archive.extractall(target_dir)
    print('Unziped and',end=' ')
  print('Saved '+fname)

In [None]:
# Share links of the input files; each one is downloaded into the default
# 'data' folder (zip archives are unpacked by get_file).
shared_links = (
    'https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EQcLlSaOKPtCtzY3juKPkpYB6Rl9Jnp-xNimWaIcJNe_bg?e=jc0ZWR',
    'https://ufrpebr-my.sharepoint.com/:x:/g/personal/joseantonio_santos_ufrpe_br/EfciCkFDXkxAoKnvwW-O0FYB1YMf-iXIwD1nVdf0Ve_t8g?e=X1QPUi',
    'https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/ESigtraTg-xLjUAwHuEzF0sB-CkOhU75tNSZcomdbuvpxg?e=AcMaXi',
)
for link in shared_links:
    get_file(link)
# get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EQuYJnSjStVPmJ6Ec9-9Z3YBO32NpcNcCbbK0fad7NWTgw?download=1','models')


Archive:  /content/data/base_20230428_douglas-bill_corpus-parts.zip
replace /content/data/base_20230428_douglas-bill_corpus-part0.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace /content/data/base_20230428_douglas-bill_corpus-part1.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Unziped and Saved base_20230428_douglas-bill_corpus-parts.zip
Saved pares-arvores-proposicao-filter.csv
Archive:  /content/data/dados-conle-anonimizado-job_request-parts.zip
replace /content/data/dados-conle-anonimizado-job_request-part0.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: N
Unziped and Saved dados-conle-anonimizado-job_request-parts.zip


In [None]:
def build_model(version, mode=False):
    """Return a SentenceTransformer for the given short model name.

    Args:
        version: Short model key (e.g. 'sbert', 'lbert', 'labse', 'lbt').
            Any key not known to the selected mode is loaded from the local
            directory ``./models/{version}/``.
        mode: When truthy, assemble the model explicitly from a token
            encoder plus a pooling layer (the variant used for fine-tuning);
            otherwise load the checkpoint directly.

    Returns:
        The SentenceTransformer instance. ``max_seq_length`` is set to 512 on
        the model (``mode`` falsy) or on the token encoder (``mode`` truthy);
        a local-directory model loaded with ``mode`` truthy is returned as is.
    """
    local_path = f'./models/{version}/'

    if not mode:
        # Direct loading: short key -> checkpoint identifier.
        checkpoints = {
            'sbert': "neuralmind/bert-large-portuguese-cased",
            'lbert': "ulysses-camara/legal-bert-pt-br",
            'lbertt': "ulysses-camara/legal-bert-pt-br",
            'albert': "PORTULAN/albertina-ptbr",
            'allbert': "sentence-transformers/all-MiniLM-L6-v2",
            'disbert': "distilroberta-base",
            'labse': "sentence-transformers/LabSE",
            'lbt': "rufimelo/Legal-BERTimbau-large",
        }
        loaded = SentenceTransformer(checkpoints.get(version, local_path))
        loaded.max_seq_length = 512
        return loaded

    # Explicit assembly: short key -> (checkpoint, use CLS pooling instead of mean).
    backbones = {
        'sbert': ("neuralmind/bert-large-portuguese-cased", False),
        'labse': ("sentence-transformers/LabSE", True),
        'lbt': ("rufimelo/Legal-BERTimbau-large", False),
    }
    if version == 'lbert':
        # Reuse the first module of the published sentence model as token encoder.
        token_encoder = SentenceTransformer("ulysses-camara/legal-bert-pt-br")[0]
        cls_pooling = False
    elif version in backbones:
        checkpoint, cls_pooling = backbones[version]
        token_encoder = models.Transformer(checkpoint)
    else:
        return SentenceTransformer(local_path)

    pooling_layer = models.Pooling(token_encoder.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=not cls_pooling,
                                   pooling_mode_cls_token=cls_pooling,
                                   pooling_mode_max_tokens=False)
    token_encoder.max_seq_length = 512

    return SentenceTransformer(modules=[token_encoder, pooling_layer])

def finetuning(version, train_dataset, epochs=1, batch_size=2):
    """Fine-tune the model selected by ``version`` with a contrastive loss.

    Args:
        version: Short model key forwarded to ``build_model`` (explicit-assembly mode).
        train_dataset: Training examples, wrapped in a shuffling DataLoader.
        epochs: Number of epochs passed to ``fit``.
        batch_size: Batch size of the DataLoader.

    Returns:
        The model after ``fit`` has run.
    """
    loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    sentence_model = build_model(version, True)
    # Alternatives tried previously: losses.CosineSimilarityLoss, losses.MultipleNegativesRankingLoss.
    objective = (loader, losses.ContrastiveLoss(sentence_model))
    sentence_model.fit(train_objectives=[objective], epochs=epochs, show_progress_bar=True)
    return sentence_model

def get_name(df, doc, in_field="content", out_field = "name"):
    """Return, as a stripped string, ``out_field`` of the first row whose ``in_field`` equals ``doc``.

    Raises IndexError when no row matches.
    """
    matches = df.loc[df[in_field] == doc, out_field]
    first_value = matches.to_numpy()[0]
    return str(first_value).strip()
def recall(vector):
    """Fraction of ``(target, docs)`` pairs whose ``target`` appears in ``docs``.

    Args:
        vector: Sequence of ``(target, docs)`` pairs.

    Returns:
        The hit ratio as a float; ``0.0`` for an empty sequence (previously
        this raised ZeroDivisionError).
    """
    if len(vector) == 0:
        return 0.0
    hits = sum(1 for target, docs in vector if target in docs)
    return hits / len(vector)
def evaluate(model, top_k,corpus, queries):
    """Recall@top_k of ``model`` when retrieving corpus documents for each query.

    Every query and the whole corpus are embedded with ``model.encode``; the
    ``top_k`` corpus entries with the highest cosine similarity are mapped to
    their ``txtNome`` and compared with the query's ``NÚMERO-PROPOSIÇÃOSILEG``.

    NOTE: besides its arguments this function reads the notebook globals
    ``df`` (corpus frame), ``df_assunto`` (query frame) and ``column_data``
    (text column of ``df``); they must match ``corpus`` / ``queries``.

    Args:
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Number of documents retrieved per query; clamped to the corpus size.
        corpus: NumPy array of corpus texts (it is indexed with an index array).
        queries: Iterable of query texts.

    Returns:
        The recall over the queries that could be processed, or ``0.0`` when
        none could. Queries that raise are reported and left out of the ratio.
    """
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    # torch.topk raises when k exceeds the number of candidates; without the
    # clamp every query would fail on partitions smaller than top_k.
    k = min(top_k, len(corpus))
    res_vector = list()
    for query in queries:
        try:
            query_embedding = model.encode(query, convert_to_tensor=True)
            cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
            # .cpu() is a no-op for CPU tensors, so no CUDA branch is needed.
            top_indices = torch.topk(cos_scores, k=k).indices.cpu().numpy()

            # get_name already returns a stripped string.
            top_resuts_txt = [get_name(df, txt, in_field=column_data, out_field="txtNome") for txt in corpus[top_indices]]

            res_vector.append([get_name(df_assunto, query, "TxtAssunto", "NÚMERO-PROPOSIÇÃOSILEG"), top_resuts_txt])
        except Exception as e:
            # Best effort: report the failing query and keep going.
            print(f"ERRO ao realizar encoding:{e}")
    if not res_vector:
        # Nothing could be scored; avoid dividing by zero.
        return 0.0
    return recall(res_vector)

## Experimentos: LaBSE e Legal-BERTimbau

In [None]:
# --- Experiment configuration -------------------------------------------------
# column_data = 'txtEmenta'
# column_data = 'txtIndexacao'
# column_data = 'txtInteiroTeor'
column_data = 'txtInteiroTeorLimpo'
top_k = 20
runs = 5  # one run per partition file (part0 .. part{runs-1}); assumes that many parts exist
epochs = 1
version = 'labse'

# --- Training pairs -----------------------------------------------------------
read_df = (
    pd.read_csv('/content/data/pares-arvores-proposicao-filter.csv')
    .rename(columns={'imgArquivoTeorPDF_clean_1':'txtInteiroTeorLimpo_1','imgArquivoTeorPDF_clean_2':'txtInteiroTeorLimpo_2'})
    .dropna()
    .reset_index(drop=True)
)
read_df['label'] = read_df.label.apply(int)
train_dataset = read_df.apply(lambda x: InputExample(texts=(x[f'{column_data}_1'], x[f'{column_data}_2']), label=x['label']), axis=1)

# --- Models: both are built once and reused for every partition -----------------
model_tuned = finetuning(version, train_dataset, epochs=epochs, batch_size=2)
model_zero_shot = build_model(version)

# --- Evaluation per partition ---------------------------------------------------
run_keys = []  # keys written by THIS cell, so the summary ignores other experiments
for e in tqdm(range(0, runs)):
    element = f'{column_data}_{version}_{e}'
    # df, df_assunto and column_data are read as globals inside evaluate().
    df = pd.read_csv(f"/content/data/base_20230428_douglas-bill_corpus-part{e}.csv")
    df_assunto = pd.read_csv(f"/content/data/dados-conle-anonimizado-job_request-part{e}.csv", encoding="utf-8")
    queries = df_assunto["TxtAssunto"].to_numpy()
    corpus = df[column_data].to_numpy()
    zero_shot = evaluate(model_zero_shot,top_k,corpus, queries)
    tuned = evaluate(model_tuned,top_k,corpus, queries)
    all_models[element] = (zero_shot, tuned)
    run_keys.append(element)
    tqdm.write(f'{version} untuned Recall: {round(zero_shot*100,2)} tuned Recall:{round(tuned*100,2)}')

# --- Summary over this experiment only ------------------------------------------
recalls_zero = [all_models[key][0] for key in run_keys]
recalls_tuned = [all_models[key][1] for key in run_keys]
print(f'zero-shot --> Média: {round(np.average(recalls_zero)*100,2)}. Desvio Padrão: {np.std(recalls_zero)}')
print(f'finetuning --> Média: {round(np.average(recalls_tuned)*100,2)}. Desvio Padrão: {np.std(recalls_tuned)}')

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2164 [00:00<?, ?it/s]

 20%|██        | 1/5 [14:03<56:14, 843.58s/it]

labse untuned Recall: 40.0 tuned Recall:48.89


 40%|████      | 2/5 [27:58<41:54, 838.32s/it]

labse untuned Recall: 52.7 tuned Recall:66.22


 60%|██████    | 3/5 [42:05<28:04, 842.35s/it]

labse untuned Recall: 53.73 tuned Recall:70.15


 80%|████████  | 4/5 [56:00<13:59, 839.70s/it]

labse untuned Recall: 64.15 tuned Recall:83.02


100%|██████████| 5/5 [1:09:53<00:00, 838.80s/it]

labse untuned Recall: 60.71 tuned Recall:71.43
zero-shot --> Média: 54.26. Desvio Padrão: 0.08313131997507
finetuning --> Média: 67.94. Desvio Padrão: 0.11048433870929471





In [None]:
# Inspect the zero-shot recall left over from the most recent evaluation.
zero_shot

0.022222222222222223

In [None]:
# Inspect the fine-tuned recall left over from the most recent evaluation.
tuned

0.0

In [None]:
# Inspect the registry key of the most recent run.
element

['txtInteiroTeor_lbt_0']

In [None]:
# Store the most recent result in the registry. `element` is a one-item list in
# some cells and a plain string in others; indexing a string with [0] would
# file the result under its first character, so normalise the key first.
_last_key = element[0] if isinstance(element, (list, tuple)) else element
all_models[_last_key] = (zero_shot, tuned)

In [None]:
# --- Experiment configuration -------------------------------------------------
# column_data = 'txtEmenta'
# column_data = 'txtIndexacao'
column_data = 'txtInteiroTeor'
# column_data = 'txtInteiroTeorLimpo'
top_k = 20
runs = 5  # one run per partition file (part0 .. part{runs-1}); assumes that many parts exist
epochs = 1
version = 'lbt'

# --- Training pairs -----------------------------------------------------------
# NOTE(review): this CSV is not among the files reported as saved by the download
# cell above - confirm it is present (e.g. shipped inside one of the archives).
read_df = pd.read_csv('/content/data/base_20230428_douglas-pair_bill_corpus-lite.csv').dropna().reset_index(drop=True)
read_df['label'] = read_df.label.apply(int)
train_dataset = read_df.apply(lambda x: InputExample(texts=(x[f'{column_data}_1'], x[f'{column_data}_2']), label=x['label']), axis=1)

# --- Models ---------------------------------------------------------------------
# The training data does not change between partitions, so the model is
# fine-tuned once (as in the LaBSE experiment) instead of once per iteration.
model = finetuning(version, train_dataset, epochs=epochs, batch_size=2)
model_zero_shot = build_model(version)

# --- Evaluation per partition ---------------------------------------------------
run_keys = []  # keys written by THIS cell, so the summary ignores other experiments
for e in tqdm(range(0, runs)):
    # The key must be a string: a list is unhashable and cannot index all_models.
    element = f'{column_data}_{version}_{e}'
    # df, df_assunto and column_data are read as globals inside evaluate().
    df = pd.read_csv(f"/content/data/base_20230428_douglas-bill_corpus-part{e}.csv")
    df_assunto = pd.read_csv(f"/content/data/dados-conle-anonimizado-job_request-part{e}.csv", encoding="utf-8")
    queries = df_assunto["TxtAssunto"].to_numpy()
    corpus = df[column_data].to_numpy()
    # evaluate() requires the queries as its fourth argument.
    zero_shot = evaluate(model_zero_shot,top_k,corpus, queries)
    tuned = evaluate(model,top_k,corpus, queries)
    all_models[element] = (zero_shot, tuned)
    run_keys.append(element)
    tqdm.write(f'{version} untuned Recall: {round(zero_shot*100,2)} tuned Recall:{round(tuned*100,2)}')

# --- Summary over this experiment only ------------------------------------------
recalls_zero = [all_models[key][0] for key in run_keys]
recalls_tuned = [all_models[key][1] for key in run_keys]
print(f'zero-shot --> Média: {round(np.average(recalls_zero)*100,2)}. Desvio Padrão: {np.std(recalls_zero)}')
print(f'finetuning --> Média: {round(np.average(recalls_tuned)*100,2)}. Desvio Padrão: {np.std(recalls_tuned)}')

### Fim

