<a href="https://colab.research.google.com/github/josedossantos10/TuningSentenceModels/blob/main/tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, util, SentencesDataset, InputExample, losses, models
from torch.utils.data import DataLoader
from google.colab import drive
import torch
import requests
from itertools import combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import shutil
import os
import re
from torch import nn


Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_

In [None]:
# Create the working folders (relative to the current working directory) used for downloaded data and models.
os.makedirs('models',exist_ok=True)
os.makedirs('data',exist_ok=True)
# Google Drive is not mounted by default; the backup cells that copy into /content/drive need this line re-enabled.
# drive.mount('/content/drive')

In [None]:
# Name of the corpus column that is embedded; the training-pair file is read through
# f'{column_data}_1' and f'{column_data}_2'.
column_data = 'txtEmenta'
# Alternative columns, kept for quick switching:
# column_data = 'txtIndexacao'
# column_data = 'summary_red_clean'
# column_data = 'imgArquivoTeorPDF'
# column_data = 'artPrimeiro'

In [None]:
def get_file(url, folder='data'):
  """Download a shared file into ``/content/<folder>/`` and unpack it when it is a zip.

  The query string of ``url`` is replaced by ``?download=1`` and the local
  file name is taken from the response's ``Content-Disposition`` header.

  Args:
    url: Sharing link of the file to fetch.
    folder: Sub-folder of ``/content`` that receives the file (created if missing).

  Raises:
    requests.HTTPError: If the server answers with an error status.
    ValueError: If the response carries no file name.
  """
  import zipfile  # local import: only this helper needs it

  r = requests.get(url.split('?').pop(0)+'?download=1', timeout=60)
  # Fail loudly instead of saving an HTML error page under the expected name.
  r.raise_for_status()
  d = r.headers.get('content-disposition', '')
  found = re.findall("filename=(.+)", d)
  if not found:
    raise ValueError(f'No file name in the Content-Disposition header for {url}')
  # The name comes from the server: keep only its base name so it cannot point outside the target folder.
  fname = os.path.basename(found[0].replace('"',''))
  target_dir = f'/content/{folder}'
  os.makedirs(target_dir, exist_ok=True)
  target = f'{target_dir}/{fname}'
  with open(target,'wb') as f:
    f.write(r.content)
  if '.zip' in fname:
    # Extract with the standard library rather than interpolating a remote file name into a shell command.
    with zipfile.ZipFile(target) as archive:
      archive.extractall(target_dir)
    print('Unziped and',end=' ')
  print('Saved '+fname)

In [None]:
# Fetch the three CSV inputs into get_file's default folder ('data').
get_file('https://ufrpebr-my.sharepoint.com/:x:/g/personal/joseantonio_santos_ufrpe_br/EXl01mMQdhpFu0CTJ6tFjYsBoUTgfjumQzQLA9fGHGm3ZA?e=9cr13l')
get_file('https://ufrpebr-my.sharepoint.com/:x:/g/personal/joseantonio_santos_ufrpe_br/EfYCzAxsnF9Koduq3Pdke1cBoTgz1lj4i2POjt9s0-7zgA?e=MGsZTT')
get_file('https://ufrpebr-my.sharepoint.com/:x:/g/personal/joseantonio_santos_ufrpe_br/EY3Vpq_oC7BEljHgoDCDAWUBuEu4UBMI-HGLK-TrPJdvbA?e=5bIflO')
# Disabled: optional download into the 'models' folder.
# get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EQuYJnSjStVPmJ6Ec9-9Z3YBO32NpcNcCbbK0fad7NWTgw?download=1','models')


Saved base_20230428_douglas-bill_corpus.csv
Saved base_20230428_douglas-pair_bill_corpus-2.csv
Saved dados-conle-anonimizado-job_request.csv


In [None]:
def build_model(version, mode=False):
    """Build the SentenceTransformer identified by ``version``.

    The preset versions ('sbert', 'lbert', 'labse', 'lbt') are assembled from
    a word-embedding model with ``max_seq_length`` 512 followed by a pooling
    layer. Any other value is treated as the name of a model stored under
    ``./models/<version>/`` and loaded as-is.

    Args:
        version: Preset key or name of a locally stored model.
        mode: Unused; kept so calls such as ``build_model(version, True)``
            keep working.

    Returns:
        A ``SentenceTransformer`` instance.
    """
    # version -> (hub id, take module 0 of a packaged SentenceTransformer, use CLS pooling instead of mean)
    presets = {
        'sbert': ("neuralmind/bert-large-portuguese-cased", False, False),
        'lbert': ("ulysses-camara/legal-bert-pt-br", True, False),
        'labse': ("sentence-transformers/LabSE", False, True),
        'lbt': ("rufimelo/Legal-BERTimbau-large", False, False),
    }
    if version not in presets:
        return SentenceTransformer(f'./models/{version}/')

    hub_id, from_packaged_model, cls_pooling = presets[version]
    if from_packaged_model:
        # Reuse only the first module of the packaged model as the word-embedding model.
        word_embedding_model = SentenceTransformer(hub_id)[0]
    else:
        # Use a Huggingface/transformers model for mapping tokens to embeddings.
        word_embedding_model = models.Transformer(hub_id)
    word_embedding_model.max_seq_length=512

    # Exactly one pooling strategy is active: the CLS token for 'labse', the token mean otherwise.
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                pooling_mode_mean_tokens=not cls_pooling,
                                pooling_mode_cls_token=cls_pooling,
                                pooling_mode_max_tokens=False)

    # The Dense layer that used to be built here was never added to the model, so it is no longer constructed.
    return SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Building dataloader and trianing model
    # if version=='sbert':
    #     model = SentenceTransformer("neuralmind/bert-large-portuguese-cased")
    # elif version=='lbert':
    #     model = SentenceTransformer("ulysses-camara/legal-bert-pt-br")
    # elif version=='lbertt':
    #     model = SentenceTransformer("ulysses-camara/legal-bert-pt-br")
    # elif version=='albert':
    #     model = SentenceTransformer("PORTULAN/albertina-ptbr")
    # elif version=='allbert':
    #     model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    # elif version=='disbert':
    #     model = SentenceTransformer("distilroberta-base")
    # elif version=='labse':
    #     model = SentenceTransformer("sentence-transformers/LabSE")
    #     model.max_seq_length=512
    # elif version=='lbt':
    #     model = SentenceTransformer("rufimelo/Legal-BERTimbau-large")
    # else:
    #     model = SentenceTransformer(f"./models/{version}/")
    # return model

def finetuning(version, train_dataset, epochs=1, batch_size=1, warmup_steps=100):
    """Fine-tune a freshly built ``version`` model on labelled text pairs.

    Args:
        version: Model key passed to ``build_model``.
        train_dataset: Indexable collection of ``InputExample`` pairs.
        epochs: Number of passes over ``train_dataset``.
        batch_size: Pairs per optimisation step.
        warmup_steps: Learning-rate warm-up steps handed to ``model.fit``
            (previously fixed at 100, which remains the default).

    Returns:
        The fine-tuned ``SentenceTransformer``.
    """
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    # A new model is built for every call, so repeated runs never share weights.
    model = build_model(version, True)
    train_loss = losses.ContrastiveLoss(model)
    # Alternatives that were tried:
    # train_loss = losses.CosineSimilarityLoss(model)
    # train_loss = losses.MultipleNegativesRankingLoss(model)
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              epochs=epochs,
              warmup_steps=warmup_steps,
              show_progress_bar=True)
    return model
    # if version=='sbert':
    #     model.save(f"./../models/{str(column_data)}/sentence_bert_tuned")
    # elif version=='lbert':
    #     model.save(f"./../models/{str(column_data)}/legal_bert_tuned")
def get_name(df, doc, in_field="content", out_field = "name"):
    """Return the stripped ``out_field`` value of the first row whose ``in_field`` equals ``doc``.

    Args:
        df: DataFrame to search.
        doc: Value to look up in column ``in_field``.
        in_field: Column compared against ``doc``.
        out_field: Column whose value is returned.

    Returns:
        The matching value converted to ``str`` with surrounding whitespace removed.

    Raises:
        IndexError: If no row matches (same exception type as before, now
            with a message that names the column and the value).
    """
    values = df[df[in_field]==doc][out_field].to_numpy()
    if values.size == 0:
        raise IndexError(f"no row where {in_field} == {str(doc)[:80]!r}")
    return str(values[0]).strip()
def recall(vector):
    """Fraction of ``(target, docs)`` entries whose ``target`` occurs in ``docs``.

    Args:
        vector: Sequence of ``(target, docs)`` pairs.

    Returns:
        Hits divided by the number of entries; ``0.0`` for an empty sequence
        (which used to raise ``ZeroDivisionError``).
    """
    if len(vector) == 0:
        return 0.0
    hits = sum(1 for (target, docs) in vector if target in docs)
    return hits/len(vector)
def evaluate(model, top_k,corpus):
    """Compute the retrieval recall of ``model`` over the notebook's queries.

    For every entry of the global ``queries`` the ``top_k`` most
    cosine-similar ``corpus`` texts are mapped to their ``txtNome`` in the
    global ``df`` and compared with the query's ``NÚMERO-PROPOSIÇÃOSILEG``
    value looked up in the global ``df_assunto``.

    Args:
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Number of corpus texts retrieved per query (capped at the corpus size).
        corpus: NumPy array of texts taken from ``df[column_data]``.

    Returns:
        Share of successfully processed queries whose expected name is among
        the retrieved names; ``0.0`` when no query could be processed.
    """
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    # torch.topk raises when k is larger than the number of candidates.
    k = min(top_k, len(corpus))
    res_vector = list()
    failed = 0
    for query in queries:
        try:
            query_embedding = model.encode(query, convert_to_tensor=True)
            cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
            top_results = torch.topk(cos_scores, k=k)

            # .cpu() leaves CPU tensors untouched, so a single path serves both devices.
            top_indices = top_results[1].cpu().numpy()
            top_results_txt = [get_name(df, txt, in_field=column_data, out_field="txtNome") for txt in corpus[top_indices]]

            res_vector.append([get_name(df_assunto, query, "TxtAssunto", "NÚMERO-PROPOSIÇÃOSILEG"), top_results_txt])
        except Exception as e:
            # Best effort: a failing query is skipped, but counted so the reader knows the denominator shrank.
            failed += 1
            print(f"ERRO ao realizar encoding:{e}")
    if failed:
        print(f"{failed} of {len(queries)} queries were skipped and are not part of the recall")
    if not res_vector:
        return 0.0
    return recall(res_vector)

In [None]:
# Load the labelled bill pairs; rows with any missing value are discarded.
read_df = pd.read_csv('/content/data/base_20230428_douglas-pair_bill_corpus-2.csv').dropna().reset_index(drop=True)
read_df['label'] = read_df['label'].astype(int)
# One InputExample per row, stored in a plain list (like the dataset built in the legacy section)
# so DataLoader indexes it by position rather than through pandas label lookup.
train_dataset = [
    InputExample(texts=(text_1, text_2), label=label)
    for text_1, text_2, label in zip(read_df[f'{column_data}_1'], read_df[f'{column_data}_2'], read_df['label'])
]
print(len(train_dataset))
read_df[:3]

18534


Unnamed: 0,label,codProposicao_1,txtEmenta_1,txtIndexacao_1,txtInteiroTeor_1,txtInteiroTeorLimpo_1,codProposicao_2,txtEmenta_2,txtIndexacao_2,txtInteiroTeor_2,txtInteiroTeorLimpo_2
0,1,26624,Concede isenção de pagamento de pedágio para o...,"Alteração, decreto-lei federal, isenção, pagam...",8076484952461001119923552 \nPROJETO DE LEI Nº ...,8076484952461001119923552 PROJETO DE LEI Nº DE...,26624,Concede isenção de pagamento de pedágio para o...,"Alteração, decreto-lei federal, isenção, pagam...",8076484952461001119923552 \nPROJETO DE LEI Nº ...,8076484952461001119923552 PROJETO DE LEI Nº DE...
1,1,28414,Dispõe sobre a proibição ao descarte de embriõ...,"Proibição, descarte, embrião, inseminação arti...","PROJETO DE LEI Nº , DE 2001 \...",PROJETO DE LEI Nº DE 2001 Do Sr. Lamartine Pos...,28414,Dispõe sobre a proibição ao descarte de embriõ...,"Proibição, descarte, embrião, inseminação arti...","PROJETO DE LEI Nº , DE 2001 \...",PROJETO DE LEI Nº DE 2001 Do Sr. Lamartine Pos...
2,1,29532,Altera o inciso XIV do art. 6º da Lei nº 7.713...,"Alteração,Legislação Tributária Federal, isenç...","PROJETO DE LEI N° , DE 2001 ...",PROJETO DE LEI N DE 2001 Do Sr. FEU ROSA Alter...,29532,Altera o inciso XIV do art. 6º da Lei nº 7.713...,"Alteração,Legislação Tributária Federal, isenç...","PROJETO DE LEI N° , DE 2001 ...",PROJETO DE LEI N DE 2001 Do Sr. FEU ROSA Alter...


In [None]:
# df: bill corpus searched by evaluate(); df_assunto: job requests whose 'TxtAssunto' column supplies the queries.
df = pd.read_csv("/content/data/base_20230428_douglas-bill_corpus.csv")
df_assunto = pd.read_csv("/content/data/dados-conle-anonimizado-job_request.csv", encoding="utf-8")
print(df.shape)
df[:3]

(56603, 10)


Unnamed: 0,codProposicao,txtSiglaTipo,numAno,numNumero,txtNome,txtEmenta,txtExplicacaoEmenta,txtIndexacao,txtInteiroTeor,txtInteiroTeorLimpo
0,16357,PL,1999,1165,PL 1165/1999,"Altera dispositivo da Lei nº 8.987, de 13 de f...",Estabelece que as concessionárias disponibiliz...,"Alteração, Lei das Concessões de Serviços Públ...",Ofício nº 1416 (SF) ...,"Ofício nº 1416 SF Brasília, em 17 de julho de ..."
1,19098,PL,1992,3097,PL 3097/1992,Dispõe sobre a eleição de diretores de fundos ...,,"NORMAS, ELEIÇÃO DIRETA, EMPREGADO, APOSENTADO,...",COMISSÃO DE CONSTITUIÇÃO E JUSTIÇA E DE REDAÇÃ...,COMISSÃO DE CONSTITUIÇÃO E JUSTIÇA E DE REDAÇÃ...
2,20464,PL,2000,3927,PL 3927/2000,Altera a composição dos Tribunais Regionais do...,"Altera a composição do TRT da 5ª região, 6ª re...","Alteração, Lei Federal, composição, Tribunal R...","COMISSÃO DE TRABALHO, DE ADMINISTRAÇÃO E SERVI...","COMISSÃO DE TRABALHO, DE ADMINISTRAÇÃO E SERVI..."


In [None]:
# Check only: the dropna result is printed, not assigned, so df itself is left unchanged.
print(df.dropna(subset=['txtIndexacao']).shape)#(inplace=True)
df.shape

(56603, 10)


(56603, 10)

In [None]:
# Globals read by evaluate(): the query texts and the corpus texts of the selected column.
queries = df_assunto["TxtAssunto"].to_numpy()
corpus = df[column_data].to_numpy()

In [None]:
# Display the corpus column currently selected.
column_data

'txtEmenta'

In [None]:
# Number of queries used by evaluate().
len(queries)

295

In [None]:
# Experiment: fine-tune 'lbert' `runs` times and report recall@top_k per run plus mean / std.
# The untuned recall is printed as a percentage; the per-run tuned recall as a fraction.
top_k = 20
runs = 5
epochs = 1
all_models = {}
version = 'lbert'
print(f'{version} untuned Recall: {round(evaluate(build_model(version),top_k,corpus)*100,2)}')

for e in tqdm(range(0, runs)):
    model = finetuning(version, train_dataset, epochs=epochs, batch_size=2)
    result = evaluate(model,top_k,corpus)
    # Park the finished model in host memory: keeping every run's model on the GPU
    # makes later runs fail with an out-of-memory error.
    model.to('cpu')
    all_models[e] = (model, result)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
print(f'Média: {round(np.average(recalls)*100,2)}')
print(f'Desvio Padrão: {np.std(recalls)}')

lbert untuned Recall: 29.49


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 20%|██        | 1/5 [22:49<1:31:16, 1369.09s/it]

lbert tuned Recall:0.09830508474576272


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 40%|████      | 2/5 [45:51<1:08:50, 1376.72s/it]

lbert tuned Recall:0.10508474576271186


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 60%|██████    | 3/5 [1:08:56<46:01, 1380.63s/it]  

lbert tuned Recall:0.09491525423728814


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 80%|████████  | 4/5 [1:31:56<23:00, 1380.30s/it]

lbert tuned Recall:0.2033898305084746


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

100%|██████████| 5/5 [1:54:55<00:00, 1379.11s/it]

lbert tuned Recall:0.13559322033898305
Média: 12.75
Desvio Padrão: 0.040598793007674217





In [None]:
# Experiment: fine-tune 'sbert' `runs` times and report recall@top_k per run plus mean / std.
# The untuned recall is printed as a percentage; the per-run tuned recall as a fraction.
top_k = 20
runs = 5
epochs = 1
all_models = {}
version = 'sbert'
print(f'{version} untuned Recall: {round(evaluate(build_model(version),top_k,corpus)*100,2)}')

for e in tqdm(range(0, runs)):
    model = finetuning(version, train_dataset, epochs=epochs, batch_size=2)
    result = evaluate(model,top_k,corpus)
    # Park the finished model in host memory: keeping every run's model on the GPU
    # is what exhausted device memory during the fourth run of this cell.
    model.to('cpu')
    all_models[e] = (model, result)
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
print(f'Média: {round(np.average(recalls)*100,2)}')
print(f'Desvio Padrão: {np.std(recalls)}')

Downloading (…)okenizer_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

sbert untuned Recall: 21.69


  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 20%|██        | 1/5 [1:02:21<4:09:24, 3741.04s/it]

sbert tuned Recall:0.0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 40%|████      | 2/5 [2:04:12<3:06:10, 3723.46s/it]

sbert tuned Recall:0.0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 60%|██████    | 3/5 [3:06:19<2:04:10, 3725.44s/it]

sbert tuned Recall:0.0


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9267 [00:00<?, ?it/s]

 60%|██████    | 3/5 [3:12:03<2:08:02, 3841.03s/it]


OutOfMemoryError: ignored

In [None]:
# Experiment: one fine-tuning run (itera = 1) of 'sbert' for 2 epochs, using finetuning's default batch size.
# The untuned recall is printed as a percentage; the tuned recall as a fraction.
top_k = 20
itera = 1
epochs = 2
all_models = {}
version = 'sbert'
print(f'{version} untuned Recall: {round(evaluate(build_model(version),top_k,corpus)*100,2)}')
for e in tqdm(range(0, itera)):
    model = finetuning(version,train_dataset, epochs=epochs)
    result = evaluate(model,top_k,corpus)
    # Keep the model with its score; later cells read all_models[0][0].
    all_models[e] = (model, result)
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
# Mean (as a percentage) and standard deviation (as a fraction) over the runs.
print(f'Média: {round(np.average(recalls)*100,2)}')
print(f'Desvio Padrão: {np.std(recalls)}')



sbert untuned Recall: 29.28


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4327 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4327 [00:00<?, ?it/s]

100%|██████████| 1/1 [45:43<00:00, 2743.43s/it]

sbert tuned Recall:0.5414364640883977
Média: 54.14
Desvio Padrão: 0.0





In [None]:
# Experiment: one fine-tuning run (itera = 1) of 'sbert' for 3 epochs, using finetuning's default batch size.
# The untuned recall is printed as a percentage; the tuned recall as a fraction.
top_k = 20
itera = 1
epochs = 3
all_models = {}
version = 'sbert'
print(f'{version} untuned Recall: {round(evaluate(build_model(version),top_k,corpus)*100,2)}')
for e in tqdm(range(0, itera)):
    model = finetuning(version,train_dataset, epochs=epochs)
    result = evaluate(model,top_k,corpus)
    # Keep the model with its score; later cells read all_models[0][0].
    all_models[e] = (model, result)
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
# Mean (as a percentage) and standard deviation (as a fraction) over the runs.
print(f'Média: {round(np.average(recalls)*100,2)}')
print(f'Desvio Padrão: {np.std(recalls)}')



sbert untuned Recall: 29.28


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4327 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4327 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4327 [00:00<?, ?it/s]

100%|██████████| 1/1 [1:06:05<00:00, 3965.42s/it]

sbert tuned Recall:0.0055248618784530384
Média: 0.55
Desvio Padrão: 0.0





In [None]:
# Download a zipped model into the 'models' folder; get_file unpacks zip archives after saving them.
get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EboMgmFs7QxDiVYIZGVJQIQBTxTsRW6iWkNjCXu7k7_iJA?e=5apeJT','models')

Archive:  /content/models/triplets.zip
   creating: /content/models/triplets/1_Pooling/
  inflating: /content/models/triplets/1_Pooling/config.json  
   creating: /content/models/triplets/2_Dense/
  inflating: /content/models/triplets/2_Dense/config.json  
  inflating: /content/models/triplets/2_Dense/pytorch_model.bin  
   creating: /content/models/triplets/3_Normalize/
  inflating: /content/models/triplets/config.json  
  inflating: /content/models/triplets/config_sentence_transformers.json  
  inflating: /content/models/triplets/modules.json  
  inflating: /content/models/triplets/pytorch_model.bin  
  inflating: /content/models/triplets/README.md  
  inflating: /content/models/triplets/sentence_bert_config.json  
  inflating: /content/models/triplets/special_tokens_map.json  
  inflating: /content/models/triplets/tokenizer.json  
  inflating: /content/models/triplets/tokenizer_config.json  
  inflating: /content/models/triplets/vocab.txt  
Unziped and Saved triplets.zip


In [None]:
# Download a zipped model into the 'models' folder; get_file unpacks zip archives after saving them.
get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/ESvUmdQHa4pAuP20ILdXvIkBUNXy917ZQqi0FLEgvWoz3Q?e=tM2Gjh',
         'models')

Archive:  /content/models/Model_A.zip
   creating: /content/models/Model_A/1_Pooling/
  inflating: /content/models/Model_A/1_Pooling/config.json  
   creating: /content/models/Model_A/2_Dense/
  inflating: /content/models/Model_A/2_Dense/config.json  
  inflating: /content/models/Model_A/2_Dense/pytorch_model.bin  
   creating: /content/models/Model_A/3_Normalize/
  inflating: /content/models/Model_A/config.json  
  inflating: /content/models/Model_A/config_sentence_transformers.json  
   creating: /content/models/Model_A/eval/
  inflating: /content/models/Model_A/eval/similarity_evaluation_results.csv  
  inflating: /content/models/Model_A/modules.json  
  inflating: /content/models/Model_A/pytorch_model.bin  
  inflating: /content/models/Model_A/README.md  
  inflating: /content/models/Model_A/sentence_bert_config.json  
  inflating: /content/models/Model_A/special_tokens_map.json  
  inflating: /content/models/Model_A/tokenizer.json  
  inflating: /content/models/Model_A/tokenizer_c

In [None]:
# Download a zipped model into the 'models' folder; get_file unpacks zip archives after saving them.
get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EfJtKjqIQ7ZNsXgG941vn14BWa-dUuDwQtk0YfQhkIbmLA?e=eAlzQp',
         'models')


Archive:  /content/models/lr56_random_double.zip
   creating: /content/models/lr56_random_double/1_Pooling/
  inflating: /content/models/lr56_random_double/1_Pooling/config.json  
   creating: /content/models/lr56_random_double/2_Dense/
  inflating: /content/models/lr56_random_double/2_Dense/config.json  
  inflating: /content/models/lr56_random_double/2_Dense/pytorch_model.bin  
   creating: /content/models/lr56_random_double/3_Normalize/
  inflating: /content/models/lr56_random_double/config.json  
  inflating: /content/models/lr56_random_double/config_sentence_transformers.json  
   creating: /content/models/lr56_random_double/eval/
  inflating: /content/models/lr56_random_double/eval/similarity_evaluation_results.csv  
  inflating: /content/models/lr56_random_double/modules.json  
  inflating: /content/models/lr56_random_double/pytorch_model.bin  
  inflating: /content/models/lr56_random_double/README.md  
  inflating: /content/models/lr56_random_double/sentence_bert_config.json  


In [None]:
import gdown
import zipfile

# Fetch the lr56rd archive from Google Drive and unpack it into the current working directory.
url = 'https://drive.google.com/u/1/uc?id=1f4pk6S1mAJyuRUg9wwhCbF5QfiJiHLqg&export=download'
output = 'models/lr56rd.zip'
downloaded = gdown.download(url, output, quiet=False)
# Only unpack when the download produced a file; a denied link leaves nothing to extract.
if downloaded is None or not os.path.exists(output):
    print(f'Download failed; nothing to unzip at {output}')
else:
    with zipfile.ZipFile(output) as archive:
        archive.extractall()

Access denied with the following error:
unzip:  cannot find or open models/lr56rd.zip, models/lr56rd.zip.zip or models/lr56rd.zip.ZIP.



 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/u/1/uc?id=1f4pk6S1mAJyuRUg9wwhCbF5QfiJiHLqg&export=download 



In [None]:
# Display the module layout of the most recently assigned `model` (depends on which cell ran last).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 1024, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

Archive:  /content/models/Model_A.zip
caution: filename not matched:  /content/models/


In [None]:
# Experiment: one fine-tuning run (itera = 1) of 'labse' for 1 epoch, using finetuning's default batch size.
# The untuned recall is printed as a percentage; the tuned recall as a fraction.
top_k = 20
itera = 1
epochs = 1
all_models = {}
version = 'labse'
print(f'{version} untuned Recall: {round(evaluate(build_model(version),top_k,corpus)*100,2)}')
for e in tqdm(range(0, itera)):
    model = finetuning(version,train_dataset, epochs=epochs)
    result = evaluate(model,top_k,corpus)
    # Keep the model with its score; later cells read all_models[0][0].
    all_models[e] = (model, result)
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
# Mean (as a percentage) and standard deviation (as a fraction) over the runs.
print(f'Média: {round(np.average(recalls)*100,2)}')
print(f'Desvio Padrão: {np.std(recalls)}')

labse untuned Recall: 45.86


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4577 [00:00<?, ?it/s]

In [None]:
# NOTE(review): `dd` is not assigned anywhere in the visible notebook, so this raises NameError on a
# fresh kernel — looks like leftover debugging; confirm and remove.
print(dd)

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer("rufimelo/Legal-BERTimbau-large")

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer("rufimelo/Legal-BERTimbau-sts-base-ma-v2")

Downloading (…)b338f/.gitattributes:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)9f605b338f/README.md:   0%|          | 0.00/7.27k [00:00<?, ?B/s]

Downloading (…)605b338f/config.json:   0%|          | 0.00/898 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)v.tsv.gz_results.csv:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

Downloading (…)n-en.txt_results.csv:   0%|          | 0.00/8.05k [00:00<?, ?B/s]

Downloading (…)v.tsv.gz_results.csv:   0%|          | 0.00/963 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)b338f/tokenizer.json:   0%|          | 0.00/678k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

Downloading (…)9f605b338f/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)05b338f/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer('embedding-data/deberta-sentence-transformer')

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer("distilroberta-base")

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer("neuralmind/bert-base-portuguese-cased")

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Load the checkpoint so the cell displays the resulting model's module layout.
SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [None]:
# Re-evaluate the first stored model on the 'txtEmenta' column. all_models holds models returned by
# finetuning(), so the score is labelled "tuned" (the label used to read "untuned").
column_data = 'txtEmenta'
print(f"{version} tuned Recall: {evaluate(all_models[0][0],top_k,df[column_data].to_numpy())}")

In [None]:
# Re-evaluate the first stored model on the 'imgArquivoTeorPDF' column. all_models holds models returned
# by finetuning(), so the score is labelled "tuned" (the label used to read "untuned").
# NOTE(review): the corpus preview printed when `df` was loaded lists no 'imgArquivoTeorPDF' column —
# confirm the column exists in the file being used, otherwise this lookup raises KeyError.
column_data = 'imgArquivoTeorPDF'
print(f"{version} tuned Recall: {evaluate(all_models[0][0],top_k,df[column_data].to_numpy())}")

In [None]:
# Experiment: one fine-tuning run (itera = 1) of 'sbert' for 2 epochs, using finetuning's default batch size.
# Unlike the earlier experiment cells, every score here is printed as a raw fraction (no *100 scaling).
top_k = 20
itera = 1
epochs = 2
all_models = {}
version = 'sbert'

print(f'{version} untuned Recall: {evaluate(build_model(version),top_k,corpus)}')
for e in tqdm(range(0, itera)):
    model = finetuning(version,train_dataset, epochs=epochs)
    result = evaluate(model,top_k,corpus)
    # Keep the model with its score for later inspection.
    all_models[e] = (model, result)
    tqdm.write(f'{version} tuned Recall:{result}')
recalls = [all_models[e][1] for e in all_models]
# Mean and standard deviation over the runs.
print(f'Média: {np.average(recalls)}')
print(f'Desvio Padrão: {np.std(recalls)}')

# Old (legacy fine-tuning pipeline on the Ulysses-RFCorpus feedback data)

In [None]:

# Configuration of the legacy pipeline.
# lbert - to ulysses legal bert
# sbert  - to bertimbau
#version= 'lbert'
level = 5
mode = False
epochs = 1
# Fall back to the CPU when no GPU is attached instead of failing once a model is moved to "cuda".
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
column_data = 'txtEmenta'
#column_data2 = 'txtIndexacao'
# Model versions to fine-tune, keyed by the corpus column they are trained on.
versions = {#'txtEmenta+txtIndexacao':['lr56_random_double','lbert', 'triplets','Model_A','Model_B','Model_C','Model_D'],
            'txtEmenta':['albert'],
            #'imgArquivoTeorPDF':['lr56_random_double', 'triplets','Model_A','Model_B','Model_C','Model_D'],
            #'txtIndexacao':['lr56_random_double', 'triplets','Model_A','Model_B','Model_C','Model_D']
            }
# Version key -> artifact name used in the zip / backup file names.
names = {'lbert':'legal_bert','sbert':'sentence_bert', 'abert':'Model_A', 'bbert':'Model_B','cbert':'Model_C','dbert':'Model_D', 'nadia':'LegalBERTPTbr',
          'lr56':'lr56','triplets':'triplets','lr56f':'ulysses_sbert_finetuned_lr56','tripletsf':'ulysses_sbert_finetuned_lr56_triplets',
          'lr56rd':'lr56_random_double', 'albert':'albertina_ptbr'}
#model = SentenceTransformer("checkpoint/1000")

# tree: user-feedback records; df_corpus: the texts they refer to, with 'txt_ementa' renamed to column_data.
tree = pd.read_csv("/content/data/Ulysses-RFCorpus/base_feedback_proposicoes_limpa.csv", encoding="utf-8")
df_corpus = pd.read_csv("/content/data/Ulysses-RFCorpus/proposicoes_sem_12.csv", encoding="utf-8").rename(columns={'txt_ementa':column_data})


In [None]:
def build_model(version, mode=False):
    """Build the SentenceTransformer for ``version`` (legacy pipeline).

    Running this cell replaces the ``build_model`` defined earlier in the
    notebook.

    Args:
        version: 'sbert', 'lbert' or 'albert' for the hub checkpoints; any
            other value is loaded from ``./models/<version>/`` (only when
            ``mode`` is false).
        mode: When true, assemble the model from a bare transformer plus
            mean pooling; only 'sbert' and 'lbert' are supported there.

    Returns:
        A ``SentenceTransformer`` instance.

    Raises:
        ValueError: If ``mode`` is true and ``version`` has no transformer
            checkpoint (this used to fail on an unbound local variable).
    """
    if mode:
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        hub_ids = {
            'sbert': "neuralmind/bert-base-portuguese-cased",
            'lbert': "ulysses-camara/legal-bert-pt-br",
        }
        if version not in hub_ids:
            raise ValueError(f"build_model(mode=True) supports only {sorted(hub_ids)}, got {version!r}")
        word_embedding_model = models.Transformer(hub_ids[version])
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                    pooling_mode_mean_tokens=True,
                                    pooling_mode_cls_token=False,
                                    pooling_mode_max_tokens=False)

        return SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Load a ready-made checkpoint; only 'sbert' is pinned to the global `device`.
    if version=='sbert':
        model = SentenceTransformer("neuralmind/bert-base-portuguese-cased",device=device)
    elif version=='lbert':
        model = SentenceTransformer("ulysses-camara/legal-bert-pt-br")
    elif version=='albert':
        model = SentenceTransformer("PORTULAN/albertina-ptbr")
    else:
        model = SentenceTransformer(f"./models/{version}/")
    return model

In [None]:
# Build positive / negative text pairs from the first half of the feedback records.
# name -> stripped text, built once. drop_duplicates keeps the first row per name, which matches
# the "first matching row" the per-pair DataFrame filter used to return.
unique_rows = df_corpus.drop_duplicates(subset='name')
text_by_name = {
    name: str(text).strip()
    for name, text in zip(unique_rows['name'].to_numpy(), unique_rows[column_data].to_numpy())
}

positives = []
negatives = []
for i in tqdm(range(len(tree)//2)):
    # SECURITY NOTE(review): eval executes whatever expression the CSV field contains. If the field is
    # always a plain Python literal, ast.literal_eval is the safe replacement — confirm the data format.
    data = eval(tree['user_feedback'][i])
    permsList = [(e['id'], e['class']) for e in data]
    # Keep only the pairs in which at least one document has class 'r'.
    permsList = [e for e in combinations(permsList,2) if (e[0][1]=='r' or e[1][1]=='r')]
    for e in permsList:
        txt1 = text_by_name[e[0][0]]
        txt2 = text_by_name[e[1][0]]
        # Same class (both 'r') -> positive pair; otherwise exactly one is 'r' -> negative pair.
        if e[0][1]==e[1][1]:
            positives.append((txt1, txt2))
        else:
            negatives.append((txt1, txt2))


# Balance the classes: never keep more negatives than positives.
negatives = np.array(negatives[:min(len(positives),len(negatives))])
positives = np.array(positives)

print(f"\npositives {len(positives)}")
print(f"\nnegatives {len(negatives)}")

train_dataset = [InputExample(texts=x, label=1.0) for x in positives]
train_dataset.extend(InputExample(texts=x, label=0.0) for x in negatives)

In [None]:
for version in versions[column_data]:
    # print(f'Training model to {column_data}/{version}_tuned',end='\t')
    # Building dataloader and trianing model
    model = build_model(version, mode)

    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=1)
    train_loss = losses.CosineSimilarityLoss(model)

    #Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs, warmup_steps=100,weight_decay=0.0,
              checkpoint_path=f"/content/{column_data}/checkpoint_{version}", output_path=f"/content/{column_data}/model_{version}",
              save_best_model=True )

    if version=='sbert':
        model.save(f"/content/{column_data}/sentence_bert_tuned_{epochs}")
    elif version=='lbert':
        model.save(f"/content/{column_data}/legal_bert_tuned_{epochs}")
    elif version=='albert':
        model.save(f"/content/{column_data}/{version}_tuned_{epochs}")
    else:
        model.save(f"/content/{column_data}/{version}_tuned_{epochs}")
    print(f"Model saved in {column_data}/{version}_tuned_{epochs}.")
    shutil.rmtree(f"/content/{column_data}/checkpoint_{version}", ignore_errors=True)
    !zip -r /content/{names[version]}_tuned_{epochs}.zip /content/{column_data}/{names[version]}_tuned_{epochs}
    !cp -rnv '/content/{names[version]}_tuned_{epochs}.zip' '/content/drive/MyDrive/Backup2'
    shutil.rmtree(f"/content/{column_data}/{names[version]}_tuned_{epochs}", ignore_errors=True)

In [None]:
# Restrict the legacy training loop to the 'lbert' model (a stray leading backslash made this a syntax error).
versions = {'txtEmenta':['lbert']}

In [None]:
# Manual cleanup / backup for 'sbert': drop the checkpoint folder, zip the tuned model, copy it to Drive.
# NOTE(review): these paths end in '_tuned' with no epoch suffix, while the training loop saves
# '..._tuned_{epochs}' — confirm which artifact this cell is meant to archive.
version = 'sbert'
shutil.rmtree(f"/content/{column_data}/checkpoint_{version}", ignore_errors=True)
!zip -r /content/{names[version]}_tuned.zip /content/{column_data}/{names[version]}_tuned
!cp -rnv '/content/{names[version]}_tuned.zip' '/content/drive/MyDrive/Backup2'
shutil.rmtree(f"/content/{column_data}/{names[version]}_tuned", ignore_errors=True)

In [None]:
# Marker cell: prints 'OK' once execution reaches this point.
print('OK')

# Remaining cells (model archiving, backup and download utilities)

In [None]:
# Version key -> artifact name. This redefinition has no 'albert' entry, unlike the mapping in the
# legacy configuration cell.
names = {'lbert':'legal_bert','sbert':'sentence_bert', 'abert':'Model_A', 'bbert':'Model_B','cbert':'Model_C','dbert':'Model_D', 'nadia':'LegalBERTPTbr',
          'lr56':'lr56','triplets':'triplets','lr56f':'ulysses_sbert_finetuned_lr56','tripletsf':'ulysses_sbert_finetuned_lr56_triplets',
          'lr56rd':'lr56_random_double'}
version='lbert'
# Zip the tuned model stored under /content/models2, copy the archive to Drive, then delete the folder.
!zip -r /content/models2/{names[version]}_tuned.zip /content/models2/{column_data}/{names[version]}_tuned
!cp -rnv '/content/models2/{names[version]}_tuned.zip' '/content/drive/MyDrive/Backup2'
shutil.rmtree(f"/content/models2/{column_data}/{names[version]}_tuned", ignore_errors=True)

In [None]:
# Copy a model archive from Google Drive into /content/models and unpack it.
# The unzip has no -d option, so files are extracted into the current working directory.
!cp -rnv '/content/drive/MyDrive/Ulysses_LaBSE_finetuned/ulysses_sbert_finetuned_lr56_triplets.zip'  '/content/models'
!unzip '/content/models/ulysses_sbert_finetuned_lr56_triplets.zip'

In [None]:
# Archive the tuned legal_bert model folder into legal_bert_tuned.zip in the current working directory.
!zip -r legal_bert_tuned.zip /content/models/txtEmenta+txtIndexacao/legal_bert_tuned/


In [None]:
import gdown
import zipfile

# Fetch the lr56rd archive from Google Drive and unpack it into the current working directory.
url = 'https://drive.google.com/u/1/uc?id=1f4pk6S1mAJyuRUg9wwhCbF5QfiJiHLqg&export=download'
output = 'models/lr56rd.zip'
downloaded = gdown.download(url, output, quiet=False)
# Only unpack when the download produced a file; a denied link leaves nothing to extract.
if downloaded is None or not os.path.exists(output):
    print(f'Download failed; nothing to unzip at {output}')
else:
    with zipfile.ZipFile(output) as archive:
        archive.extractall()