<a href="https://colab.research.google.com/github/josedossantos10/HIRS/blob/main/Hibrido_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning e avaliação dos modelos

Ajuste os parâmetros abaixo e depois execute todo o código para avaliar o modelo híbrido.

In [None]:
# Column used for fine-tuning and evaluating the models
# column_data = 'txtEmenta'
# column_data = 'txtIndexacao'
# column_data = 'hibrido'
column_data = 'txtInteiroTeorLimpo'

# Sentence model to fine-tune, e.g. 'lbert', 'labse', 'lbt', 'raq'; see build_model() for the name of every available model
version = 'bertb'

# Number of documents to retrieve when evaluating the models
top_k = 20

# Number of epochs used for fine-tuning
epochs = 1

# Maximum number of tokens per document
max_tokens = 512


# Preparação

In [None]:
# Mount the user's Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer, SentencesDataset, util, SentencesDataset, InputExample, losses, models
from torch.utils.data import DataLoader
from google.colab import drive
import torch
import requests
from itertools import combinations
import numpy as np
import pandas as pd
from tqdm import tqdm
import shutil
import os
import re
from torch import nn


Collecting sentence_transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/171.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m163.8/171.5 kB[0m [31m5.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (

In [None]:
# Create the working directories (relative to the current directory) if they are missing:
# build_model() loads unknown model names from ./models/, and get_file() defaults to the 'data' folder.
os.makedirs('models',exist_ok=True)
os.makedirs('data',exist_ok=True)
# drive.mount('/content/drive')

In [None]:
def get_file(url, folder='data'):
  """Download a shared file into /content/<folder>/ and unpack it when it is a zip.

  The query string of ``url`` is replaced by ``?download=1`` and the file name
  is taken from the ``content-disposition`` response header.

  Args:
    url: Share link of the file to download.
    folder: Sub-folder of /content/ that receives the file (must already exist).

  Raises:
    requests.HTTPError: If the server answers with an error status.
    KeyError: If the response has no ``content-disposition`` header.
    IndexError: If the header carries no ``filename=`` entry.
  """
  # Local import so this cell does not depend on the notebook's import cell for zipfile.
  import zipfile

  r = requests.get(url.split('?').pop(0)+'?download=1')
  # Stop on HTTP errors instead of silently saving an error page as data.
  r.raise_for_status()
  d = r.headers['content-disposition']
  fname = re.findall("filename=(.+)", d)[0].replace('"','')
  # The name comes from a remote header: keep only its last path component so
  # it cannot point outside the target folder.
  fname = os.path.basename(fname)
  target = f'/content/{folder}/{fname}'
  with open(target,'wb') as f:
    f.write(r.content)
  if '.zip' in fname:
    # Extract with the standard library rather than interpolating a
    # server-provided file name into a shell command.
    with zipfile.ZipFile(target) as archive:
      archive.extractall(f'/content/{folder}/')
    print('Unziped and',end=' ')
  print('Saved '+fname)

In [None]:
def build_model(version, max_tokens = 512):
    """Return a SentenceTransformer for the given model alias.

    Known aliases ('bertb', 'lbert', 'labse', 'lbt', 'raq') are assembled from
    a transformer module plus a pooling layer, with the transformer's
    ``max_seq_length`` set to ``max_tokens``. Any other value is treated as
    the name of a model stored under ``./models/<version>/`` and is loaded
    as-is; ``max_tokens`` is not applied in that case.

    Args:
        version: Model alias, or the folder name of a saved model.
        max_tokens: Maximum sequence length for the known aliases.

    Returns:
        The assembled (or loaded) SentenceTransformer.
    """
    # Aliases whose checkpoint is wrapped directly in models.Transformer.
    transformer_checkpoints = (
        ('bertb', "neuralmind/bert-large-portuguese-cased"),
        ('labse', "sentence-transformers/LabSE"),
        ('lbt', "rufimelo/Legal-BERTimbau-large"),
        ('raq', "raquelsilveira/legalbertpt_fp"),
    )
    checkpoint = next((name for alias, name in transformer_checkpoints if version == alias), None)

    if version == 'lbert':
        # This checkpoint is opened as a SentenceTransformer and only its
        # first module is reused.
        encoder = SentenceTransformer("ulysses-camara/legal-bert-pt-br")[0]
    elif checkpoint is not None:
        encoder = models.Transformer(checkpoint)
    else:
        return SentenceTransformer(f'./models/{version}/')

    # 'labse' pools with the CLS token; every other alias uses mean pooling.
    cls_pooling = version == 'labse'
    pooling = models.Pooling(encoder.get_word_embedding_dimension(),
                             pooling_mode_mean_tokens=not cls_pooling,
                             pooling_mode_cls_token=cls_pooling,
                             pooling_mode_max_tokens=False)

    encoder.max_seq_length = max_tokens
    return SentenceTransformer(modules=[encoder, pooling])

def finetuning(version, train_dataset, max_tokens, epochs=1, batch_size=2):
    """Fine-tune the model selected by ``version`` with a contrastive loss.

    Args:
        version: Model alias passed to build_model().
        train_dataset: Indexable collection of training examples fed to a DataLoader.
        max_tokens: Maximum sequence length passed to build_model().
        epochs: Number of training epochs.
        batch_size: Number of examples per training batch.

    Returns:
        The fine-tuned model.
    """
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    # build_model() accepts (version, max_tokens) only; a third positional
    # argument raises TypeError before any training starts.
    model = build_model(version, max_tokens)
    train_loss = losses.ContrastiveLoss(model)
    # Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs, show_progress_bar=True)
    return model

def get_name(df, doc, in_field="content", out_field = "name"):
    """Look up ``out_field`` for the first row whose ``in_field`` equals ``doc``.

    Returns the value converted to ``str`` with surrounding whitespace removed.
    Raises IndexError when no row matches.
    """
    matching_values = df.loc[df[in_field] == doc, out_field]
    first_value = matching_values.to_numpy()[0]
    return str(first_value).strip()
def recall(vector):
    """Fraction of (target, docs) pairs whose target appears in docs.

    Args:
        vector: Sequence of two-item entries ``(target, docs)``.

    Returns:
        The hit ratio as a float; 0.0 for an empty ``vector`` (instead of
        raising ZeroDivisionError).
    """
    if len(vector) == 0:
        return 0.0
    hits = sum(1 for (target, docs) in vector if target in docs)
    return hits / len(vector)
def evaluate(model, top_k,corpus, queries):
    """Compute recall@top_k of ``model`` for ``queries`` against ``corpus``.

    Each query is embedded and compared to every corpus embedding by cosine
    similarity; the names of the ``top_k`` most similar documents are then
    checked against the name expected for the query.

    Relies on the notebook globals ``df``, ``df_assunto`` and ``column_data``
    to translate texts into names.

    Args:
        model: Object exposing ``encode(..., convert_to_tensor=True)``.
        top_k: Number of documents retrieved per query.
        corpus: Array of document texts; it is indexed with an integer array.
        queries: Iterable of query texts.

    Returns:
        The recall over the queries that could be scored, or 0.0 when none
        could.
    """
    corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
    # torch.topk raises when k exceeds the number of candidates; clamp it so a
    # small corpus does not make every single query fail.
    k = min(top_k, len(corpus))
    res_vector = list()
    for query in queries:
        try:
            query_embedding = model.encode(query, convert_to_tensor=True)
            cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
            top_results = torch.topk(cos_scores, k=k)

            # .cpu() returns the tensor unchanged when it already lives on the
            # CPU, so a single code path serves both devices.
            top_indices = top_results[1].cpu().numpy()
            top_resuts_txt = [get_name(df, txt, in_field=column_data, out_field="txtNome") for txt in corpus[top_indices]]

            res_vector.append([get_name(df_assunto, query, "TxtAssunto", "NÚMERO-PROPOSIÇÃOSILEG"), top_resuts_txt])
        except Exception as e:
            # NOTE(review): a failed query is only reported and then left out
            # of the recall denominator — confirm this is the intended metric.
            print(f"ERRO ao realizar encoding:{e}")
    if not res_vector:
        # Every query failed: report zero recall instead of dividing by zero.
        return 0.0
    return recall(res_vector)

# Obtenção dos dados

In [None]:
# Per the recorded cell output, these three downloads are, in order:
#   1. base_20230428_douglas-bill_corpus-parts.zip (extracted into corpus-part0..4.csv)
#   2. pares-arvores-proposicao-filter.csv
#   3. dados-conle-anonimizado-job_request-parts.zip (extracted into job_request-part0..4.csv)
get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/EQcLlSaOKPtCtzY3juKPkpYB6Rl9Jnp-xNimWaIcJNe_bg?e=jc0ZWR')
get_file('https://ufrpebr-my.sharepoint.com/:x:/g/personal/joseantonio_santos_ufrpe_br/EfciCkFDXkxAoKnvwW-O0FYB1YMf-iXIwD1nVdf0Ve_t8g?e=X1QPUi')
get_file('https://ufrpebr-my.sharepoint.com/:u:/g/personal/joseantonio_santos_ufrpe_br/ESigtraTg-xLjUAwHuEzF0sB-CkOhU75tNSZcomdbuvpxg?e=AcMaXi')

Archive:  /content/data/base_20230428_douglas-bill_corpus-parts.zip
  inflating: /content/data/base_20230428_douglas-bill_corpus-part0.csv  
  inflating: /content/data/base_20230428_douglas-bill_corpus-part1.csv  
  inflating: /content/data/base_20230428_douglas-bill_corpus-part2.csv  
  inflating: /content/data/base_20230428_douglas-bill_corpus-part3.csv  
  inflating: /content/data/base_20230428_douglas-bill_corpus-part4.csv  
Unziped and Saved base_20230428_douglas-bill_corpus-parts.zip
Saved pares-arvores-proposicao-filter.csv
Archive:  /content/data/dados-conle-anonimizado-job_request-parts.zip
  inflating: /content/data/dados-conle-anonimizado-job_request-part0.csv  
  inflating: /content/data/dados-conle-anonimizado-job_request-part1.csv  
  inflating: /content/data/dados-conle-anonimizado-job_request-part2.csv  
  inflating: /content/data/dados-conle-anonimizado-job_request-part3.csv  
  inflating: /content/data/dados-conle-anonimizado-job_request-part4.csv  
Unziped and Saved 

# Treinamento e Avaliação

In [None]:
# Number of data partitions (part0 .. part{runs-1}) used for evaluation.
runs = 5
zero_shot = []
tuned = []

# Training pairs: two texts per row plus a label, with the PDF-text columns
# renamed so that they follow the '<column_data>_1' / '<column_data>_2' pattern.
read_df = pd.read_csv(f'/content/data/pares-arvores-proposicao-filter.csv').rename(columns={'imgArquivoTeorPDF_clean_1':'txtInteiroTeorLimpo_1','imgArquivoTeorPDF_clean_2':'txtInteiroTeorLimpo_2'}).dropna().reset_index(drop=True)
read_df['label'] = read_df.label.apply(int)
train_dataset = read_df.apply(lambda x: InputExample(texts=(x[f'{column_data}_1'], x[f'{column_data}_2']), label=x['label']), axis=1)

model_tuned = finetuning(version, train_dataset, max_tokens,epochs=epochs, batch_size=2)
# The zero-shot baseline is the same for every partition, so build it once
# instead of reloading the checkpoint on each iteration.
model_zero_shot = build_model(version, max_tokens)
for e in tqdm(range(0, runs)):
    element = f'{column_data}_{version}_{e}'  # not used in this cell
    # evaluate() reads `df` and `df_assunto` as globals, so these names must be kept.
    # The partition index is the loop variable `e`.
    df = pd.read_csv(f"/content/data/base_20230428_douglas-bill_corpus-part{e}.csv")
    df_assunto = pd.read_csv(f"/content/data/dados-conle-anonimizado-job_request-part{e}.csv", encoding="utf-8")
    queries = df_assunto["TxtAssunto"].to_numpy()
    corpus = df[column_data].to_numpy()
    zero_shot.append(evaluate(model_zero_shot,top_k,corpus, queries))
    tuned.append(evaluate(model_tuned,top_k,corpus, queries))
print(f'Zero shot {zero_shot} --> Média: {round(np.average(zero_shot)*100,2)}. Desvio Padrão: {np.std(zero_shot)}')
# This line reports the fine-tuned model, so it must not carry the zero-shot label.
print(f'Fine-tuned {tuned} --> Média: {round(np.average(tuned)*100,2)}. Desvio Padrão: {np.std(tuned)}')
