In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM so the project folder becomes reachable.
drive.mount("/content/drive/")

import os

# Work from the project folder; later cells use paths relative to it (e.g. "./wav/train").
os.chdir("/content/drive/My Drive/Colab Notebooks/finetune-whisper")

Mounted at /content/drive/


In [None]:
!pip install datasets>=2.6.1
!pip install evaluate>=0.30
!pip install jiwer
!pip install striprtf
!pip install tiktoken
!pip install git+https://github.com/openai/whisper.git
!pip install Unidecode

Collecting jiwer
  Downloading jiwer-3.0.2-py3-none-any.whl (21 kB)
Collecting rapidfuzz==2.13.7 (from jiwer)
  Downloading rapidfuzz-2.13.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, jiwer
Successfully installed jiwer-3.0.2 rapidfuzz-2.13.7
Collecting striprtf
  Downloading striprtf-0.0.25-py3-none-any.whl (7.1 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.25
Collecting tiktoken
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m52.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.4.0
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.

In [None]:
import locale
import os
import re
from striprtf.striprtf import rtf_to_text
from datasets import Dataset

# Force UTF-8 as the reported preferred encoding for the rest of the session.
# The replacement keeps the optional `do_setlocale` argument of the real
# locale.getpreferredencoding so callers that pass it (e.g. `getpreferredencoding(False)`)
# do not fail with a TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

def create_dataset(dir, task):
    """Build a dataset pairing audio paths with their transcripts.

    Every file in ``dir`` whose name ends with ``.RTF`` is converted to plain
    text with ``rtf_to_text``. Lines matching ``<label>: <utterance>`` are kept
    (only the utterance part) and concatenated, each followed by one space.
    Files that yield no matching line are skipped.

    Args:
        dir: Directory containing the ``.RTF`` transcript files.
        task: Split name used to build the audio path
            ``./wav/<task>/<stem>.WAV``.

    Returns:
        A ``Dataset`` with two columns: ``"audio"`` (path strings) and
        ``"text"`` (concatenated utterances).
    """
    padrao_dialogo = re.compile(r'^(\w+):\s(.*)$')

    metadata = {
        "audio": [],
        "text": []
    }

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for arquivo in sorted(os.listdir(dir)):

        if not arquivo.endswith(".RTF"):
            continue

        with open(os.path.join(dir, arquivo)) as infile:
            texto = rtf_to_text(infile.read())

        falas = (padrao_dialogo.match(linha) for linha in texto.split('\n'))
        conversa = "".join(f"{fala.group(2)} " for fala in falas if fala)

        if conversa == "":
            continue

        # Swap only the extension. A plain str.replace("RTF", "WAV") would also
        # rewrite any "RTF" occurring inside the file stem.
        nome_base, _ = os.path.splitext(arquivo)

        # NOTE(review): the audio path is derived from `task`, not from `dir`;
        # the two agree for the calls in this notebook - confirm before reusing
        # with a different directory layout.
        metadata["audio"].append("./wav/" + task + "/" + nome_base + ".WAV")
        metadata["text"].append(conversa)

    return Dataset.from_dict(metadata)




In [None]:
# Build one transcript/audio-path dataset per split, in train-then-test order.
train_dataset, test_dataset = (
    create_dataset(pasta, split)
    for pasta, split in (("./wav/train", "train"), ("./wav/test", "test"))
)

In [None]:
from datasets import DatasetDict

# Group both splits under a single DatasetDict keyed by split name.
sis_dataset = DatasetDict(
    {
        "train": create_dataset("./wav/train", "train"),
        "test": create_dataset("./wav/test", "test"),
    }
)

print(sis_dataset)

In [None]:
from datasets import Audio

# Re-type the "audio" column as an Audio feature at 16 kHz for every split.
sis_dataset_sr16 = sis_dataset.cast_column(
    column="audio",
    feature=Audio(sampling_rate=16000),
)

# Ground-truth transcripts of the test split, reused by every WER cell below.
references = sis_dataset_sr16["test"]["text"]

In [None]:
import whisper

# Baseline: OpenAI's reference Whisper "large" checkpoint.
model = whisper.load_model("large")

# Transcribe each test recording from its file path, forcing Portuguese,
# and keep only the recognized text.
predictions = [
  model.transcribe(audio['path'], language='pt')['text']
  for audio in sis_dataset_sr16["test"]['audio']
]

In [None]:
import re
import unicodedata

def remover_caracteres_especiais(texto):
    """Drop every character that is neither a word character nor whitespace.

    Word characters follow Python's Unicode-aware ``\\w`` (letters, digits and
    underscore), so accented letters and ``_`` are kept.

    Args:
        texto: Input string.

    Returns:
        ``texto`` with punctuation and other symbols removed.
    """
    # Raw string: "\w" and "\s" are invalid escapes in a plain literal and
    # trigger a Deprecation/SyntaxWarning on current Python versions.
    return re.sub(r'[^\w\s]', '', texto)

def remover_acentos(texto):
    """Strip diacritics by NFKD-decomposing and discarding non-ASCII code points.

    Characters with no ASCII decomposition are dropped entirely.
    """
    decomposto = unicodedata.normalize('NFKD', texto)
    somente_ascii = decomposto.encode('ASCII', 'ignore')
    return somente_ascii.decode('utf-8')

def converter_para_minusculas(texto):
    """Return ``texto`` converted to lowercase."""
    return texto.lower()

def remover_espacos_extras(texto):
    """Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is reduced to one space, not removed.

    Args:
        texto: Input string.

    Returns:
        ``texto`` with each whitespace run replaced by ``' '``.
    """
    # Raw string: "\s" is an invalid escape in a plain literal and triggers a
    # Deprecation/SyntaxWarning on current Python versions.
    return re.sub(r'\s+', ' ', texto)

def remover_digitos(texto):
    """Delete every run of digits from ``texto``; surrounding spaces are left as they are."""
    sem_digitos = re.sub(r'\d+', '', texto)
    return sem_digitos

def remover_espacos_inicio_fim(texto):
    """Return ``texto`` without leading or trailing whitespace."""
    return texto.strip()

def normalizar_transcricoes(transcricoes):
    """Normalize transcripts so that WER compares words, not formatting.

    Each transcript goes through, in order: special-character removal, accent
    removal, lowercasing, digit removal, whitespace collapsing and trimming.

    Args:
        transcricoes: Iterable of transcript strings.

    Returns:
        A list with one normalized string per input transcript, in the same
        order.
    """
    # Digits are removed BEFORE whitespace is collapsed: deleting a standalone
    # number leaves two adjacent spaces, which the collapse step must still see.
    etapas = (
        remover_caracteres_especiais,
        remover_acentos,
        converter_para_minusculas,
        remover_digitos,
        remover_espacos_extras,
        remover_espacos_inicio_fim,
    )

    transcricoes_normalizadas = []
    for transcricao in transcricoes:
        texto_normalizado = transcricao
        for etapa in etapas:
            texto_normalizado = etapa(texto_normalizado)
        transcricoes_normalizadas.append(texto_normalizado)
    return transcricoes_normalizadas


In [None]:
from evaluate import load
# Word error rate of the current `predictions` against the test references,
# both passed through the same text normalization first.
wer = load("wer")
wer_score = wer.compute(
    references=normalizar_transcricoes(references),
    predictions=normalizar_transcricoes(predictions),
)
print(f"WER: {wer_score * 100:.2f} %")

WER: 44.05 %


In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━[0m [32m5.8/7.2 MB[0m [31m176.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.2/7.2 MB[0m [31m178.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux

In [None]:

from transformers import pipeline

# Portuguese fine-tuned Whisper checkpoint, driven through the Hugging Face
# speech-recognition pipeline.
# NOTE(review): Whisper consumes fixed 30-second windows; without
# `chunk_length_s` the pipeline is expected to transcribe only the first window
# of a longer recording, which would inflate WER on long dialogues - confirm
# against the recordings' durations.
transcriber = pipeline(
  "automatic-speech-recognition",
  model="jonatasgrosman/whisper-large-pt-cv11",
  chunk_length_s=30
)

# Pin decoding to Portuguese transcription instead of relying on language
# auto-detection.
transcriber.model.config.forced_decoder_ids = (
  transcriber.tokenizer.get_decoder_prompt_ids(
    language="pt",
    task="transcribe"
  )
)

# Example usage:
#   transcription = transcriber("path/to/my_audio.wav")


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/832 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

In [None]:
import whisper

# Transcribe each test recording with the Hugging Face pipeline and keep only
# the recognized text; this rebinds `predictions` from the previous model's run.
predictions = [
  transcriber(audio['path'])['text']
  for audio in sis_dataset_sr16["test"]['audio']
]



In [None]:
from evaluate import load
# Same scoring as for the previous model: normalize both sides, then compute WER.
wer = load("wer")
hipoteses_e_referencias = {
    "predictions": normalizar_transcricoes(predictions),
    "references": normalizar_transcricoes(references),
}
wer_score = wer.compute(**hipoteses_e_referencias)
print(f"WER: {wer_score * 100:.2f} %")

WER: 88.99 %


In [None]:
import torch
import librosa
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Checkpoint evaluated in this section.
MODEL_ID = "jonatasgrosman/wav2vec2-xls-r-1b-portuguese"
# LANG_ID and SAMPLES are not referenced by the code visible in this notebook.
LANG_ID = "pt"
SAMPLES = 10

# Processor and CTC model come from the same checkpoint.
# Note: `model` rebinds the name that held the Whisper model in an earlier cell.
processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# Test split with the 16 kHz audio column.
test_dataset = sis_dataset_sr16['test']

# Dataset preprocessing: the model needs raw waveforms, so each audio file is
# loaded from disk into an array.
def speech_file_to_array_fn(batch):
    """Attach the 16 kHz waveform to an example and upper-case its transcript.

    Reads the file at ``batch['audio']['path']`` with librosa at ``sr=16_000``,
    stores the samples under ``"speech"``, replaces ``"text"`` with its
    upper-cased form and returns the same ``batch`` object.
    """
    waveform, _ = librosa.load(batch['audio']["path"], sr=16_000)
    batch["speech"] = waveform
    batch["text"] = batch["text"].upper()
    return batch

# Add the "speech" waveform column to every test example.
test_dataset = test_dataset.map(speech_file_to_array_fn)

# Encode all waveforms as one padded batch of PyTorch tensors.
inputs = processor(
    test_dataset["speech"],
    sampling_rate=16_000,
    padding=True,
    return_tensors="pt",
)

# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# Pick the highest-scoring token per frame and decode the id sequences to text.
predicted_ids = logits.argmax(dim=-1)
predicted_sentences = processor.batch_decode(predicted_ids)

predictions = predicted_sentences

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
from evaluate import load
# Score the wav2vec2 transcripts the same way as the Whisper ones.
wer = load("wer")
predicoes_normalizadas = normalizar_transcricoes(predictions)
referencias_normalizadas = normalizar_transcricoes(references)
wer_score = wer.compute(
    predictions=predicoes_normalizadas,
    references=referencias_normalizadas,
)
print(f"WER: {wer_score * 100:.2f} %")

WER: 60.35 %
