In [None]:
!pip install git+https://github.com/vasudevgupta7/gsoc-wav2vec2@main
!pip install pydub
!pip install datasets

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds
from wav2vec2 import Wav2Vec2Config
import numpy as np
import matplotlib.pyplot as plt
import librosa
from datasets import load_metric
# Wav2Vec2 configuration built with no overrides; its `vocab_size` sizes the
# output layer and the object is also handed to the CTC loss further down.
config = Wav2Vec2Config()

In [None]:
# Load the `train` split of the TFDS `spoken_digit` dataset.
dataset = tfds.load(
    'spoken_digit',
    split='train',
    shuffle_files=True,
)

In [None]:
# Peek at the first three examples to see which fields each record carries.
ds1 = dataset.take(3)
for example in ds1:
  print(list(example.keys()))
  audio, audio_filename, label = example['audio'], example['audio/filename'], example['label']
  print(audio.shape, audio_filename, label)


In [None]:
# Materialise the whole split as a pandas DataFrame so that audio and labels
# can be pulled out as columns.
df = tfds.as_dataframe(dataset)
print(df.shape)

dfx = df['audio']
dfy = df['label']

# NumPy array with one entry per row. The first positional argument of
# `Series.to_numpy` is an optional dtype, so nothing is passed here.
dsx = dfx.to_numpy()
# Labels are converted to strings because the text tokenizer is called on them
# later in the notebook.
dsy = dfy.astype(str)
print(len(dsy))

In [None]:
def resample(audio, orig_sr=8000, target_sr=16000):
  """Resample one waveform with librosa.

  Args:
    audio: NumPy array of samples; it is cast to float64 before resampling.
    orig_sr: Sampling rate of `audio` in Hz. Defaults to 8000.
    target_sr: Sampling rate of the returned waveform in Hz. Defaults to 16000.

  Returns:
    The resampled waveform as returned by `librosa.resample`.
  """
  return librosa.resample(audio.astype(np.float64), orig_sr=orig_sr, target_sr=target_sr)

# Clips must fit the fixed padded length used when batching. The constant is
# assigned here as well because this cell runs before the configuration cell
# that defines it; keep the two values identical.
AUDIO_MAXLEN = 246000

# Resample every clip with the defaults of `resample`.
dsx = [resample(clip) for clip in dsx]

# Pair each waveform with its label and drop any clip that is itself too long
# (the length test applies to the individual clip, not to the number of clips).
ds = [(clip, text) for clip, text in zip(dsx, dsy) if len(clip) < AUDIO_MAXLEN]

In [None]:
from wav2vec2 import Wav2Vec2Processor
# Two instances of the same class: `tokenizer` (is_tokenizer=True) is called on
# label text in `preprocess_text`; `processor` (is_tokenizer=False) is called
# on audio tensors in `preprocess_speech`.
tokenizer = Wav2Vec2Processor(is_tokenizer=True)
processor = Wav2Vec2Processor(is_tokenizer=False)

def preprocess_text(text):
  """Tokenize `text` with the module-level `tokenizer` and return an int32 tensor."""
  # NOTE(review): the labels fed in come from `dfy.astype(str)`; confirm that
  # the tokenizer's vocabulary actually covers those strings.
  return tf.constant(tokenizer(text), dtype=tf.int32)

def preprocess_speech(audio):
  """Cast `audio` to a float32 tensor, transpose it and pass it to `processor`."""
  waveform = tf.constant(audio, dtype=tf.float32)
  waveform = tf.transpose(waveform)
  return processor(waveform)

def inputs_generator():
  """Yield one `(speech, label)` tensor pair per entry of the module-level `ds`."""
  yield from (
      (preprocess_speech(speech), preprocess_text(text))
      for speech, text in ds
  )

# Both components are expected to be rank-1 with unknown length, matching the
# rank-1 padded shapes used when batching. `(None)` is just `None` (unknown
# rank); a one-element shape tuple needs the trailing comma.
output_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.float32),  # speech
    tf.TensorSpec(shape=(None,), dtype=tf.int32),    # label token ids
)

w2v2_ds = tf.data.Dataset.from_generator(inputs_generator, output_signature=output_signature)

In [None]:
# Fixed sizes shared by the input pipeline, the model and the loss.
AUDIO_MAXLEN = 246000  # padded audio length; also the model's input length
LABEL_MAXLEN = 256  # padded label length
BATCH_SIZE = 2  # examples per batch
# Alternative settings, currently disabled:
#AUDIO_MAXLEN = 36600
#LABEL_MAXLEN = 16
#BATCH_SIZE = 1

In [None]:
BUFFER_SIZE = len(ds)
SEED = 42
# Shuffle once into a fixed order. By default `shuffle` draws a new order on
# every pass over the dataset, so the take/skip split that follows would hand
# out different examples each epoch and validation data would leak into
# training.
w2v2_ds = w2v2_ds.shuffle(BUFFER_SIZE, seed=SEED, reshuffle_each_iteration=False)
# Pad every speech/label pair to the fixed lengths and group them into batches.
w2v2_ds = w2v2_ds.padded_batch(BATCH_SIZE, padded_shapes=(AUDIO_MAXLEN, LABEL_MAXLEN), padding_values=(0.0, 0))
#w2v2_ds = w2v2_ds.prefetch(tf.data.AUTOTUNE)

In [None]:
# `w2v2_ds` yields batches of BATCH_SIZE examples, and take/skip count dataset
# elements (i.e. batches). Convert the training budget to a batch count first;
# otherwise any BATCH_SIZE > 1 can leave the validation set empty.
TRAIN_EXAMPLES = 2000  # assumed to be an example count — TODO confirm
TRAIN_BATCHES = TRAIN_EXAMPLES // BATCH_SIZE
train_ds = w2v2_ds.take(TRAIN_BATCHES)
val_ds = w2v2_ds.skip(TRAIN_BATCHES)

In [None]:
# Pretrained wav2vec2 layer loaded from TF Hub; trainable=True marks its
# weights as trainable for the fine-tuning run below.
pretrained_layer = hub.KerasLayer("https://tfhub.dev/vasudevgupta7/wav2vec2/1", trainable=True)

In [None]:
# Functional model: fixed-length waveform -> pretrained layer -> Dense
# projection with `config.vocab_size` units.
inputs = tf.keras.Input(shape=(AUDIO_MAXLEN,))
hidden_states = pretrained_layer(inputs)
vocab_projection = tf.keras.layers.Dense(config.vocab_size)
outputs = vocab_projection(hidden_states)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [None]:
# Run one forward pass on random input of shape (BATCH_SIZE, AUDIO_MAXLEN),
# then print the layer summary.
model(tf.random.uniform(shape=(BATCH_SIZE, AUDIO_MAXLEN)))
model.summary()

In [None]:
from wav2vec2 import CTCLoss

LEARNING_RATE = 5e-5

# Optimizer and CTC loss that are handed to `model.compile`.
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss_fn = CTCLoss(
    config,
    (BATCH_SIZE, AUDIO_MAXLEN),
    division_factor=BATCH_SIZE,
)

In [None]:
# Compile with the CTC loss, fine-tune for three epochs while validating on
# `val_ds`, then display the recorded per-epoch metrics.
model.compile(optimizer, loss=loss_fn)
history = model.fit(train_ds, validation_data=val_ds, epochs=3)
history.history

In [None]:
# 'wer' metric loaded through the `datasets` library; batches are added to it
# in the evaluation loop below.
metric = load_metric('wer')
@tf.function(jit_compile=True)
def eval_fwd(batch):
  """Run `model` on `batch` with training=False and return the argmax over the last axis."""
  return tf.argmax(model(batch, training=False), axis=-1)

from tqdm.auto import tqdm
# Decode the argmax predictions of every validation batch and accumulate them,
# together with the decoded labels, into the metric.
for speech, labels in tqdm(val_ds, total=500):
    predicted_ids = eval_fwd(speech).numpy().tolist()
    label_ids = labels.numpy().tolist()
    predictions = [tokenizer.decode(ids) for ids in predicted_ids]
    references = [tokenizer.decode(ids, group_tokens=False) for ids in label_ids]
    metric.add_batch(references=references, predictions=predictions)
metric.compute()

In [None]:
# finetuned_model = tf.keras.models.load_model(save_dir)

In [None]:
# Disabled experiment (kept for reference, not executed): collect the set of
# characters found in the transcripts and write it out as vocab.json.
# The code is commented out line by line so the cell parses cleanly.
#
# def extract_characters(batch):
#   texts = " ".join(batch["text"])
#   vocab = list(set(texts))
#   return {"vocab": [vocab], "texts": [texts]}
#
# vocabs = ds.map(extract_characters, batched=True, batch_size=-1, keep_in_memory=True, remove_columns=ds.column_names["train"])
#
# vocab_list = list(set(vocabs["train"]["vocab"][0]) | set(vocabs["test"]["vocab"][0]))
# vocab_dict = {v: k for k, v in enumerate(vocab_list)}
# vocab_dict["|"] = vocab_dict[" "]
# del vocab_dict[" "]
#
# vocab_dict["[UNK]"] = len(vocab_dict) # add "unknown" token
# vocab_dict["[PAD]"] = len(vocab_dict) # add a padding token that corresponds to CTC's "blank token"
#
# with open('vocab.json', 'w') as vocab_file:
#     json.dump(vocab_dict, vocab_file)

In [None]:
# Disabled experiment (kept for reference, not executed): build a tokenizer,
# a feature extractor and a combined processor from vocab.json.
# The code is commented out line by line so the cell parses cleanly and does
# not overwrite the `tokenizer` / `processor` objects used above.
#
# # create Wav2Vec2 tokenizer
# tokenizer = Wav2Vec2CTCTokenizer("vocab.json", unk_token="[UNK]",
#                                   pad_token="[PAD]", word_delimiter_token="|")
#
# # create Wav2Vec2 feature extractor
# feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000,
#                                              padding_value=0.0, do_normalize=True, return_attention_mask=False)
# # create a processor pipeline
# processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)

In [None]:
"""
# extract the numerical representation from the dataset
def extract_array_samplingrate(batch):
    batch["speech"] = batch['audio']['array'].tolist()
    batch["sampling_rate"] = batch['audio']['sampling_rate']
    batch["target_text"] = batch["text"]
    return batch

dataset = ds.map(extract_array_samplingrate, remove_columns=ds.column_names["train"])

# process the dataset with processor pipeline that created above
def process_dataset(batch):  
    batch["input_values"] = processor(batch["speech"], 
                            sampling_rate=batch["sampling_rate"][0]).input_values

    with processor.as_target_processor():
        batch["labels"] = processor(batch["target_text"]).input_ids
    return batch

data_processed = dataset.map(process_dataset, 
                    remove_columns=dataset.column_names["train"], batch_size=8, 
                    batched=True)

train_dataset = data_processed['train']
test_dataset = data_processed['test']