# Fine-Tuning Wav2Vec2-Large-XLSR-53 on MGB-3
## Part I

In [1]:
import json
import pickle
import random
import re
import warnings
from functools import cache

import IPython.display as ipd
import numpy as np
import pandas as pd
import torchaudio
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

### Constants

In [2]:
# Language identifiers; both are interpolated into the output directory names below.
language_code = 'ar'
language_name = 'arabic'
# Identifier of the pretrained checkpoint (not referenced in the cells visible here).
base_model = "facebook/wav2vec2-large-xlsr-53"

# Directories holding the `text_noverlap` / `text_noverlap.bw` transcript files read by load_data.
# NOTE(review): train_dir points at the corpus "test" folder and test_dir at "adapt" —
# confirm this mapping is intentional and not a swap.
train_dir = "./mgb3/test/Omar/"
dev_dir = "./mgb3/dev/Omar/"
test_dir = "./mgb3/adapt/Omar/"
# Directory containing the `<filename>.wav` recordings opened by speech_to_array_fn.
wav_dir = "./mgb3/wav/"

# Destination for vocab.json and the CSV exports.
data_dir = "./data/"

# Column names for the DataFrames built from the rows returned by load_data.
columns = ["filename", "start_time", "end_time", "sentence", "phoneme"]

# NOTE(review): output_models_dir is not used in the cells visible here; only
# new_output_models_dir is (as the processor.save_pretrained target).
output_models_dir = f"./workspace/output_models/{language_code}/wav2vec2-large-xlsr-{language_name}-demo"
new_output_models_dir = f"./workspace/output_models/{language_code}/wav2vec2-large-xlsr-{language_name}"

## Prepare Data, Tokenizer, Feature Extractor
### Prepare Data

In [3]:
def split(strng, sep, pos):
    """Cut `strng` in two at the `pos`-th occurrence of `sep`.

    The string is tokenised on `sep`; the first `pos` tokens are re-joined
    into the head and the remaining tokens into the tail.

    Returns:
        A two-element list `[head, tail]`. Either element is an empty string
        when there are no tokens on that side.
    """
    tokens = strng.split(sep)
    head = sep.join(tokens[:pos])
    tail = sep.join(tokens[pos:])
    return [head, tail]

def parse_text_lines(lines):
    """Turn raw transcript lines into `[name, start, end, speech]` rows.

    Each line is expected to look like `<id> <speech>`, where `<id>` is
    `<name>_<start>_<end>` and `<name>` itself spans the first four
    underscore-separated tokens.

    Args:
        lines: iterable of raw text lines (trailing newlines are kept).

    Returns:
        A list with one `[name, start_time, end_time, speech]` list per input
        line; all four values are strings.

    Raises:
        IndexError: if a line has no space-separated speech part or its id
            lacks the two time fields.
    """
    parsed = []
    for raw_line in lines:
        # Separate the utterance id from the spoken text at the first space.
        fields = raw_line.split(" ", 1)
        # The id's first four "_" tokens form the file name, the rest the times.
        name, time_span = split(fields[0], "_", 4)
        times = time_span.split('_')
        parsed.append([name, times[0], times[1], fields[1]])
    return parsed

def load_data(data_dir):
    """Read one split's transcripts and attach the matching phoneme strings.

    Reads `text_noverlap` (parsed with parse_text_lines) and
    `text_noverlap.bw` from `data_dir`; `data_dir` is concatenated directly
    with the file names, so it must end with a path separator. Undecodable
    bytes are dropped because both files are opened with errors='ignore'.

    Returns:
        A list of `[name, start_time, end_time, speech, phoneme]` lists, the
        two files being paired purely by line position.

    Raises:
        AssertionError: if the two files do not have the same number of lines.
    """
    # Speech file name, times and speech.
    with open(f"{data_dir}text_noverlap", encoding='utf-8', errors='ignore') as text_file:
        rows = parse_text_lines(text_file.readlines())

    # Phonemes: keep only what follows the id on each line.
    with open(f"{data_dir}text_noverlap.bw", encoding='utf-8', errors='ignore') as bw_file:
        phonemes = [entry.split(" ", 1)[1] for entry in bw_file.readlines()]

    assert len(rows) == len(phonemes)
    for row, phoneme in zip(rows, phonemes):
        row.append(phoneme)
    return rows

In [4]:
# Build the training frame from the transcript rows under train_dir.
# All columns, including start_time/end_time, hold strings at this point.
lines = load_data(train_dir)
train = pd.DataFrame(lines, columns=columns)
train.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_09_first_12min,4.754,12.352,سلامات عليكم، حأكلمكو النهارده عن الـ (uncerta...,slAmAt Elykm H>klmkw AlnhArdh En Al @@LAT(unce...
1,comedy_09_first_12min,12.352,19.989,الأول في معضلة لغوية كده عايز أخلص منها الأول،...,Al>wl fy mEDlp lgwyp kdh EAyz >xlS mnhA Al>wl ...
2,comedy_09_first_12min,19.989,28.455,لو حد بص على الترجمة بتاعتها في القاموس حيلاقي...,lw Hd bS ElY Altrjmp btAEthA fy AlqAmws HylAqy...
3,comedy_09_first_12min,28.455,37.159,وأول ملحوظة عايزين نلاحظها على الحكاية دي، إن ...,w>wl mlHwZp EAyzyn nlAHZhA ElY AlHkAyp dy <n m...
4,comedy_09_first_12min,37.159,46.42,الناقص ممكن يبقى واحد ممكن يبقى عشرة ممكن يبقى...,AlnAqS mmkn ybqY wAHd mmkn ybqY E$rp mmkn ybqY...


In [5]:
# Build the development frame from the transcript rows under dev_dir
# (the `lines` name is reused from the previous cell).
lines = load_data(dev_dir)
dev = pd.DataFrame(lines, columns=columns)
dev.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_75_first_12min,0.0,8.19,أهلاً وسهلاً، أهلاً وسهلاً ومرحباً بيكم وحلقة ...,>hlA wshlA >hlA wshlA wmrHbA bykm wHlqp jdydp ...
1,comedy_75_first_12min,8.19,16.7,حلقة النهارده بنتكلم فيها عن التحرش\n,Hlqp AlnhArdh bntklm fyhA En AltHr$ \n
2,comedy_75_first_12min,16.7,24.506,يا سلام ع الفرحة يا سلام، طبعاً هو التحرش ده أ...,yA slAm E AlfrHp yA slAm TbEA hw AltHr$ dh >ky...
3,comedy_75_first_12min,24.506,33.824,كان موجود من زمان بس يمكن في الفترة الأخيرة اب...,kAn mwjwd mn zmAn bs ymkn fy Alftrp Al>xyrp Ab...
4,comedy_75_first_12min,33.824,41.227,طب هل المشكلة في اللبس هل المشكلة في الأخلاق ه...,Tb hl Alm$klp fy Allbs hl Alm$klp fy Al>xlAq h...


In [6]:
# Build the test frame from the transcript rows under test_dir
# (the `lines` name is reused from the previous cell).
lines = load_data(test_dir)
test = pd.DataFrame(lines, columns=columns)
test.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_72_first_12min,23.652,31.356,مساء الخير أهلاً ومرحباً بيكم في حلقة جديدة من...,msA' Alxyr >hlA wmrHbA bykm fy Hlqp jdydp mn b...
1,comedy_72_first_12min,31.356,36.621,عشان يظهر موهبته للناس كلها، نفسه بس إن هو ياخ...,E$An yZhr mwhbth llnAs klhA nfsh bs <n hw yAxd...
2,comedy_72_first_12min,36.621,46.166,يمكن إحنا زمان الفرصة أتاحت لنا أتاحت لنا الفر...,ymkn <HnA zmAn AlfrSp >tAHt lnA >tAHt lnA Alfr...
3,comedy_72_first_12min,46.166,51.581,وجه الزمن وجه الوقت وجه المكان اللي نقدم ناس غ...,wjh Alzmn wjh Alwqt wjh AlmkAn Ally nqdm nAs g...
4,comedy_72_first_12min,51.581,59.684,الحمد لله إحنا بعد ما إتشهرنا جداً والناس عرفت...,AlHmd llh <HnA bEd mA <t$hrnA jdA wAlnAs Erftn...


### Create Wav2Vec2CTCTokenizer

In [7]:
chars_to_ignore_regex = '[\,\؟\.\!\-\;\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�\(\)\#]'

def remove_special_characters(sentence):
    """Normalise one transcript string for vocabulary building and labelling.

    Steps, in order: lower-case and append a single trailing space, delete
    the characters in chars_to_ignore_regex, delete newlines, delete ASCII
    lower-case letters, fold the alef variants onto the bare alef, and
    finally delete the Arabic diacritics and the tatweel.

    Whitespace is not collapsed, so deleted tokens can leave consecutive
    spaces behind; digits and characters outside the ignore set are kept.

    Returns:
        The cleaned string (always ending with the appended space unless the
        input was otherwise emptied around it).
    """
    cleaned = sentence.lower() + " "
    # Plain substitutions, applied in sequence: (pattern, replacement).
    for pattern, replacement in (
        (chars_to_ignore_regex, ''),
        ('\n', ''),
        ('[a-z]', ''),
        ("[إأٱآا]", "ا"),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    # Verbose pattern: whitespace and the trailing "#" comments are ignored.
    diacritics = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    return diacritics.sub('', cleaned)

In [8]:
# Normalise the transcripts of all three splits in place; the `phoneme`
# column is left untouched.
train['sentence'] = train['sentence'].apply(remove_special_characters)
dev['sentence']   = dev['sentence'].apply(remove_special_characters)
test['sentence']  = test['sentence'].apply(remove_special_characters)

In [9]:
def join_chars_df_col(df, col):
    """Collect the distinct characters used in one text column.

    All values of `df[col]` are joined with single spaces into one string,
    from which the set of characters is taken.

    Returns:
        A dict with two single-element lists: `"vocab"` wraps the list of
        distinct characters (in arbitrary order) and `"all_text"` wraps the
        joined string.
    """
    joined_text = " ".join(df[col])
    return dict(vocab=[list(set(joined_text))], all_text=[joined_text])

In [10]:
# Character inventory of each split's cleaned transcripts.
vocab_train = join_chars_df_col(train, 'sentence')
vocab_dev   = join_chars_df_col(dev, 'sentence')
vocab_test  = join_chars_df_col(test, 'sentence')

In [11]:
# Sorted union of the train and test character sets.
# NOTE(review): vocab_dev is computed above but not included here, so a character
# that occurs only in dev would have no vocabulary entry — confirm this is intended.
vocab_list = sorted(list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0])))

In [12]:
# Assign each character an integer id equal to its position in the sorted vocab_list.
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict

{' ': 0,
 '/': 1,
 '2': 2,
 'ء': 3,
 'ؤ': 4,
 'ئ': 5,
 'ا': 6,
 'ب': 7,
 'ة': 8,
 'ت': 9,
 'ث': 10,
 'ج': 11,
 'ح': 12,
 'خ': 13,
 'د': 14,
 'ذ': 15,
 'ر': 16,
 'ز': 17,
 'س': 18,
 'ش': 19,
 'ص': 20,
 'ض': 21,
 'ط': 22,
 'ظ': 23,
 'ع': 24,
 'غ': 25,
 'ف': 26,
 'ق': 27,
 'ك': 28,
 'ل': 29,
 'م': 30,
 'ن': 31,
 'ه': 32,
 'و': 33,
 'ى': 34,
 'ي': 35}

In [13]:
# Re-key the space character's id under "|", the word_delimiter_token the
# tokenizer is created with, and drop the original " " entry.
vocab_dict["|"] = vocab_dict[" "]
del vocab_dict[" "]

In [14]:
# Append the two special tokens with the next free ids; they are later passed
# to the tokenizer as unk_token and pad_token.
vocab_dict["[UNK]"] = len(vocab_dict)
vocab_dict["[PAD]"] = len(vocab_dict)
len(vocab_dict)

38

In [15]:
# Persist the vocabulary so the tokenizer can be built from a file.
# data_dir must already exist; open() does not create it.
with open(data_dir+"vocab.json", 'w') as vocab_file:
    json.dump(vocab_dict, vocab_file)

In [16]:
# Build the CTC tokenizer from the vocab.json written above, naming the special
# tokens with the same strings that were added to vocab_dict.
tokenizer = Wav2Vec2CTCTokenizer(data_dir+"vocab.json", unk_token="[UNK]", pad_token="[PAD]", word_delimiter_token="|")

### Create Wav2Vec2FeatureExtractor

In [17]:
# Feature extractor configured for 16 kHz input, with normalisation enabled and
# attention masks returned. The 16000 here must match the sampling_rate passed
# to the processor in prepare_dataset.
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, 
                                             do_normalize=True, return_attention_mask=True)

### Create Wav2Vec2Processor

In [18]:
# Bundle the feature extractor and tokenizer into one processor and save it
# under new_output_models_dir.
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
processor.save_pretrained(new_output_models_dir)

### Data Preprocessing

In [19]:
@cache
def load_wav(path_to_file):
    """Load an audio file with torchaudio, memoising the result per path.

    Because of `@cache`, every distinct path is read at most once and its
    result (including a failed load) stays in memory for the life of the
    kernel.

    Args:
        path_to_file: path of the audio file to read.

    Returns:
        Whatever `torchaudio.load` returns (unpacked by the caller as
        `(waveform, sampling_rate)`), or None when loading raised
        RuntimeError.
    """
    try:
        return torchaudio.load(path_to_file)
    except RuntimeError as rtm_err:
        # Still best-effort, but no longer silent: rows whose audio is missing
        # are later removed by dropna, so report the failing file. The None
        # result is cached too, so each bad path is reported only once.
        warnings.warn(f"Could not load audio file {path_to_file!r}: {rtm_err}", RuntimeWarning)
        return None

def speech_to_array_fn(row):
    """Cut one utterance out of its source recording.

    Args:
        row: object exposing `filename`, `start_time` and `end_time`
            attributes (times in seconds, convertible with float()).

    Returns:
        The samples of the recording's first channel between the start and
        end times, or None when the recording could not be loaded.

    The sampling rate reported by the loader is used only to convert seconds
    to sample offsets; no resampling is done here.
    """
    name, begin, finish = row.filename, row.start_time, row.end_time

    loaded = load_wav(wav_dir + name + ".wav")
    if loaded is None:
        return None
    waveform, rate = loaded

    # Seconds -> sample offsets (truncated towards zero).
    first_sample = int(float(begin) * rate)
    last_sample = int(float(finish) * rate)
    return waveform[0][first_sample:last_sample]

In [20]:
def split_df(df, splits):
    """Partition `df` row-wise into `splits` consecutive chunks.

    Every row of `df` ends up in exactly one chunk. Chunk sizes differ by at
    most one row (the first `len(df) % splits` chunks get the extra row), so
    when `len(df)` is divisible by `splits` the chunks are the same equal
    slices as before. Each chunk is an independent copy, so callers can add
    columns to it without pandas' SettingWithCopyWarning.

    Args:
        df: DataFrame to partition.
        splits: number of chunks to produce; must be >= 1.

    Returns:
        A list of `splits` DataFrames in original row order (some may be
        empty when `df` has fewer rows than `splits`).

    Raises:
        ValueError: if `splits` is smaller than 1.
    """
    if splits < 1:
        raise ValueError(f"splits must be a positive integer, got {splits}")

    base_size, extra_rows = divmod(len(df), splits)
    chunks = []
    start = 0
    for chunk_idx in range(splits):
        # Spread the remainder over the leading chunks instead of dropping it.
        stop = start + base_size + (1 if chunk_idx < extra_rows else 0)
        chunks.append(df.iloc[start:stop, :].copy())
        start = stop
    return chunks

In [21]:
# The training set is processed in chunks because a single bulk `apply` froze;
# the per-chunk message doubles as a progress indicator.
train_splits = split_df(train, 10)

processed_splits = []
for split_idx, chunk in enumerate(train_splits):
    # `assign` builds a new frame, so no column is set on a slice of `train`
    # (which is what triggered pandas' SettingWithCopyWarning).
    processed_splits.append(chunk.assign(speech=chunk.apply(speech_to_array_fn, axis=1)))
    print(f"Finished processing t{split_idx}")

# Rows whose audio could not be loaded carry None in `speech` and are dropped here.
train = pd.concat(processed_splits).dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  t["speech"] = t.apply(speech_to_array_fn, axis=1)


Finished processing t0
Finished processing t1
Finished processing t2
Finished processing t3
Finished processing t4
Finished processing t5
Finished processing t6
Finished processing t7
Finished processing t8
Finished processing t9


In [22]:
# Attach each dev utterance's samples, then discard rows with any missing value
# (speech is None when the recording could not be loaded).
dev = dev.assign(speech=dev.apply(speech_to_array_fn, axis=1)).dropna()

In [23]:
# Attach each test utterance's samples, then discard rows with any missing value
# (speech is None when the recording could not be loaded).
test = test.assign(speech=test.apply(speech_to_array_fn, axis=1)).dropna()

In [24]:
# Preview the training rows that survived dropna, now with the `speech` column.
train.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme,speech
86,comedy_78_first_12min,0.0,9.675,عزيزي المشاهد البرنامج ده مش برنامج كوميدي وبس,Ezyzy Alm$Ahd AlbrnAmj dh m$ brnAmj kwmydy wbs...,"[tensor(0.), tensor(0.), tensor(0.), tensor(3...."
87,comedy_78_first_12min,9.675,19.343,ده حالة انسانية استضفنا في البرنامج ده ناس كتي...,dh HAlp <nsAnyp AstDfnA fy AlbrnAmj dh nAs kty...,"[tensor(0.0029), tensor(0.0040), tensor(0.0028..."
88,comedy_78_first_12min,19.343,28.335,اضطرينا نعرضهم لضغوط كتير زي ما بيحصل في الواق...,ADTrynA nErDhm lDgwT ktyr zy mA byHSl fy AlwAq...,"[tensor(0.0386), tensor(0.0294), tensor(0.0154..."
89,comedy_78_first_12min,99.444,109.108,بصوا يا جماعة احنا عمالين نجيب اشكال والوان بس...,bSwA yA jmAEp <HnA EmAlyn njyb >$kAl w>lwAn bs...,"[tensor(-0.0039), tensor(-0.0038), tensor(-0.0..."
90,comedy_78_first_12min,119.174,128.396,مراته دكتورة صيدلانية وعندها صيدلية في المعادي...,mrAth dktwrp SydlAnyp wEndhA Sydlyp fy AlmEAdy...,"[tensor(0.0020), tensor(0.0065), tensor(0.0056..."


In [25]:
# Export the three frames to CSV under data_dir.
# NOTE(review): the `speech` column holds audio sample tensors, which CSV can only
# store as text — presumably these files are for inspection only; confirm nothing
# downstream expects to reload the audio from them.
train.to_csv(f"{data_dir}train.csv")
dev.to_csv(f"{data_dir}dev.csv")
test.to_csv(f"{data_dir}test.csv")

In [26]:
# Listen to a randomly chosen training utterance as a sanity check.
# No seed is set, so a different clip is picked on every run.
# rate=16000 assumes the recordings are sampled at 16 kHz — TODO confirm.
rand_int = random.randint(0, len(train)-1)
ipd.Audio(data=np.asarray(train.iloc[rand_int]["speech"]), autoplay=True, rate=16000)

In [27]:
def prepare_dataset(row):
    """Add the model inputs and label ids to one DataFrame row.

    Reads `row["speech"]` and `row["sentence"]` and writes three new entries
    on the same row object:
      - "input_values": the processor's output for the audio, un-batched;
      - "input_length": the length of "input_values";
      - "labels": the token ids the processor produces for the sentence
        while in target-processor mode.

    The audio is passed to the processor with sampling_rate=16_000.

    Returns:
        The same `row`, mutated.
    """
    features = processor(row["speech"], sampling_rate=16_000)

    # The processor returns a batch; take its only element.
    row["input_values"] = features.input_values[0]
    row["input_length"] = len(row["input_values"])

    with processor.as_target_processor():
        encoded_text = processor(row["sentence"])
        row["labels"] = encoded_text.input_ids
    return row

In [28]:
# Run prepare_dataset on every row of each split; the returned rows carry the
# extra input_values / input_length / labels entries.
train = train.apply(prepare_dataset, axis=1)
dev   = dev.apply(prepare_dataset, axis=1)
test  = test.apply(prepare_dataset, axis=1)

In [29]:
# Keep only the columns produced by prepare_dataset; the transcript, timing and
# raw speech columns are dropped from all three frames.
feature_names = ["input_values", "input_length", "labels"]
train = train[feature_names]
dev   = dev[feature_names]
test  = test[feature_names]

In [30]:
# Preview the reduced training frame (only the feature_names columns remain).
train.head()

Unnamed: 0,input_values,input_length,labels
86,"[-0.0035943196, -0.0035943196, -0.0035943196, ...",154800,"[24, 17, 35, 17, 35, 0, 6, 29, 30, 19, 6, 32, ..."
87,"[0.011993948, 0.01705188, 0.01185345, 0.017473...",154688,"[14, 32, 0, 12, 6, 29, 8, 0, 6, 31, 18, 6, 31,..."
88,"[0.14678994, 0.1116823, 0.05755315, 0.00178016...",143872,"[6, 21, 22, 16, 35, 31, 6, 0, 31, 24, 16, 21, ..."
89,"[-0.025502361, -0.02449013, -0.023073005, -0.0...",154624,"[7, 20, 33, 6, 0, 35, 6, 0, 11, 30, 6, 24, 8, ..."
90,"[0.014637972, 0.04831052, 0.04107883, 0.011022...",147551,"[30, 16, 6, 9, 32, 0, 14, 28, 9, 33, 16, 8, 0,..."


In [31]:
# Report how many utterances remain in each split after the dropna steps.
print(f"After preprocessing:\nTrain length: {len(train)}\nDev length: {len(dev)}\nTest length: {len(test)}", flush=True)
print(f"Total dataset size: {len(train)+len(dev)+len(test)} samples")

After preprocessing:
Train length: 3410
Dev length: 1006
Test length: 1104
Total dataset size: 5520 samples


In [32]:
# Serialise the three prepared frames with pickle protocol 4. The files are
# written to the current working directory, not to data_dir.
for pickle_path, frame in (("train.pkl", train), ("dev.pkl", dev), ("test.pkl", test)):
    with open(pickle_path, "wb") as f:
        pickle.dump(frame, f, protocol=4)