# Fine-Tuning Wav2Vec2-Large-XLSR-53 on MGB-3

In [1]:
import json
import os
import re

import numpy as np
import pandas as pd
import torchaudio
from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2FeatureExtractor, Wav2Vec2Processor

### Constants

In [2]:
# Language identifiers interpolated into the output model paths below.
language_code, language_name = 'ar', 'arabic'

# Name of the pretrained checkpoint.
base_model = "facebook/wav2vec2-large-xlsr-53"

# Per-split folders; `load_data` reads `text_noverlap` and `text_noverlap.bw` from each.
train_dir, dev_dir, test_dir = "./mgb3/adapt/Omar/", "./mgb3/dev/Omar/", "./mgb3/test/Omar/"
# NOTE(review): presumably the folder with the audio files -- not used in the cells shown here.
wav_dir = "./mgb3/wav/"

data_dir = "./data/"

# Column order of the rows returned by `load_data`.
columns = "filename start_time end_time sentence phoneme".split()

# Output folders; the "-demo" folder is the same path with a "-demo" suffix.
new_output_models_dir = f"./workspace/output_models/{language_code}/wav2vec2-large-xlsr-{language_name}"
output_models_dir = f"{new_output_models_dir}-demo"

## Prepare Data, Tokenizer, Feature Extractor
### Prepare Data

In [3]:
def split(strng, sep, pos):
    """Cut `strng` in two after its first `pos` `sep`-separated pieces.

    The string is broken on every occurrence of `sep`; the first `pos`
    pieces are re-joined into the head and whatever is left into the tail.

    Args:
        strng: Text to cut.
        sep: Separator used both for breaking and for re-joining.
        pos: Number of leading pieces that belong to the head.

    Returns:
        list: `[head, tail]`. The tail is an empty string when `strng`
        has `pos` pieces or fewer.
    """
    pieces = strng.split(sep)
    head = sep.join(pieces[:pos])
    tail = sep.join(pieces[pos:])
    return [head, tail]

def parse_text_lines(lines):
    """Turn raw transcript lines into `[name, start, end, speech]` rows.

    Each line is expected to look like `<id> <speech>`, where `<id>` is
    `<name>_<start>_<end>` and `<name>` itself consists of the first four
    underscore-separated pieces of the id.

    Args:
        lines: Iterable of transcript lines.

    Returns:
        list: One four-element list of strings per input line. The speech
        text is everything after the first space and is not stripped, so a
        trailing newline present in the input is kept.

    Raises:
        IndexError: If a line has no space, or its id has no end time.
    """
    rows = []
    for raw in lines:
        # Only the first space separates the id from the speech.
        fields = raw.split(" ", 1)
        # The first four "_" pieces form the name; the rest is "<start>_<end>".
        name, span = split(fields[0], "_", 4)
        bounds = span.split('_')
        rows.append([name, bounds[0], bounds[1], fields[1]])
    return rows

def load_data(data_dir):
    """Load one split's transcripts and attach the matching `.bw` line to each row.

    Reads `text_noverlap` (parsed with `parse_text_lines`) and
    `text_noverlap.bw` from `data_dir`. The two files are paired line by line.

    Args:
        data_dir: Folder containing both files. A trailing path separator is
            optional because the paths are built with `os.path.join`.

    Returns:
        list: One `[filename, start_time, end_time, sentence, phoneme]` list
        of strings per line, where `phoneme` is the text after the first
        space of the corresponding `.bw` line.

    Raises:
        ValueError: If the two files do not have the same number of lines.
        IndexError: If a `.bw` line contains no space.
    """
    text_path = os.path.join(data_dir, "text_noverlap")
    bw_path = os.path.join(data_dir, "text_noverlap.bw")

    # NOTE(review): errors='ignore' silently drops bytes that are not valid
    # UTF-8; kept as in the original, but corrupt input will go unnoticed.
    with open(text_path, encoding='utf-8', errors='ignore') as f:
        rows = parse_text_lines(f.readlines())

    with open(bw_path, encoding='utf-8', errors='ignore') as f:
        # Drop the leading id; keep the rest of the line untouched.
        phonemes = [line.split(" ", 1)[1] for line in f]

    # Pairing is purely positional, so a length mismatch means misaligned data.
    if len(rows) != len(phonemes):
        raise ValueError(
            f"{text_path} has {len(rows)} lines but {bw_path} has {len(phonemes)}"
        )

    for row, phoneme in zip(rows, phonemes):
        row.append(phoneme)
    return rows

In [4]:
# Parse the training transcripts (`text_noverlap` and `text_noverlap.bw` under
# train_dir) into rows matching `columns`.
lines = load_data(train_dir)

data = pd.DataFrame(lines, columns=columns)
# Store the table next to the raw transcripts; a later cell reloads it with pd.read_csv.
data.to_csv(f'{train_dir}train.csv', index=False)
data.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_72_first_12min,23.652,31.356,مساء الخير أهلاً ومرحباً بيكم في حلقة جديدة من...,msA' Alxyr >hlA wmrHbA bykm fy Hlqp jdydp mn b...
1,comedy_72_first_12min,31.356,36.621,عشان يظهر موهبته للناس كلها، نفسه بس إن هو ياخ...,E$An yZhr mwhbth llnAs klhA nfsh bs <n hw yAxd...
2,comedy_72_first_12min,36.621,46.166,يمكن إحنا زمان الفرصة أتاحت لنا أتاحت لنا الفر...,ymkn <HnA zmAn AlfrSp >tAHt lnA >tAHt lnA Alfr...
3,comedy_72_first_12min,46.166,51.581,وجه الزمن وجه الوقت وجه المكان اللي نقدم ناس غ...,wjh Alzmn wjh Alwqt wjh AlmkAn Ally nqdm nAs g...
4,comedy_72_first_12min,51.581,59.684,الحمد لله إحنا بعد ما إتشهرنا جداً والناس عرفت...,AlHmd llh <HnA bEd mA <t$hrnA jdA wAlnAs Erftn...


In [5]:
# Parse the dev transcripts (`text_noverlap` and `text_noverlap.bw` under
# dev_dir) into rows matching `columns`.
lines = load_data(dev_dir)

data = pd.DataFrame(lines, columns=columns)
# Store the table next to the raw transcripts; a later cell reloads it with pd.read_csv.
data.to_csv(f'{dev_dir}dev.csv', index=False)
data.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_75_first_12min,0.0,8.19,أهلاً وسهلاً، أهلاً وسهلاً ومرحباً بيكم وحلقة ...,>hlA wshlA >hlA wshlA wmrHbA bykm wHlqp jdydp ...
1,comedy_75_first_12min,8.19,16.7,حلقة النهارده بنتكلم فيها عن التحرش\n,Hlqp AlnhArdh bntklm fyhA En AltHr$ \n
2,comedy_75_first_12min,16.7,24.506,يا سلام ع الفرحة يا سلام، طبعاً هو التحرش ده أ...,yA slAm E AlfrHp yA slAm TbEA hw AltHr$ dh >ky...
3,comedy_75_first_12min,24.506,33.824,كان موجود من زمان بس يمكن في الفترة الأخيرة اب...,kAn mwjwd mn zmAn bs ymkn fy Alftrp Al>xyrp Ab...
4,comedy_75_first_12min,33.824,41.227,طب هل المشكلة في اللبس هل المشكلة في الأخلاق ه...,Tb hl Alm$klp fy Allbs hl Alm$klp fy Al>xlAq h...


In [6]:
# Parse the test transcripts (`text_noverlap` and `text_noverlap.bw` under
# test_dir) into rows matching `columns`.
lines = load_data(test_dir)

data = pd.DataFrame(lines, columns=columns)
# Store the table next to the raw transcripts; a later cell reloads it with pd.read_csv.
data.to_csv(f'{test_dir}test.csv', index=False)
data.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_09_first_12min,4.754,12.352,سلامات عليكم، حأكلمكو النهارده عن الـ (uncerta...,slAmAt Elykm H>klmkw AlnhArdh En Al @@LAT(unce...
1,comedy_09_first_12min,12.352,19.989,الأول في معضلة لغوية كده عايز أخلص منها الأول،...,Al>wl fy mEDlp lgwyp kdh EAyz >xlS mnhA Al>wl ...
2,comedy_09_first_12min,19.989,28.455,لو حد بص على الترجمة بتاعتها في القاموس حيلاقي...,lw Hd bS ElY Altrjmp btAEthA fy AlqAmws HylAqy...
3,comedy_09_first_12min,28.455,37.159,وأول ملحوظة عايزين نلاحظها على الحكاية دي، إن ...,w>wl mlHwZp EAyzyn nlAHZhA ElY AlHkAyp dy <n m...
4,comedy_09_first_12min,37.159,46.42,الناقص ممكن يبقى واحد ممكن يبقى عشرة ممكن يبقى...,AlnAqS mmkn ybqY wAHd mmkn ybqY E$rp mmkn ybqY...


In [7]:
# Reload the three splits from the CSV files written next to the raw transcripts.
train, dev, test = (
    pd.read_csv(csv_path)
    for csv_path in (f'{train_dir}train.csv', f'{dev_dir}dev.csv', f'{test_dir}test.csv')
)

### Create Wav2Vec2CTCTokenizer

In [8]:
chars_to_ignore_regex = '[\,\؟\.\!\-\;\:\'\"\☭\«\»\؛\—\ـ\_\،\“\%\‘\”\�\(\)\#]'

def remove_special_characters(sentence):  
    sentence = re.sub(chars_to_ignore_regex, '', sentence.lower() + " ")
    sentence = re.sub('\n', '', sentence)
    sentence = re.sub('[a-z]','', sentence)
    sentence = re.sub("[إأٱآا]", "ا", sentence)
    noise = re.compile(""" ّ    | # Tashdid
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)
    sentence = re.sub(noise, '', sentence)
    return sentence   

In [9]:
# Normalise the transcripts of every split in place.
# Caveat: remove_special_characters appends a space on each call, so re-running
# this cell without reloading the frames adds another trailing space per row.
for split_frame in (train, dev, test):
    split_frame['sentence'] = split_frame['sentence'].apply(remove_special_characters)

In [10]:
# Spot-check the normalised training transcripts.
train.head()

Unnamed: 0,filename,start_time,end_time,sentence,phoneme
0,comedy_72_first_12min,23.652,31.356,مساء الخير اهلا ومرحبا بيكم في حلقة جديدة من ب...,msA' Alxyr >hlA wmrHbA bykm fy Hlqp jdydp mn b...
1,comedy_72_first_12min,31.356,36.621,عشان يظهر موهبته للناس كلها نفسه بس ان هو ياخد...,E$An yZhr mwhbth llnAs klhA nfsh bs <n hw yAxd...
2,comedy_72_first_12min,36.621,46.166,يمكن احنا زمان الفرصة اتاحت لنا اتاحت لنا الفر...,ymkn <HnA zmAn AlfrSp >tAHt lnA >tAHt lnA Alfr...
3,comedy_72_first_12min,46.166,51.581,وجه الزمن وجه الوقت وجه المكان اللي نقدم ناس غ...,wjh Alzmn wjh Alwqt wjh AlmkAn Ally nqdm nAs g...
4,comedy_72_first_12min,51.581,59.684,الحمد لله احنا بعد ما اتشهرنا جدا والناس عرفتن...,AlHmd llh <HnA bEd mA <t$hrnA jdA wAlnAs Erftn...


In [11]:
def join_chars_df_col(df, col):
    """Concatenate a text column and collect the distinct characters in it.

    Args:
        df: Object for which `df[col]` yields strings (e.g. a DataFrame).
        col: Key of the text column.

    Returns:
        dict: `{"vocab": [chars], "all_text": [text]}`, where `text` is all
        entries joined with single spaces and `chars` lists each distinct
        character of `text` once. Both values are wrapped in one-element
        lists. `chars` comes from a set, so its order is unspecified.
    """
    joined = " ".join(df[col])
    distinct_chars = list({ch for ch in joined})
    return dict(vocab=[distinct_chars], all_text=[joined])

In [12]:
# Character inventory of the 'sentence' column, computed separately per split.
vocab_train, vocab_dev, vocab_test = (
    join_chars_df_col(split_frame, 'sentence') for split_frame in (train, dev, test)
)

In [13]:
# Sorted union of the characters seen in all three splits. The dev split is
# included as well: its inventory is computed alongside train/test, and leaving
# it out would turn any character that only occurs in dev into [UNK].
vocab_list = sorted(
    set(vocab_train["vocab"][0])
    | set(vocab_dev["vocab"][0])
    | set(vocab_test["vocab"][0])
)

In [14]:
# Token id of a character = its position in the sorted character list.
vocab_dict = dict(zip(vocab_list, range(len(vocab_list))))
vocab_dict

{' ': 0,
 '/': 1,
 '2': 2,
 'ء': 3,
 'ؤ': 4,
 'ئ': 5,
 'ا': 6,
 'ب': 7,
 'ة': 8,
 'ت': 9,
 'ث': 10,
 'ج': 11,
 'ح': 12,
 'خ': 13,
 'د': 14,
 'ذ': 15,
 'ر': 16,
 'ز': 17,
 'س': 18,
 'ش': 19,
 'ص': 20,
 'ض': 21,
 'ط': 22,
 'ظ': 23,
 'ع': 24,
 'غ': 25,
 'ف': 26,
 'ق': 27,
 'ك': 28,
 'ل': 29,
 'م': 30,
 'ن': 31,
 'ه': 32,
 'و': 33,
 'ى': 34,
 'ي': 35}

In [15]:
# Re-key the space entry as "|" (same id); "|" is the word delimiter token
# the tokenizer is constructed with.
vocab_dict["|"] = vocab_dict.pop(" ")

In [16]:
# Append the unknown and padding tokens with the next free ids.
# setdefault keeps the cell idempotent: plain assignment would, on a second
# run, move both tokens to the same id (the current dict length).
vocab_dict.setdefault("[UNK]", len(vocab_dict))
vocab_dict.setdefault("[PAD]", len(vocab_dict))
len(vocab_dict)

38

In [17]:
# Serialise the vocabulary to vocab.json in the working directory; the
# tokenizer is built from this file.
with open("vocab.json", 'w') as vocab_file:
    vocab_file.write(json.dumps(vocab_dict))
In [18]:
# Tokenizer over the character vocabulary written to vocab.json.
tokenizer = Wav2Vec2CTCTokenizer(
    "vocab.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    word_delimiter_token="|",
)

# Feature extractor configured for single-channel features at a 16000 Hz sampling rate.
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True,
)

# Bundle both into one processor object.
processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer,
)

In [19]:
# Save the processor (tokenizer + feature extractor) into the model output folder.
processor.save_pretrained(new_output_models_dir)

### Create Wav2Vec2FeatureExtractor

### Data Preprocessing

## Training
### Setup Trainer

### Training