In [1]:
import os 
import pandas as pd
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import Laplace
import pickle
import numpy as np
import textdistance
from collections import Counter
from jiwer import wer

In [3]:
# Location of the transcript metadata table.
metadata_path = 'metadata.csv'

# Read the table, drop its first column (presumably an index column written by
# an earlier export -- confirm), then shuffle with a fixed seed so the
# positional split below is random but repeatable.
metadata_df = (
    pd.read_csv(metadata_path)
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .sample(frac=1, random_state=0)
    .reset_index(drop=True)
)

# Positional boundaries at 85% and 95% of the shuffled rows.
split1 = int(len(metadata_df) * 0.85)  # end of the 85% portion (not used in this cell)
split2 = int(len(metadata_df) * 0.95)  # end of the 95% portion

# NOTE: the training frame is cut at split2, not split1, so it holds the first
# 95% of the rows (the 85% and 10% portions together); the last 5% is the test set.
df_train = metadata_df[:split2]
df_test = metadata_df[split2:]

print(f"Size of the training set: {len(df_train)}")
print(f"Size of the test set: {len(df_test)}")
metadata_df.head(10)

Size of the training set: 22512
Size of the test set: 1185


Unnamed: 0,wav_filename,transcript
0,resampled_audio_14440,air berlin eight four three nine
1,resampled_audio_25292,csa six three five runway three one clear to l...
2,resampled_audio_8178,hapag lloyd three five five one descend now to...
3,resampled_audio_4829,sunwing five two one two contact zurich one th...
4,resampled_audio_12516,eurotrans four eight zero
5,resampled_audio_17839,aeroflot one four one turn right heading two t...
6,resampled_audio_7488,fox oscar kilo sierra india good morning radar...
7,resampled_audio_12451,central charter one five eight praha radar con...
8,resampled_audio_8987,thank you
9,resampled_audio_2534,and rate two thousand feet a minute or more


In [4]:
# Gather every transcript of the training frame into a plain Python list,
# keeping the row order of df_train.
sentence = list(df_train['transcript'])
print(sentence)

['air berlin eight four three nine', 'csa six three five runway three one clear to land wind one six zero degrees three knots', 'hapag lloyd three five five one descend now to flight level two five zero', 'sunwing five two one two contact zurich one three four decimal six', 'eurotrans four eight zero', 'aeroflot one four one turn right heading two two zero vec', 'fox oscar kilo sierra india good morning radar contact', 'central charter one five eight praha radar contact', 'thank you', 'and rate two thousand feet a minute or more', 'csa six three five turn left heading three four zero cleared ils approach runway three one report when established', 'end flight level one zero zero down level one hundred one zero zero', 'delta two one zero descend flight level nine zero', 'ank you direct to and no speed limit csa four y a', 'line up runway three one to wait easy two five eight', 'transwede one zero seven geneva one three three one five good bye', 'and csa four two zero confirm three seven 

In [5]:
# Tokenise each transcript on whitespace: one list of words per sentence.
text = [transcript.split() for transcript in sentence]
print(text)

[['air', 'berlin', 'eight', 'four', 'three', 'nine'], ['csa', 'six', 'three', 'five', 'runway', 'three', 'one', 'clear', 'to', 'land', 'wind', 'one', 'six', 'zero', 'degrees', 'three', 'knots'], ['hapag', 'lloyd', 'three', 'five', 'five', 'one', 'descend', 'now', 'to', 'flight', 'level', 'two', 'five', 'zero'], ['sunwing', 'five', 'two', 'one', 'two', 'contact', 'zurich', 'one', 'three', 'four', 'decimal', 'six'], ['eurotrans', 'four', 'eight', 'zero'], ['aeroflot', 'one', 'four', 'one', 'turn', 'right', 'heading', 'two', 'two', 'zero', 'vec'], ['fox', 'oscar', 'kilo', 'sierra', 'india', 'good', 'morning', 'radar', 'contact'], ['central', 'charter', 'one', 'five', 'eight', 'praha', 'radar', 'contact'], ['thank', 'you'], ['and', 'rate', 'two', 'thousand', 'feet', 'a', 'minute', 'or', 'more'], ['csa', 'six', 'three', 'five', 'turn', 'left', 'heading', 'three', 'four', 'zero', 'cleared', 'ils', 'approach', 'runway', 'three', 'one', 'report', 'when', 'established'], ['end', 'flight', 'leve

In [6]:
# Fit a 3-gram language model with NLTK on the tokenised training sentences.
# Highest n-gram order used by the model.
n = 3
# Per the NLTK docs, padded_everygram_pipeline returns two lazy iterators: the
# padded n-grams of every order up to n for training, and the padded word
# stream for building the vocabulary. They are consumed by fit() below, so
# this line has to be re-run before fitting again.
train, vocab = padded_everygram_pipeline(n, text)
# Laplace is NLTK's add-one smoothed n-gram model (per the NLTK docs).
model=Laplace(n)
model.fit(train, vocab)
# Shows the vocabulary summary (cutoff, unknown label, item count).
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 2310 items>


In [34]:
import pickle

# Persist the fitted model so later cells can reload it without refitting.
with open('deepspeech_languagemodel_nltk', 'wb') as model_file:
    pickle.dump(model, model_file)

In [36]:
# Sanity check: score "one" as the word following "flight level".
# NOTE(review): everygram_score is first defined in a later cell of this file,
# so that cell has to be run before this one on a fresh kernel.
everygram_score('flight','level','one')

-3.4406967822785646

In [37]:
# Sanity check: score "hello" as the word following "flight level", for
# comparison with the score of "one" in the same context.
everygram_score('flight','level','hello')

-11.671889278673982

In [38]:
# Sanity check: score "banana" as the word following "flight level", for
# comparison with the scores of "one" and "hello" in the same context.
everygram_score('flight','level','banana')

-14.686687318460136

In [5]:
# Reload the pickled language model into `lm`, the name the scoring functions
# read. pickle.load executes code from the file, so only load trusted files.
with open('deepspeech_languagemodel_nltk', 'rb') as model_file:
    lm = pickle.load(model_file)

def everygram_score(word1,word2,word3):
    """Return the mean trigram/bigram/unigram log-score of ``word3``.

    ``word3`` is scored with the global model ``lm`` three times: given the
    two preceding words, given only ``word2``, and with no context. The three
    log-scores are averaged with equal weight.

    Args:
        word1: Word two positions before the scored word.
        word2: Word directly before the scored word.
        word3: The word being scored.

    Returns:
        The arithmetic mean of the three ``lm.logscore`` values.
    """
    two_word_context = ' '.join([word1, word2]).split()
    one_word_context = word2.split()

    trigram_logscore = lm.logscore(word3, two_word_context)
    bigram_logscore = lm.logscore(word3, one_word_context)
    unigram_logscore = lm.logscore(word3)

    return (trigram_logscore + bigram_logscore + unigram_logscore)/3
    

## 4-gram language model

In [8]:
# Fit a 4-gram language model on the same tokenised training sentences.
# Highest n-gram order used by the model.
n = 4
# Per the NLTK docs, padded_everygram_pipeline returns two lazy iterators: the
# padded n-grams of every order up to n for training, and the padded word
# stream for building the vocabulary. They are consumed by fit() below.
train, vocab = padded_everygram_pipeline(n, text)
# Laplace is NLTK's add-one smoothed n-gram model (per the NLTK docs).
# NOTE: this rebinds `model`, replacing any previously fitted model object.
model=Laplace(n)
model.fit(train, vocab)
# Shows the vocabulary summary (cutoff, unknown label, item count).
print(model.vocab)

<Vocabulary with cutoff=1 unk_label='<UNK>' and 2310 items>


In [7]:
import pickle

# Persist the fitted model under the 4-gram file name for later reloading.
with open('deepspeech_languagemodel_nltk_4gram', 'wb') as model_file:
    pickle.dump(model, model_file)

In [9]:
# Reload the pickled 4-gram model into `lm`, the name the scoring functions
# read. pickle.load executes code from the file, so only load trusted files.
with open('deepspeech_languagemodel_nltk_4gram', 'rb') as model_file:
    lm = pickle.load(model_file)

def everygram_score(word1, word2, word3, word4=None):
    """Interpolated n-gram log-score of the last word given the preceding ones.

    Called with four words, ``word4`` is scored with the global model ``lm``
    under its 3-word, 2-word, 1-word and empty contexts, and the log-scores
    are combined with weights 0.4, 0.3, 0.2 and 0.1.

    Called with three words (``word4`` omitted), ``word3`` is scored under its
    2-word, 1-word and empty contexts and the three log-scores are averaged
    with equal weight. This keeps code written against the three-argument
    trigram version of this function (e.g. ``get_corrected_sentence``) working
    after this definition replaces it.

    Both forms use whichever model is currently bound to ``lm``.

    Args:
        word1, word2, word3: Consecutive words; the context, except that
            ``word3`` is the scored word when ``word4`` is omitted.
        word4: The word to score, or ``None`` for the three-word form.

    Returns:
        The weighted (four-word form) or averaged (three-word form) log-score.
    """
    if word4 is None:
        # Three-word form: equal-weight trigram/bigram/unigram average.
        return (lm.logscore(word3, (word1+' '+word2).split())
                + lm.logscore(word3, word2.split())
                + lm.logscore(word3))/3

    # Four-word form: longer contexts get the larger interpolation weights.
    return 0.4*lm.logscore(word4,(word1+' '+word2+' '+word3).split())\
         + 0.3*lm.logscore(word4,(word2+' '+word3).split())\
         + 0.2*lm.logscore(word4,word3.split())\
         + 0.1*lm.logscore(word4)


In [10]:
def calculate_score(prediction):
    """Sum the 4-gram scores of every word in ``prediction``, padding included.

    The sentence is split on whitespace and padded with three ``<s>`` markers
    in front and three ``</s>`` markers behind. Every 4-token window over the
    padded sequence is passed to ``everygram_score`` and the results are added
    up, so each word and each of the three end markers is scored once in the
    context of the three tokens before it.

    An empty (or whitespace-only) prediction returns the score of a single
    space following three start markers.

    Args:
        prediction: The sentence to score, as one string.

    Returns:
        The accumulated ``everygram_score`` of the padded sentence.
    """
    tokens = prediction.split()
    if not tokens:
        return everygram_score('<s>','<s>','<s>',' ')

    padded = ['<s>'] * 3 + tokens + ['</s>'] * 3
    windows = (padded[start:start + 4] for start in range(len(padded) - 3))

    # Seed the total with the first window, then accumulate the rest in order.
    total = everygram_score(*next(windows))
    for window in windows:
        total += everygram_score(*window)
    return total

## Spelling checker

In [8]:
# Build the word -> relative-frequency table used by the spelling checker.
# Flatten the tokenised sentences into one list of word occurrences.
words = [token for tokens in text for token in tokens]

V = set(words)
word_freq = Counter(words)

# Relative frequency of each distinct word over all occurrences.
Total = sum(word_freq.values())
probs = {word: count/Total for word, count in word_freq.items()}

# Remove words whose relative frequency is below the threshold from the
# probability table, the counter and the vocabulary set alike.
rare_words = [word for word, prob in probs.items() if prob<5*10**-5]
for word in rare_words:
    del probs[word]
    del word_freq[word]
    V.remove(word)

In [9]:
# with open('deepspeech2_trained/autocorrect_probability', 'wb') as fout:
#     pickle.dump(probs, fout)

In [11]:
# Spelling corrector: load the pickled probability table into `probs`
# (presumably the word -> probability dict built above -- confirm).
# pickle.load executes code from the file, so only load trusted files.
with open('deepspeech2_trained/autocorrect_probability', 'rb') as probs_file:
    probs = pickle.load(probs_file)
    
def autocorrect(input_word, top_n=3):
    """Suggest the vocabulary words most similar to ``input_word``.

    Similarity to each word in the global ``probs`` table is one minus the
    mean of the normalized Levenshtein distance and the Jaccard distance with
    ``qval=2``. Candidates are ranked by similarity, ties broken by the word's
    probability, both descending.

    Args:
        input_word: The word to find replacements for.
        top_n: How many candidates to return (default 3).

    Returns:
        A pandas Series named ``'Word'`` holding the candidates, best first.
        A word that is already in ``probs`` is returned as its own single
        candidate, and an empty ``probs`` yields an empty Series, so the
        result can always be iterated.
    """
    # Known word: it is its own (only) candidate rather than an implicit None.
    if input_word in probs:
        return pd.Series([input_word], name='Word')
    # Nothing to compare against; sorting an empty frame by 'Prob' would fail.
    if not probs:
        return pd.Series([], dtype=object, name='Word')

    # Built once instead of once per vocabulary word.
    jaccard = textdistance.Jaccard(qval=2)
    sim = [
        1-(textdistance.levenshtein.normalized_distance(v,input_word)+jaccard.distance(v,input_word))/2
        for v in probs
    ]
    df = pd.DataFrame({
        'Word': list(probs),
        'Prob': list(probs.values()),
        'Similarity': sim,
    })
    ranked = df.sort_values(['Similarity', 'Prob'], ascending=False)
    return ranked.head(top_n)['Word']

def get_corrected_sentence(prediction):
    """Replace out-of-vocabulary words in ``prediction`` using their context.

    The sentence is split on whitespace and processed left to right. A word
    found in the global ``probs`` table is kept. Any other word is scored in
    context, and each candidate from ``autocorrect`` is scored the same way;
    the word is replaced by a candidate only if that candidate scores strictly
    higher than the best score so far. Corrections made earlier in the
    sentence are used as context for later words.

    The context score of a word is the mean of two ``everygram_score`` calls:
    the word given its two predecessors, and its successor given the word and
    its predecessor. Missing predecessors are ``<s>`` and a missing successor
    is ``</s>``.

    NOTE: ``everygram_score`` is called with three arguments here, so the
    definition bound when this runs must accept that form.

    Args:
        prediction: The sentence to correct, as one string.

    Returns:
        The corrected sentence with words joined by single spaces.
    """
    tokens = prediction.split()
    last_position = len(tokens) - 1

    def context_score(position, candidate):
        # Neighbours are read from `tokens` at call time, so they reflect
        # corrections already applied to earlier positions.
        far_left = tokens[position - 2] if position >= 2 else '<s>'
        near_left = tokens[position - 1] if position >= 1 else '<s>'
        right = tokens[position + 1] if position < last_position else '</s>'
        return (everygram_score(far_left, near_left, candidate)
                + everygram_score(near_left, candidate, right))/2

    for position, token in enumerate(tokens):
        if token in probs.keys():
            continue

        best_score = context_score(position, token)
        for candidate in autocorrect(token):
            candidate_score = context_score(position, candidate)
            if candidate_score > best_score:
                best_score = candidate_score
                tokens[position] = candidate

    return ' '.join(tokens)



In [13]:
# Example: the sentence contains the truncated token "decim"; the corrector
# is expected to replace it using the surrounding words.
get_corrected_sentence('foxtrot sierra india contact zurich one three four decim six')

'foxtrot sierra india contact zurich one three four decimal six'