In [1]:
import re
from pathlib import Path
import string
from functools import reduce
from math import log
import itertools
import csv

In [26]:
# Enter smoothing or no smoothing.
smoothing = 1

In [27]:
def load_file(filename):
  dataset = []
  with open(filename, 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
      lyrics = row.get('Lyrics', '')
      dataset.append(lyrics)
  return dataset

In [28]:
def tokenize_sentence(lines):
    tokenized_sentences = [re.findall(r'\b\w+\b', sentence.lower()) for sentence in lines]
    return tokenized_sentences

In [30]:
def prep_data(lines):
    processed_data = []
    for sentence in lines:
        # Remove punctuations
        sentence = [word for word in sentence if word not in string.punctuation]
        # Remove empty strings
        sentence = [word for word in sentence if word]
        # Lowercase all the words
        sentence = [word.lower() for word in sentence]
        # Add <s> at the beginning and </s> at the end of every sentence
        sentence = ['<s>'] + sentence + ['</s>']
        processed_data.append(sentence)
    return processed_data

In [31]:
dataset = load_file("/content/songs.csv")
dataset = tokenize_sentence(dataset)
dataset = prep_data(dataset)

**Output:**

No of sentences in Corpus: 10059

No of sentences in Corpus: 10059

No of sentences in Corpus: 10059


In [32]:
# Creates the vocabulary file of the dataset.
def vocabulary(dataset):
    dataset_vocab = set(itertools.chain.from_iterable(dataset))
    # remove <s> and </s> from the vocabulary of the dataset
    dataset_vocab.remove('<s>')
    dataset_vocab.remove('</s>')
    dataset_vocab = list(dataset_vocab)
    dataset_vocab.append('<s>')
    dataset_vocab.append('</s>')
    return dataset_vocab

dataset_vocab = vocabulary(dataset)

In [33]:
len(dataset_vocab)

48928

In [34]:
def freq_of_unique_words(lines):
    bag_of_words = list(itertools.chain.from_iterable(lines))  # change the nested list to one single list
    word_count = len(bag_of_words) - 2 # No of words in the corpus excluding <s> and </s>.
    #count the no. of times a word repeats
    count = {}

    for word in bag_of_words:
        if word not in count:
            count[word] = 1
        else:
            count[word] += 1


    # Number of unique words in the corpus excluding <s> and </s>
    unique_word_count = len(set(bag_of_words)) - 2  # Subtract 2 for <s> and </s>

    #print("!!! IT IS EXCLUDING <s> AND </s> !!!")
    print("No of unique words : "+ str(unique_word_count))
    print("No of words : "+ str(word_count))

    return count

In [35]:
unique_word_frequency = freq_of_unique_words(dataset)
len(unique_word_frequency)

No of unique words : 48926
No of words : 382097


48928

**`Expected Output: `**

No of unique words in corpus : 17139
No of words in corpus: 218619

In [36]:
def compute_bigram_frequencies(lines):
    bigram_frequencies = dict()
    #unique_bigrams = set()

    for sentence in lines:
        for i in range(len(sentence) - 1):
            current_word = sentence[i]
            next_word = sentence[i + 1]
            bigram = (current_word, next_word)

            if bigram not in bigram_frequencies:
                bigram_frequencies[bigram] = 1
            else:
                bigram_frequencies[bigram] += 1

    #The number of bigram_frquencies
    #print(len(bigram_frequencies))
    return bigram_frequencies


In [38]:
bigram_frequencies = compute_bigram_frequencies(dataset)
#print(bigram_frequencies)
bigram_unique_word_count = len(unique_word_frequency)
print("\n"+"No of words in bigram: "+str(bigram_unique_word_count))


No of words in bigram: 48928


In [39]:
def compute_bigram_probabilities(bigram_frequencies, count):
    bigram_probabilities = dict()
    for key in bigram_frequencies:
        numerator = bigram_frequencies[key]
        denominator = count.get(key[0], 0)
        if (numerator ==0 or denominator==0):
            bigram_probabilities[key] = 0.0
        else:
            bigram_probabilities[key] = numerator / denominator
    return bigram_probabilities



In [40]:
bigram_probabilities = compute_bigram_probabilities(bigram_frequencies,unique_word_frequency)


In [41]:
def compute_bigram_count_test_sentence(given_word,word,smoothing):
    if smoothing==0:
        return 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))
    elif smoothing == 1:
        return 1 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))+1

In [42]:
# A table showing the bigram counts for test sentence.
def print_bigram_freq_test_sentence(test_sentence_vocab,smoothing):
    print("A table showing the bigram counts for test sentence."+"\nsmoothing ="+str(smoothing))
    print("\t\t\t", end="")
    for word in test_sentence_vocab:
        if word != '<s>':
            print(word, end="\t\t")
    print("")
    for given_word in test_sentence_vocab:
        if given_word != '</s>':
            if(smoothing==1):
                print(unique_word_frequency.get(given_word)+bigram_unique_word_count, end ="\t")
            elif(smoothing==0):
                print(unique_word_frequency.get(given_word), end ="\t")
            print(given_word, end="\t\t")
            for word in test_sentence_vocab:
                if word !='<s>':
                    print("{0:}".format(compute_bigram_count_test_sentence(given_word,word,smoothing)), end="\t\t")
            print("")
    print("")

In [43]:
# Bigram probabilities of the test sentence computed using the bigram probabilities of the training data.
# add-one smoothing if 1, no smoothing if 0 ---- smoothing
def compute_bigram_prob_test_sentence(given_word,word,smoothing):
    bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))
    uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))
    if smoothing==0:
        return 0 if bigram_probabilities.get((given_word,word))==None else bigram_probabilities.get((given_word,word))
    elif smoothing == 1:
        numerator = bigram_freq+1
        denominator = uni_freq+bigram_unique_word_count
        return 0.0 if numerator == 0 or denominator == 0 else float(numerator) / float(denominator)

In [44]:
# A table showing the bigram probabilities for test sentence.
def print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing):
    print("A table showing the bigram probabilities for test sentence"+"\nsmoothing ="+str(smoothing))
    print("\t\t", end="")
    for word in test_sentence_vocab:
        if word != '<s>':
            print(word, end="\t\t")
    print("")
    for given_word in test_sentence_vocab:
        if given_word != '</s>':
            print(given_word, end="\t\t")
            for word in test_sentence_vocab:
                if word !='<s>':
                    print("{0:.5f}".format(compute_bigram_prob_test_sentence(given_word,word,smoothing)), end="\t\t")
            print("")
    print("")

In [45]:
# Print the probability of the test sentence
# for add-one smoothing if 1, no smoothing if 0
def compute_prob_test_sentence(sentence,smoothing):
    test_sent_prob = 0

    if(smoothing == 0):
        given_word = None
        for word in sentence:
            if given_word!=None:
                if bigram_probabilities.get((given_word,word))==0 or bigram_probabilities.get((given_word,word))== None:
                    return 0
                else:
                    test_sent_prob+=log((bigram_probabilities.get((given_word,word),0)),10)
            given_word = word

    elif(smoothing ==1):
        given_word = None
        for word in sentence:
            if given_word!=None:
                bigram_freq = 0 if bigram_frequencies.get((given_word,word))==None else bigram_frequencies.get((given_word,word))
                uni_freq = 0 if unique_word_frequency.get((given_word))==None else unique_word_frequency.get((given_word))
                numerator = bigram_freq+1
                denominator = uni_freq+bigram_unique_word_count
                probability = 0 if numerator==0 or denominator ==0 else float(numerator)/float(denominator)
                if(probability==0):
                    return 0
                test_sent_prob +=log(probability,10)
            given_word = word

    return 10**test_sent_prob


In [46]:
# Test sentence here
test_sentences = [['بايع و شاري رابح في خطاري مركيت نهاري تعرف اخباري'],['و باقي نجيبو المال نعيش معاهم رايض أما عندي ناموسي حرب مخّي خايض']]

In [47]:
for i in range (len(test_sentences)):
    test_sentence = test_sentences[i]
    print("!!!!!!!!!!The test Sentence is!!!!!!!!!!")
    print(test_sentence)
    test_sentence = tokenize_sentence(test_sentence)
    test_sentence = prep_data(test_sentence)

    # Vocabulary of test sentence
    test_sentence_vocab = vocabulary(test_sentence)

    test_sentence = list(itertools.chain.from_iterable(test_sentence))
    #test_sentence

    # A table showing the bigram counts for test sentence.
    print_bigram_freq_test_sentence(test_sentence_vocab,smoothing)

    # A table showing the bigram probabilities for test sentence.
    print_bigram_probabilities_test_sentence(test_sentence_vocab,smoothing)

    # The probability of the sentence under the trained model
    print("The probability of the sentence under the trained model"+"\nsmoothing ="+str(smoothing))
    print(compute_prob_test_sentence(test_sentence,0))

!!!!!!!!!!The test Sentence is!!!!!!!!!!
['بايع و شاري رابح في خطاري مركيت نهاري تعرف اخباري']
A table showing the bigram counts for test sentence.
smoothing =1
			اخباري		خطاري		و		شاري		مركيت		في		بايع		تعرف		نهاري		رابح		</s>		
48932	اخباري		1		1		1		1		1		1		1		1		1		1		1		
48932	خطاري		1		1		1		1		5		1		1		1		1		1		1		
50300	و		1		1		4		6		1		10		1		1		1		1		3		
48937	شاري		1		1		1		1		1		1		1		1		1		5		1		
48932	مركيت		1		1		1		1		1		1		1		1		5		1		1		
49746	في		1		5		1		1		1		1		1		1		1		1		1		
48932	بايع		1		1		5		1		1		1		1		1		1		1		1		
48948	تعرف		5		1		1		1		1		1		1		1		1		1		1		
48934	نهاري		1		1		1		1		1		1		1		5		1		1		1		
48939	رابح		1		1		2		1		1		5		1		1		1		1		1		
49920	<s>		1		1		2		1		1		4		3		1		1		1		2		

A table showing the bigram probabilities for test sentence
smoothing =1
		اخباري		خطاري		و		شاري		مركيت		في		بايع		تعرف		نهاري		رابح		</s>		
اخباري		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		0.00002		
خطاري		0.0000