# Partie I : Build a Language Model

In this Notebook we will be building a simple Ngram model that calculate the Ngram probability based on the ngram_size

In a bigram model, the probability of a word $ ( w_2 ) $ given the previous word $ ( w_1 ) $ can be calculated using the formula:

$ P(w_2 | w_1) = \frac{count(w_1, w_2)}{count(w_1)} $


In a trigram model, the probability of a word $ ( w_3 ) $ given the previous words $ ( w_2 , w_1 ) $ can be calculated using the formula:

$ P(w_3 |w_2, w_1) = \frac{count(w_1, w_2, w_3)}{count(w_1, w_2)} $

In [158]:
from collections import defaultdict, Counter
import math
import re
import os

In [2]:
def count_tokens(corpus, exclude_token): return sum(1 for token in corpus if token != exclude_token)

## Building the model

In [175]:
class NgramLanguageModel:

    def __init__(self, ngram_size):

        self.trigram_counts = defaultdict(int)
        self.bigram_counts = defaultdict(int)
        self.unigram_counts = defaultdict(int)

        self.k = 0.01
        self.ngram_size = ngram_size

        self.probabilities = defaultdict(float)

        self.tokens = []
        
        self.replacements = {'.': '.<stop>', '?': '?<stop>', '!': ' !<stop>'}


    def prepare_data(self,infile):

        if os.path.exists(infile) :

            with open(infile, 'r') as file:
                text = file.read()

            for char, replacement in self.replacements.items():
                text = text.replace(char, replacement)

            text = text.lower()

            sentences = text.split("<stop>")
            sentences = [s.strip() for s in sentences]
            sentences = ['<s> '* (self.ngram_size-1) + item + " </s>" for item in sentences]
            tokens = [word for sentence in [sentence.split() for sentence in sentences] for word in sentence]

            for value, count in Counter(tokens).items():
                if(count <= 1) :
                    text = re.sub(r'\b{}\b'.format(re.escape(value)), '<UNK>', text)

            sentences_unk = text.split("<stop>")
            sentences_unk = [s.strip() for s in sentences_unk]
            sentences_unk = ['<s> '* (self.ngram_size-1) + item + " </s>" for item in sentences_unk]

            tokens_unk = [word for sentence in [sentence.split() for sentence in sentences_unk] for word in sentence]

            return tokens_unk, tokens

        else :
            print(f"File {infile} does not exist")


    def train(self, infile):

        self.tokens, _= self.prepare_data(infile)

        if self.ngram_size == 2:

            self.bigram_counts = Counter(zip(self.tokens, self.tokens[1:]))
            self.unigram_counts = Counter(self.tokens)

            for bigram in self.bigram_counts:

                probability = math.log((self.bigram_counts[bigram] + 1 * self.k) / (self.unigram_counts[bigram[0]] + len(set(self.tokens))* self.k))
                self.probabilities[bigram] = probability

        if self.ngram_size == 3:

            self.trigram_counts = Counter(zip(self.tokens, self.tokens[1:], self.tokens[2:]))
            self.bigram_counts = Counter(zip(self.tokens, self.tokens[1:]))

            for trigram in self.trigram_counts:

                previous_bigram = (trigram[0], trigram[1])

                if previous_bigram in self.bigram_counts:

                    previous_word_count = self.bigram_counts[previous_bigram]
                    probability = math.log((self.trigram_counts[trigram] + 1* self.k) / (previous_word_count + len(set(self.tokens))* self.k))
                    self.probabilities[trigram] = probability

        return self.probabilities


    def predict_ngram(self, sentence):

        probability = 0.0

        # Preprocessing the phrase
        sentence = "<s> " * (self.ngram_size-1) + sentence +" </s>"
        tokens = sentence.lower().split(" ")
        corpus = ['<UNK>' if word not in self.tokens else word for word in tokens]

        if self.ngram_size == 2:

              bigrams = Counter(zip(corpus, corpus[1:]))
              for bigram in bigrams :

                  if bigram in self.bigram_counts:
                    probability += self.probabilities[bigram]

                  else :
                    previous_word_count = self.unigram_counts[bigram[0]]
                    probability+= math.log(1* self.k / (previous_word_count + len(set(self.tokens))* self.k))


        elif self.ngram_size == 3:

                trigrams = Counter(zip(corpus, corpus[1:], corpus[2:]))
                for trigram in trigrams:

                    if trigram in self.trigram_counts:
                        probability += self.probabilities[trigram]

                    else:
                        previous_bigram = (trigram[0], trigram[1])

                        if previous_bigram in self.bigram_counts:
                            previous_words_count = self.bigram_counts[previous_bigram]
                            probability += math.log(1* self.k / (previous_words_count + len(set(self.tokens))* self.k))

                        else :
                            probability += math.log(1* self.k / len(set(self.tokens))* self.k)

        return probability


    def perplexity(self, test_file):

        _ , tokens = self.prepare_data(test_file)
        nbr_tokens = count_tokens(tokens, '<s>')

        total_probability = .0

        if self.ngram_size == 2:

            bigrams = Counter(zip(tokens, tokens[1:]))
            for bigram in bigrams :

                if bigram in self.bigram_counts:
                    total_probability+= self.probabilities[bigram]

                else :
                    previous_word_count = self.unigram_counts[bigram[0]]
                    total_probability+= math.log(1* self.k / (previous_word_count + len(set(self.tokens))* self.k))


        elif self.ngram_size == 3:

                trigrams = Counter(zip(tokens, tokens[1:], tokens[2:]))
                for trigram in trigrams:

                    if trigram in self.trigram_counts:
                        total_probability+= self.probabilities[trigram]

                    else:
                        previous_bigram = (trigram[0], trigram[1])

                        if previous_bigram in self.bigram_counts:
                            previous_words_count = self.bigram_counts[previous_bigram]
                            total_probability+= math.log(1* self.k / (previous_words_count + len(set(self.tokens))* self.k))
                        else :
                            total_probability+= math.log(1* self.k / len(set(self.tokens))* self.k)

        normalized_p = total_probability / nbr_tokens
        perplexity = math.exp(- normalized_p)

        return perplexity



## Initialize the NgramLanguageModel

In [176]:
model = NgramLanguageModel(3)

## Test the Split_into_sentences output

In [177]:
model.prepare_data("/Users/houdamoudni/Desktop/Projects/PA_files/ngramv1.train")

(['<s>',
  '<s>',
  'i',
  'am',
  'sam',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'am',
  'sam',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'sam',
  'i',
  'am',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'that',
  'sam',
  'i',
  'am',
  '!',
  '</s>',
  '<s>',
  '<s>',
  'that',
  'sam',
  'i',
  'am',
  '!',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'do',
  'not',
  'like',
  'that',
  'sam',
  'i',
  'am',
  '!',
  '</s>',
  '<s>',
  '<s>',
  'do',
  'would',
  'you',
  'like',
  'green',
  'eggs',
  'and',
  'ham',
  '?',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'do',
  'not',
  'like',
  'them',
  ',',
  'sam',
  'i',
  'am',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'do',
  'not',
  'like',
  'green',
  'eggs',
  'and',
  'ham',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'would',
  'you',
  'like',
  'them',
  'here',
  'or',
  'there',
  '?',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'would',
  'not',
  'like',
  'them',
  'here',
  'or',
  'there',
  '.',
  '</s>',
  '<s>',
  '<s>',
  'i',
  'would',
 

## Train the model

In [178]:
model.train("/Users/houdamoudni/Desktop/Projects/PA_files/ngramv1.train")

defaultdict(float,
            {('<s>', '<s>', 'i'): -0.7858281305955576,
             ('<s>', 'i', 'am'): -3.3002494490052054,
             ('i', 'am', 'sam'): -1.9767033448251121,
             ('am', 'sam', '.'): -0.2221480310727082,
             ('sam', '.', '</s>'): -0.2221480310727082,
             ('.', '</s>', '<s>'): -0.0069194851227258655,
             ('</s>', '<s>', '<s>'): -0.00426403511498308,
             ('<s>', '<s>', 'sam'): -3.673057266439573,
             ('<s>', 'sam', 'i'): -1.2456657066246064,
             ('sam', 'i', 'am'): -0.04078868838692761,
             ('i', 'am', '.'): -0.4765029952758499,
             ('am', '.', '</s>'): -0.054008804937052395,
             ('<s>', '<s>', 'that'): -4.076862623129373,
             ('<s>', 'that', 'sam'): -0.2221480310727082,
             ('that', 'sam', 'i'): -0.15367595871699005,
             ('i', 'am', '!'): -1.5728979881353122,
             ('am', '!', '</s>'): -0.15367595871699005,
             ('!', '</s>', '<s>'): 

## Test the model

In [169]:
model.predict_ngram("I am houda")

-20.03733630226725

## The perplexity

In [170]:
model.perplexity("/Users/houdamoudni/Desktop/Projects/PA_files/ngramv1.test")

4.525427874723109

***
**Made By :**
- *Houda Moudni* : houda.moudni@etu.uae.ac.ma
- *Chadi Mountassir* : chadi.mountassir@etu.uae.ac.ma