# Using the NLTK Library

In [48]:
import nltk
from nltk.util import ngrams
from collections import Counter
from nltk.probability import FreqDist, MLEProbDist
# Fetch the 'punkt' tokenizer data; this is a no-op when the package is
# already present (see the "already up-to-date" message in the output).
# NOTE(review): presumably required by nltk.word_tokenize in the tokenization
# cell; newer NLTK releases may look for 'punkt_tab' instead — confirm against
# the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [49]:
# Two one-sentence toy corpora: one Shakespeare-style English sentence and one
# French sentence.
shakespeare_corpus = "Oft hath he seen that wherefore he was fain."
foreign_corpus = "Le vent souffle fort sur la mer."

Tokenization

In [50]:
# Lowercase each corpus, then split it into tokens with NLTK's word tokenizer.
# The sentence-final '.' is kept as its own token (visible in the n-gram output).
shakespeare_tokens = nltk.word_tokenize(shakespeare_corpus.lower())
foreign_tokens = nltk.word_tokenize(foreign_corpus.lower())

Creating N-grams

In [51]:
# Function to generate n-grams
def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as a list.

    Thin wrapper around ``nltk.util.ngrams``: the result of ``ngrams`` is
    materialised into a list so it can be printed and reused by later cells.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Window size (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list holding every item produced by ``ngrams(tokens, n)``.
    """
    grams = ngrams(tokens, n)
    return list(grams)

# Build the 1-, 2- and 3-gram lists for each corpus; the generator yields them
# in that order, so they unpack straight into the three names.
shakespeare_unigrams, shakespeare_bigrams, shakespeare_trigrams = (
    generate_ngrams(shakespeare_tokens, size) for size in (1, 2, 3)
)
foreign_unigrams, foreign_bigrams, foreign_trigrams = (
    generate_ngrams(foreign_tokens, size) for size in (1, 2, 3)
)

# end='\n\n\n' writes the same characters as a print(...) followed by
# print('\n'); the last list is printed without the extra spacing.
print("Shakespeare Unigrams:", shakespeare_unigrams, end='\n\n\n')
print("Shakespeare Bigrams:", shakespeare_bigrams, end='\n\n\n')
print("Shakespeare Trigrams:", shakespeare_trigrams, end='\n\n\n')
print("Foreign Unigrams:", foreign_unigrams, end='\n\n\n')
print("Foreign Bigrams:", foreign_bigrams, end='\n\n\n')
print("Foreign Trigrams:", foreign_trigrams)


Shakespeare Unigrams: [('oft',), ('hath',), ('he',), ('seen',), ('that',), ('wherefore',), ('he',), ('was',), ('fain',), ('.',)]


Shakespeare Bigrams: [('oft', 'hath'), ('hath', 'he'), ('he', 'seen'), ('seen', 'that'), ('that', 'wherefore'), ('wherefore', 'he'), ('he', 'was'), ('was', 'fain'), ('fain', '.')]


Shakespeare Trigrams: [('oft', 'hath', 'he'), ('hath', 'he', 'seen'), ('he', 'seen', 'that'), ('seen', 'that', 'wherefore'), ('that', 'wherefore', 'he'), ('wherefore', 'he', 'was'), ('he', 'was', 'fain'), ('was', 'fain', '.')]


Foreign Unigrams: [('le',), ('vent',), ('souffle',), ('fort',), ('sur',), ('la',), ('mer',), ('.',)]


Foreign Bigrams: [('le', 'vent'), ('vent', 'souffle'), ('souffle', 'fort'), ('fort', 'sur'), ('sur', 'la'), ('la', 'mer'), ('mer', '.')]


Foreign Trigrams: [('le', 'vent', 'souffle'), ('vent', 'souffle', 'fort'), ('souffle', 'fort', 'sur'), ('fort', 'sur', 'la'), ('sur', 'la', 'mer'), ('la', 'mer', '.')]


Frequency Distribution

In [52]:
# Count how often each bigram occurs in each corpus.
shakespeare_bigram_freq = FreqDist(shakespeare_bigrams)
foreign_bigram_freq = FreqDist(foreign_bigrams)
# Printing a FreqDist shows only a summary line (number of samples and
# outcomes), not the individual counts.
print('Shakespeare'  ,shakespeare_bigram_freq)
print('Foreign Language  ',foreign_bigram_freq)

Shakespeare <FreqDist with 9 samples and 9 outcomes>
Foreign Language   <FreqDist with 7 samples and 7 outcomes>


 Calculating Probabilities using MLE

In [53]:
# Wrap the bigram counts in maximum-likelihood probability distributions.
# NOTE(review): these appear to be probabilities of whole bigrams (relative
# frequency of the pair), not P(next word | previous word) — confirm that this
# is the intended quantity.
shakespeare_bigram_prob = MLEProbDist(shakespeare_bigram_freq)
foreign_bigram_prob = MLEProbDist(foreign_bigram_freq)
# Printing an MLEProbDist shows only a one-line summary of the distribution.
print('Shakespeare  ',shakespeare_bigram_prob)
print('Foreign Language  ',foreign_bigram_prob)

Shakespeare   <MLEProbDist based on 9 samples>
Foreign Language   <MLEProbDist based on 7 samples>


Find the most likely next word

In [54]:
def predict_next_word(word, bigram_freq, bigram_prob):
    """Pick the most probable word to follow ``word``.

    Every bigram in ``bigram_freq`` whose first element equals ``word`` is
    scored with ``bigram_prob.prob``; the second element of the best-scoring
    bigram is returned.

    Args:
        word: The preceding word to look up.
        bigram_freq: Iterable of bigrams (indexable pairs).
        bigram_prob: Object exposing ``prob(bigram)`` that returns a score.

    Returns:
        The second element of the highest-scoring matching bigram, or ``None``
        when no bigram starts with ``word``. On equal scores the bigram
        encountered first while iterating ``bigram_freq`` wins.
    """
    winner = None
    winner_score = None
    for bigram in bigram_freq:
        if bigram[0] != word:
            continue
        score = bigram_prob.prob(bigram)
        # Strict '>' keeps the earliest bigram when scores tie.
        if winner is None or score > winner_score:
            winner, winner_score = bigram, score
    if winner is None:
        return None
    return winner[1]


# Ask each bigram model for the continuation of one seed word.
# end='\n\n\n' writes the same characters as a following print('\n').
print(
    "Most likely next word after 'he' (Shakespeare):",
    predict_next_word('he', shakespeare_bigram_freq, shakespeare_bigram_prob),
    end='\n\n\n',
)
print(
    "Most likely next word after 'le' (Foreign):",
    predict_next_word('le', foreign_bigram_freq, foreign_bigram_prob),
)

Most likely next word after 'he' (Shakespeare): seen


Most likely next word after 'le' (Foreign): vent


# Using the spaCy Library

In [55]:
# Download the small English and French spaCy pipelines that the next code
# cell loads with spacy.load(...).
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation succ

In [56]:
import spacy
from collections import Counter
from nltk.util import ngrams
from nltk.probability import FreqDist, MLEProbDist

# Load one spaCy pipeline per language; they are used below for tokenization.
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")



In [57]:
# Same two sentences as in the NLTK section.
shakespeare_corpus = "Oft hath he seen that wherefore he was fain."
foreign_corpus = "Le vent souffle fort sur la mer."


Tokenization

In [58]:
# Run each corpus through its language's spaCy pipeline and keep the
# lowercased text of every token (the final '.' is a token of its own, as the
# n-gram output shows).
shakespeare_tokens = [token.text.lower() for token in nlp_en(shakespeare_corpus)]
foreign_tokens = [token.text.lower() for token in nlp_fr(foreign_corpus)]


Creating N-grams

In [59]:
def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as a list.

    Thin wrapper around ``nltk.util.ngrams``: the result of ``ngrams`` is
    materialised into a list so it can be printed and reused by later cells.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Window size (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list holding every item produced by ``ngrams(tokens, n)``.
    """
    grams = ngrams(tokens, n)
    return list(grams)

# Build the 1-, 2- and 3-gram lists for each corpus; the generator yields them
# in that order, so they unpack straight into the three names.
shakespeare_unigrams, shakespeare_bigrams, shakespeare_trigrams = (
    generate_ngrams(shakespeare_tokens, size) for size in (1, 2, 3)
)
foreign_unigrams, foreign_bigrams, foreign_trigrams = (
    generate_ngrams(foreign_tokens, size) for size in (1, 2, 3)
)

# end='\n\n\n' writes the same characters as a print(...) followed by
# print('\n'); in this cell every list, including the last, gets that spacing.
print("Shakespeare Unigrams:", shakespeare_unigrams, end='\n\n\n')
print("Shakespeare Bigrams:", shakespeare_bigrams, end='\n\n\n')
print("Shakespeare Trigrams:", shakespeare_trigrams, end='\n\n\n')
print("Foreign Unigrams:", foreign_unigrams, end='\n\n\n')
print("Foreign Bigrams:", foreign_bigrams, end='\n\n\n')
print("Foreign Trigrams:", foreign_trigrams, end='\n\n\n')

Shakespeare Unigrams: [('oft',), ('hath',), ('he',), ('seen',), ('that',), ('wherefore',), ('he',), ('was',), ('fain',), ('.',)]


Shakespeare Bigrams: [('oft', 'hath'), ('hath', 'he'), ('he', 'seen'), ('seen', 'that'), ('that', 'wherefore'), ('wherefore', 'he'), ('he', 'was'), ('was', 'fain'), ('fain', '.')]


Shakespeare Trigrams: [('oft', 'hath', 'he'), ('hath', 'he', 'seen'), ('he', 'seen', 'that'), ('seen', 'that', 'wherefore'), ('that', 'wherefore', 'he'), ('wherefore', 'he', 'was'), ('he', 'was', 'fain'), ('was', 'fain', '.')]


Foreign Unigrams: [('le',), ('vent',), ('souffle',), ('fort',), ('sur',), ('la',), ('mer',), ('.',)]


Foreign Bigrams: [('le', 'vent'), ('vent', 'souffle'), ('souffle', 'fort'), ('fort', 'sur'), ('sur', 'la'), ('la', 'mer'), ('mer', '.')]


Foreign Trigrams: [('le', 'vent', 'souffle'), ('vent', 'souffle', 'fort'), ('souffle', 'fort', 'sur'), ('fort', 'sur', 'la'), ('sur', 'la', 'mer'), ('la', 'mer', '.')]




Frequency Distribution

In [60]:
# Count how often each bigram occurs in each corpus.
shakespeare_bigram_freq = FreqDist(shakespeare_bigrams)
foreign_bigram_freq = FreqDist(foreign_bigrams)
# Printing a FreqDist shows only a summary line (number of samples and
# outcomes), not the individual counts.
print('Shakespeare  ',shakespeare_bigram_freq)
print('Foreign Language  ',foreign_bigram_freq)

Shakespeare   <FreqDist with 9 samples and 9 outcomes>
Foreign Language   <FreqDist with 7 samples and 7 outcomes>


Calculating Probabilities using MLE

In [61]:
# Wrap the bigram counts in maximum-likelihood probability distributions.
# NOTE(review): these appear to be probabilities of whole bigrams (relative
# frequency of the pair), not P(next word | previous word) — confirm that this
# is the intended quantity.
shakespeare_bigram_prob = MLEProbDist(shakespeare_bigram_freq)
foreign_bigram_prob = MLEProbDist(foreign_bigram_freq)
# Printing an MLEProbDist shows only a one-line summary of the distribution.
print('Shakespeare  ',shakespeare_bigram_prob)
print('Foreign Language  ',foreign_bigram_prob)

Shakespeare   <MLEProbDist based on 9 samples>
Foreign Language   <MLEProbDist based on 7 samples>


Find the most likely next word

In [62]:
def predict_next_word(word, bigram_freq, bigram_prob):
    """Pick the most probable word to follow ``word``.

    Every bigram in ``bigram_freq`` whose first element equals ``word`` is
    scored with ``bigram_prob.prob``; the second element of the best-scoring
    bigram is returned.

    Args:
        word: The preceding word to look up.
        bigram_freq: Iterable of bigrams (indexable pairs).
        bigram_prob: Object exposing ``prob(bigram)`` that returns a score.

    Returns:
        The second element of the highest-scoring matching bigram, or ``None``
        when no bigram starts with ``word``. On equal scores the bigram
        encountered first while iterating ``bigram_freq`` wins.
    """
    winner = None
    winner_score = None
    for bigram in bigram_freq:
        if bigram[0] != word:
            continue
        score = bigram_prob.prob(bigram)
        # Strict '>' keeps the earliest bigram when scores tie.
        if winner is None or score > winner_score:
            winner, winner_score = bigram, score
    if winner is None:
        return None
    return winner[1]

# Ask each bigram model for the continuation of one seed word.
# end='\n\n\n' writes the same characters as a following print('\n').
print(
    "Most likely next word after 'he' (Shakespeare):",
    predict_next_word('he', shakespeare_bigram_freq, shakespeare_bigram_prob),
    end='\n\n\n',
)
print(
    "Most likely next word after 'le' (Foreign):",
    predict_next_word('le', foreign_bigram_freq, foreign_bigram_prob),
)

Most likely next word after 'he' (Shakespeare): seen


Most likely next word after 'le' (Foreign): vent


# Using the TextBlob Library

In [63]:
from textblob import TextBlob
from collections import Counter
from nltk.util import ngrams
from nltk.probability import FreqDist, MLEProbDist

In [64]:
# Same two sentences as in the NLTK and spaCy sections.
shakespeare_corpus = "Oft hath he seen that wherefore he was fain."
foreign_corpus = "Le vent souffle fort sur la mer."

Tokenization

In [65]:
# Lowercase each corpus and take TextBlob's word list as the tokens.
# Unlike the NLTK and spaCy tokenizations, no '.' token appears in the n-gram
# output for this section, so each corpus yields one fewer n-gram of each size.
shakespeare_tokens = TextBlob(shakespeare_corpus.lower()).words
foreign_tokens = TextBlob(foreign_corpus.lower()).words

Creating N-grams

In [66]:
def generate_ngrams(tokens, n):
    """Return the n-grams of ``tokens`` as a list.

    Thin wrapper around ``nltk.util.ngrams``: the result of ``ngrams`` is
    materialised into a list so it can be printed and reused by later cells.

    Args:
        tokens: Sequence of tokens to slide the window over.
        n: Window size (1 for unigrams, 2 for bigrams, ...).

    Returns:
        A list holding every item produced by ``ngrams(tokens, n)``.
    """
    grams = ngrams(tokens, n)
    return list(grams)

# Build the 1-, 2- and 3-gram lists for each corpus; the generator yields them
# in that order, so they unpack straight into the three names.
shakespeare_unigrams, shakespeare_bigrams, shakespeare_trigrams = (
    generate_ngrams(shakespeare_tokens, size) for size in (1, 2, 3)
)
foreign_unigrams, foreign_bigrams, foreign_trigrams = (
    generate_ngrams(foreign_tokens, size) for size in (1, 2, 3)
)

# end='\n\n\n' writes the same characters as a print(...) followed by
# print('\n'); the last list is printed without the extra spacing.
print("Shakespeare Unigrams:", shakespeare_unigrams, end='\n\n\n')
print("Shakespeare Bigrams:", shakespeare_bigrams, end='\n\n\n')
print("Shakespeare Trigrams:", shakespeare_trigrams, end='\n\n\n')
print("Foreign Unigrams:", foreign_unigrams, end='\n\n\n')
print("Foreign Bigrams:", foreign_bigrams, end='\n\n\n')
print("Foreign Trigrams:", foreign_trigrams)



Shakespeare Unigrams: [('oft',), ('hath',), ('he',), ('seen',), ('that',), ('wherefore',), ('he',), ('was',), ('fain',)]


Shakespeare Bigrams: [('oft', 'hath'), ('hath', 'he'), ('he', 'seen'), ('seen', 'that'), ('that', 'wherefore'), ('wherefore', 'he'), ('he', 'was'), ('was', 'fain')]


Shakespeare Trigrams: [('oft', 'hath', 'he'), ('hath', 'he', 'seen'), ('he', 'seen', 'that'), ('seen', 'that', 'wherefore'), ('that', 'wherefore', 'he'), ('wherefore', 'he', 'was'), ('he', 'was', 'fain')]


Foreign Unigrams: [('le',), ('vent',), ('souffle',), ('fort',), ('sur',), ('la',), ('mer',)]


Foreign Bigrams: [('le', 'vent'), ('vent', 'souffle'), ('souffle', 'fort'), ('fort', 'sur'), ('sur', 'la'), ('la', 'mer')]


Foreign Trigrams: [('le', 'vent', 'souffle'), ('vent', 'souffle', 'fort'), ('souffle', 'fort', 'sur'), ('fort', 'sur', 'la'), ('sur', 'la', 'mer')]


Frequency Distribution

In [67]:
# Count how often each bigram occurs in each corpus.
shakespeare_bigram_freq = FreqDist(shakespeare_bigrams)
foreign_bigram_freq = FreqDist(foreign_bigrams)
# Printing a FreqDist shows only a summary line (number of samples and
# outcomes), not the individual counts.
print('Shakespeare  ',shakespeare_bigram_freq)
print('Foreign Language  ',foreign_bigram_freq)

Shakespeare   <FreqDist with 8 samples and 8 outcomes>
Foreign Language   <FreqDist with 6 samples and 6 outcomes>


Calculating Probabilities using MLE

In [68]:
# Wrap the bigram counts in maximum-likelihood probability distributions.
# NOTE(review): these appear to be probabilities of whole bigrams (relative
# frequency of the pair), not P(next word | previous word) — confirm that this
# is the intended quantity.
shakespeare_bigram_prob = MLEProbDist(shakespeare_bigram_freq)
foreign_bigram_prob = MLEProbDist(foreign_bigram_freq)
# Printing an MLEProbDist shows only a one-line summary of the distribution.
print('Shakespeare  ',shakespeare_bigram_prob)
print('Foreign Language  ',foreign_bigram_prob)

Shakespeare   <MLEProbDist based on 8 samples>
Foreign Language   <MLEProbDist based on 6 samples>


Find the most likely next word

In [69]:
def predict_next_word(word, bigram_freq, bigram_prob):
    """Pick the most probable word to follow ``word``.

    Every bigram in ``bigram_freq`` whose first element equals ``word`` is
    scored with ``bigram_prob.prob``; the second element of the best-scoring
    bigram is returned.

    Args:
        word: The preceding word to look up.
        bigram_freq: Iterable of bigrams (indexable pairs).
        bigram_prob: Object exposing ``prob(bigram)`` that returns a score.

    Returns:
        The second element of the highest-scoring matching bigram, or ``None``
        when no bigram starts with ``word``. On equal scores the bigram
        encountered first while iterating ``bigram_freq`` wins.
    """
    winner = None
    winner_score = None
    for bigram in bigram_freq:
        if bigram[0] != word:
            continue
        score = bigram_prob.prob(bigram)
        # Strict '>' keeps the earliest bigram when scores tie.
        if winner is None or score > winner_score:
            winner, winner_score = bigram, score
    if winner is None:
        return None
    return winner[1]
# Ask each bigram model for the continuation of one seed word.
# end='\n\n\n' writes the same characters as a following print('\n').
print(
    "Most likely next word after 'he' (Shakespeare):",
    predict_next_word('he', shakespeare_bigram_freq, shakespeare_bigram_prob),
    end='\n\n\n',
)
print(
    "Most likely next word after 'le' (Foreign):",
    predict_next_word('le', foreign_bigram_freq, foreign_bigram_prob),
)

Most likely next word after 'he' (Shakespeare): seen


Most likely next word after 'le' (Foreign): vent
