# Training a bigram model with a Greek corpus
- using spaCy within a Jupyter notebook
- we will score 2 test Greek sentences against the model, computing each sentence's probability of occurring given the vocabulary, along with its perplexity
- we will finally use the trained model to generate sentences in Greek

In [28]:
import spacy
from spacy.lang.el import Greek
import re
import math
import random
# import files
from typing import Tuple
from google.colab import files

# Training corpus. It did not paste cleanly from the HW instructions, so every line after the
# first starts with an extra space; that space keeps neighbouring sentences separated once the
# newlines are stripped during preprocessing, which aids tokenization.
ori_texts = '''Τρώω φρούτα κάθε μέρα.
 Τρώω ψωμί με βούτυρο.
 Τρώω ένα μήλο το πρωί.
 Τρώω σαλάτα με ντομάτα.
 Τρώω ζυμαρικά για μεσημεριανό.
 Τρώω κοτόπουλο για δείπνο.
 Τρώω αυγά το Σαββατοκύριακο.
 Τρώω ρύζι με λαχανικά.
 Τρώω γιαούρτι με μέλι.
 Τρώω τυρί και ελιές.
 Αγαπώ τα φρούτα, ειδικά τα μήλα.
 Έχεις ψωμί;
 Κάθε πρωί τρώω ένα μήλο.
 Θέλω να φάω σαλάτα με ντομάτες.
 Η σούπα έχει κοτόπουλο και λαχανικά.
 Πόσο κοστίζει το ψωμί;
 Πίνω γάλα με το μήλο μου.
 Μετά το δείπνο, τρώμε συχνά φρούτα.
 Στη σαλάτα, βάζω πάντα ντομάτες.
 Ποιος έφαγε το ψωμί;
 Το κοτόπουλο είναι το αγαπημένο μου φαγητό.
 Μπορώ να έχω λίγο περισσότερο ψωμί, παρακαλώ;
 Αυτή η μηλόπιτα είναι πολύ νόστιμη.
 Πού αγόρασες αυτά τα φρέσκα φρούτα;
 Φτιάχνω σαλάτα με ντομάτες και αγγούρια.
 Θα πιείτε γάλα ή χυμό;
 Τα παιδιά δεν τρώνε πολύ ψωμί.
 Αυτές οι ντομάτες είναι πολύ ώριμες.
 Στο πρωινό μου πάντα έχω ένα μήλο.
 Τα φρούτα είναι καλά για την υγεία.'''
# Echo the raw corpus so the pasted text can be checked by eye.
print(ori_texts)

# Load spaCy's pretrained Greek pipeline; it is used by tokenize() below.
# If the model is not installed, run this first:
# !pip install https://github.com/explosion/spacy-models/releases/download/el_core_news_sm-3.5.0/el_core_news_sm-3.5.0.tar.gz
nlp = spacy.load("el_core_news_sm")

# Lowercase the corpus and strip the newlines. Punctuation marks are kept on purpose:
# the sentence generation in Q4 relies on "." to know when a sentence ends.
preprocessed_texts = ori_texts.lower().replace('\n', '')

## HELPER METHODS
def tokenize(corpus: str) -> list:
  """Split `corpus` into token strings with the module-level spaCy pipeline `nlp`.

  Returns a list holding the `.text` of every token of `nlp(corpus)`, in
  document order.
  """
  return [tok.text for tok in nlp(corpus)]

def get_unigram_count(tokens: list) -> dict:
  """Count how many times each token occurs in `tokens`.

  Returns a dict mapping token -> occurrence count; keys appear in the order
  in which each token is first seen.
  """
  counts = {}
  for tok in tokens:
    if tok in counts:
      counts[tok] += 1
    else:
      counts[tok] = 1
  return counts

def get_bigram_count(tokens: list) -> dict:
  """Count every pair of adjacent tokens in `tokens`.

  Returns a dict mapping (token, following_token) tuples to the number of
  times that pair occurs. Fewer than two tokens yields an empty dict.
  """
  counts = {}
  # Pair each token with its successor; zip stops at the shorter sequence,
  # so the last token never starts a bigram.
  for left, right in zip(tokens, tokens[1:]):
    pair = (left, right)
    counts[pair] = counts.setdefault(pair, 0) + 1
  return counts

# DONT NEED THIS METHOD?
# def add_all_possible_bigrams(vocabulary, bigram_count_map: dict) -> dict:
#   '''your code here'''
#   return bigram_count_map

def get_probability_smoothing(count_map: dict, v: int, N: int) -> dict:
  """Turn raw counts into add-one smoothed probabilities.

  Every entry becomes (count + 1) / (N + v), with the same denominator for
  all keys.

  NOTE(review): the denominator is not conditioned on the first word of a
  bigram key, so for bigram count maps this is not count(w1) + v -- confirm
  that this is the intended estimate.
  """
  denominator = N + v
  smoothed = {}
  for key, count in count_map.items():
    smoothed[key] = (count + 1) / denominator
  return smoothed

def estimate_probability_preplexity_product(sentence, unigram_count_map, bigram_count_map: dict, V: int) -> Tuple[float, float]:
  """Score `sentence` against a bigram probability table.

  Args:
    sentence: raw text; it is tokenized with `tokenize`.
    unigram_count_map: accepted to keep the signature stable; not used here.
    bigram_count_map: maps (previous_token, next_token) tuples to
      probabilities. Bigrams missing from the map fall back to 1 / V.
    V: vocabulary size used for the fallback probability.

  Returns:
    (probability, perplexity). probability is the product of the bigram
    probabilities; perplexity is probability ** (-1 / number_of_tokens).

  Raises:
    ValueError: if the sentence produces no tokens.
  """
  tokens = tokenize(sentence)
  if not tokens:
    # Perplexity is normalised by the token count, so there is nothing to score.
    raise ValueError("cannot score a sentence that contains no tokens")

  # Accumulate in log space so long sentences do not underflow mid-product.
  log_prob_sum = 0.0
  for bigram in zip(tokens, tokens[1:]):
    prob = bigram_count_map.get(bigram, 1 / V)
    log_prob_sum += math.log(prob)

  # The JSON answer needs the plain probability, not the log sum. For very
  # long sentences this may underflow to 0.0, which is acceptable here.
  actual_prob = math.exp(log_prob_sum)

  # Derive perplexity from the log sum rather than from actual_prob: raising
  # an underflowed 0.0 to a negative power raises ZeroDivisionError.
  perplexity = math.exp(-log_prob_sum / len(tokens))

  return actual_prob, perplexity

# The autograder template below never calls this method, so it is commented out; the log-sum probability and the perplexity are both computed inside estimate_probability_preplexity_product instead.
# def estimate_probability_perplexity_log_sum(sentence, unigram_count_map, bigram_count_map: dict, V: int) -> Tuple[float, float]:
#   '''your code here'''
#   return Tuple[float, float]

def sample_sentence(unigram_count_map, bigram_count_map: dict, V: int, N: int) -> str:
    """Generate one sentence by random sampling from the unigram and bigram counts.

    The first word is drawn with weight count / N from `unigram_count_map`.
    Each following word is drawn from the bigrams that start with the current
    word, weighted by their share of those bigrams' total count. Generation
    stops when "." is drawn or when the current word starts no bigram.

    `V` is accepted but not used. The words are joined with single spaces and
    the space in front of a "." is removed.
    """
    vocabulary = list(unigram_count_map.keys())
    start_weights = [count / N for count in unigram_count_map.values()]
    current = random.choices(vocabulary, start_weights)[0]
    words = [current]

    while current != ".":
        # Collect every word seen right after `current`, with its bigram count.
        followers = {}
        for pair, count in bigram_count_map.items():
            if pair[0] == current:
                followers[pair[1]] = count

        # A word that never starts a bigram cannot be continued.
        if not followers:
            break

        # Normalise the counts into conditional probabilities and draw the next word.
        total = sum(followers.values())
        candidates = list(followers.keys())
        weights = [count / total for count in followers.values()]
        current = random.choices(candidates, weights)[0]
        words.append(current)

    return " ".join(words).replace(" .", ".")

Τρώω φρούτα κάθε μέρα.
 Τρώω ψωμί με βούτυρο.
 Τρώω ένα μήλο το πρωί.
 Τρώω σαλάτα με ντομάτα.
 Τρώω ζυμαρικά για μεσημεριανό.
 Τρώω κοτόπουλο για δείπνο.
 Τρώω αυγά το Σαββατοκύριακο.
 Τρώω ρύζι με λαχανικά.
 Τρώω γιαούρτι με μέλι.
 Τρώω τυρί και ελιές.
 Αγαπώ τα φρούτα, ειδικά τα μήλα.
 Έχεις ψωμί;
 Κάθε πρωί τρώω ένα μήλο.
 Θέλω να φάω σαλάτα με ντομάτες.
 Η σούπα έχει κοτόπουλο και λαχανικά.
 Πόσο κοστίζει το ψωμί;
 Πίνω γάλα με το μήλο μου.
 Μετά το δείπνο, τρώμε συχνά φρούτα.
 Στη σαλάτα, βάζω πάντα ντομάτες.
 Ποιος έφαγε το ψωμί;
 Το κοτόπουλο είναι το αγαπημένο μου φαγητό.
 Μπορώ να έχω λίγο περισσότερο ψωμί, παρακαλώ;
 Αυτή η μηλόπιτα είναι πολύ νόστιμη.
 Πού αγόρασες αυτά τα φρέσκα φρούτα;
 Φτιάχνω σαλάτα με ντομάτες και αγγούρια.
 Θα πιείτε γάλα ή χυμό;
 Τα παιδιά δεν τρώνε πολύ ψωμί.
 Αυτές οι ντομάτες είναι πολύ ώριμες.
 Στο πρωινό μου πάντα έχω ένα μήλο.
 Τα φρούτα είναι καλά για την υγεία.


In [31]:
# Get your homework answers
# Q1: token list of the lowercased corpus.
Q1_answer = tokenize(preprocessed_texts)
print(Q1_answer)
# Q2: raw unigram and bigram frequency tables built from the Q1 tokens.
Q2_uni_gram_count = get_unigram_count(Q1_answer)
print(Q2_uni_gram_count)
Q2_bi_gram_count = get_bigram_count(Q1_answer)
print(Q2_bi_gram_count)
V = len(Q2_uni_gram_count) # vocabulary size: number of distinct tokens in the corpus
N = sum(Q2_uni_gram_count.values()) # total number of tokens in the corpus (duplicates included)
print("V: ", V)
print("N: ", N)
# Add-one smoothed probability tables.
smooth_count_uni = get_probability_smoothing(Q2_uni_gram_count, V, N)
print("smooth_count_uni: ", smooth_count_uni)

# NOTE(review): the bigram table is smoothed with the unigram total N, so every bigram is
# divided by N + V rather than by the count of its first word -- confirm this is intended.
smooth_count_bi = get_probability_smoothing(Q2_bi_gram_count, V, N)
# NOTE(review): this prints the raw bigram counts a second time; smooth_count_bi itself is
# never printed -- confirm whether that was intended.
print("Q2_bi_gram_count: ", Q2_bi_gram_count)

# Q3: two test sentences to score against the smoothed tables.
sentence_1 = '''Τρώω τυρί και ελιές τα σαββατοκύριακα.'''
print("sentece 1: ", sentence_1)
sentence_2 = '''Στην Ελλάδα, οι άνθρωποι απολαμβάνουν μια πλούσια ποικιλία τροφίμων που περιλαμβάνει φρέσκα θαλασσινά, λαχταριστά παραδοσιακά πιάτα όπως μουσακά και σουβλάκι, αρωματικά μπαχαρικά και βότανα, καθώς και μια εκπληκτική ποικιλία τυριών και ελιών, απολαμβάνοντας το φαγητό τους με καλό κρασί ή ούζο.'''
print("sentece 2: ", sentence_2)


# Each result is a (probability, perplexity) tuple.
Q3_sentence1_golden = estimate_probability_preplexity_product(sentence_1, smooth_count_uni, smooth_count_bi, V)
print("Q3_sentence1_golden: ", Q3_sentence1_golden)

Q3_sentence2_golden = estimate_probability_preplexity_product(sentence_2, smooth_count_uni, smooth_count_bi, V)
print("Q3_sentence2_golden: ", Q3_sentence2_golden)

# Q4: generate a sentence from the raw (unsmoothed) counts.
Q4_sentence = sample_sentence(Q2_uni_gram_count, Q2_bi_gram_count, V, N)
print("q4 sentence is: ", Q4_sentence)
# Source of sample_sentence, submitted as plain text with the answers.
# The string is data: it is not executed here.
Q4_code = '''
def sample_sentence(unigram_count_map, bigram_count_map: dict, V: int, N: int) -> str:
    # Sample first word based on unigram probability
    trained_words = list(unigram_count_map.keys())
    probabilities = [count / N for count in unigram_count_map.values()]
    first_word = random.choices(trained_words, probabilities)[0] # ensures the random sampling is used to get the first word
    generated_sentence = [first_word]

    while first_word != ".":
        # Get possible next words based on bigram probability

        # Find all bigrams where first_word is the first word. Gets a dictionary of all the possible succeeding words where
        # the first word appears first in a bigram from the training corpus + how many times that bigram appears in the corpus overall
        next_words = {bigram[1]: count for bigram, count in bigram_count_map.items() if bigram[0] == first_word}

        if not next_words:  # no bigram transition is found, so we will end the loop because this word has no possible next_word
            break

        # get all normal bigram probabilities of the next_words
        total_count = sum(next_words.values())
        next_probs = [count / total_count for count in next_words.values()]

        # sample next word, having the probabilities calculated above randomly selected, then append to final string
        first_word = random.choices(list(next_words.keys()), next_probs)[0]
        generated_sentence.append(first_word)

    return " ".join(generated_sentence).replace(" .", ".")
''' # just copy your sample_sentence code here as a string
print(Q4_code)


['τρώω', 'φρούτα', 'κάθε', 'μέρα', '.', 'τρώω', 'ψωμί', 'με', 'βούτυρο', '.', 'τρώω', 'ένα', 'μήλο', 'το', 'πρωί', '.', 'τρώω', 'σαλάτα', 'με', 'ντομάτα', '.', 'τρώω', 'ζυμαρικά', 'για', 'μεσημεριανό', '.', 'τρώω', 'κοτόπουλο', 'για', 'δείπνο', '.', 'τρώω', 'αυγά', 'το', 'σαββατοκύριακο', '.', 'τρώω', 'ρύζι', 'με', 'λαχανικά', '.', 'τρώω', 'γιαούρτι', 'με', 'μέλι', '.', 'τρώω', 'τυρί', 'και', 'ελιές', '.', 'αγαπώ', 'τα', 'φρούτα', ',', 'ειδικά', 'τα', 'μήλα', '.', 'έχεις', 'ψωμί', ';', 'κάθε', 'πρωί', 'τρώω', 'ένα', 'μήλο', '.', 'θέλω', 'να', 'φάω', 'σαλάτα', 'με', 'ντομάτες', '.', 'η', 'σούπα', 'έχει', 'κοτόπουλο', 'και', 'λαχανικά', '.', 'πόσο', 'κοστίζει', 'το', 'ψωμί', ';', 'πίνω', 'γάλα', 'με', 'το', 'μήλο', 'μου', '.', 'μετά', 'το', 'δείπνο', ',', 'τρώμε', 'συχνά', 'φρούτα', '.', 'στη', 'σαλάτα', ',', 'βάζω', 'πάντα', 'ντομάτες', '.', 'ποιος', 'έφαγε', 'το', 'ψωμί', ';', 'το', 'κοτόπουλο', 'είναι', 'το', 'αγαπημένο', 'μου', 'φαγητό', '.', 'μπορώ', 'να', 'έχω', 'λίγο', 'περισσότερ

In [32]:
# The submission expects string keys, so each (previous, next) tuple key is
# flattened to "previous|next".
Q2_bi_gram_count_str_key = {
    "|".join((pair[0], pair[1])): count
    for pair, count in Q2_bi_gram_count.items()
}

# Everything that gets written to the answer file.
result_dict = {
    "UNI": "hbh180000",  # University ID (e.g., "kxa190001"); must be a string
    "First_name": "Haniyyah",  # First name; must be a string
    "Last_name": "Hamid",  # Last name; must be a string
    "Q1": Q1_answer,  # List of tokens
    # Q2_uni is a dict:
    #   - Key: token as a string
    #   - Value: frequency count (int)
    "Q2_uni": Q2_uni_gram_count,
    # Q2_bi is a dict:
    #   - Key: "previous_word|next_word" -- the two tokens separated by "|";
    #     the key must be a string
    #   - Value: co-occurrence count (int)
    "Q2_bi": Q2_bi_gram_count_str_key,
    "Q3_sentence1": Q3_sentence1_golden,  # (probability, perplexity) pair
    "Q3_sentence2": Q3_sentence2_golden,  # (probability, perplexity) pair
    "Q4_sentence": Q4_sentence,  # Generated sentence; a string
    "Q4_code": Q4_code,  # Code used for Q4 to generate the sentence; a string
}


In [33]:
# Check your dict
def validate_result_dict(result_dict):
  """
  Validate the structure and types of the result_dict.

  Checks run in this order and stop at the first failure:
  1. every required key is present;
  2. every value has the expected top-level type;
  3. Q1 contains only strings;
  4. Q2_uni maps strings to ints;
  5. Q2_bi maps strings containing "|" to ints;
  6. Q3_sentence1 and Q3_sentence2 are each a list/tuple of two numbers.

  Returns:
  Tuple[bool, str]: (True, "Validation Passed") if all checks pass,
  (False, "Error message") otherwise.
  """
  expected_types = {
    "UNI": str,
    "First_name": str,
    "Last_name": str,
    "Q1": list, # Should be a list of tokens (strings)
    "Q2_uni": dict, # Should be a dictionary {str: int}
    "Q2_bi": dict, # Should be a dictionary {str: int}, keys are formatted as "word1|word2"
    "Q3_sentence1": (list, tuple), # Should be a list or tuple [float, float] (Probability, Perplexity)
    "Q3_sentence2": (list, tuple), # Should be a list or tuple [float, float] (Probability, Perplexity)
    "Q4_sentence": str, # Should be a string
    "Q4_code": str # Should be a string
  }

  # Check if all required keys exist
  missing_keys = [key for key in expected_types if key not in result_dict]
  if missing_keys:
    return False, f"Missing keys in result_dict: {missing_keys}"

  # Check types of each key
  for key, expected_type in expected_types.items():
    if not isinstance(result_dict[key], expected_type):
      return False, f"Key '{key}' has incorrect type. Expected {expected_type}, got {type(result_dict[key])}."

  # Check if Q1 is a list of strings
  if not all(isinstance(item, str) for item in result_dict["Q1"]):
    return False, "Q1 should be a list of strings (tokens)."
  # Check if Q2_uni is a dictionary with {str: int}
  if not all(isinstance(k, str) and isinstance(v, int) for k, v in result_dict["Q2_uni"].items()):
    return False, "Q2_uni should be a dictionary with string keys and integer values."
  # Check if Q2_bi is a dictionary with {str: int} and keys contain "|"
  if not all(isinstance(k, str) and "|" in k and isinstance(v, int) for k, v in result_dict["Q2_bi"].items()):
    return False, "Q2_bi should be a dictionary with keys as 'word1|word2' strings and integer values."
  # Check if Q3_sentence1 and Q3_sentence2 are lists or tuples of two floats
  for key in ["Q3_sentence1", "Q3_sentence2"]:
    if not (isinstance(result_dict[key], (list, tuple)) and len(result_dict[key]) == 2 and all(isinstance(x, (int, float)) for x in result_dict[key])):
      return False, f"{key} should be a list or tuple of two numeric values (Probability, Perplexity)."
  # Report success only after the loop, so that both Q3 entries are checked.
  return True, "Validation Passed"
  result = validate_result_dict(result_dict)
  if result[0]:
    print(" Validation Passed!")
  else:
    print(f" Validation Failed: {result[1]}")



In [36]:
# Save and submit your result
import json
# Convert dictionary to JSON string
json_data = json.dumps(result_dict, indent=4) # Use indent for pretty-printing
# Save JSON to a file.
# ensure_ascii=False writes the Greek text as-is instead of \uXXXX escapes, so the
# file encoding is pinned to UTF-8; relying on the platform default can raise
# UnicodeEncodeError where that default is not UTF-8.
with open("assignment1_answer.json", "w", encoding="utf-8") as json_file: # Don't change the name here

  json.dump(result_dict, json_file, indent=4, ensure_ascii=False) # Use indent for pretty-printing

# Download the saved answer file (files comes from google.colab).
files.download("assignment1_answer.json")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>