# Set up

In [2]:
!git clone https://github.com/NLP-Reichman/assignment_1.git
!mv assignment_1/data data
!rm assignment_1/ -r

fatal: destination path 'assignment_1' already exists and is not an empty directory.


'mv' is not recognized as an internal or external command,
operable program or batch file.
'rm' is not recognized as an internal or external command,
operable program or batch file.


# Introduction
In this assignment you will be creating tools for learning and testing language models. The corpora that you will be working with are lists of tweets in 8 different languages that use the Latin script. The data is provided either formatted as CSV or as JSON, for your convenience. The end goal is to write a set of tools that can detect the language of a given tweet.
The relevant files are under the data folder:

- en.csv (or the equivalent JSON file)
- es.csv (or the equivalent JSON file)
- fr.csv (or the equivalent JSON file)
- in.csv (or the equivalent JSON file)
- it.csv (or the equivalent JSON file)
- nl.csv (or the equivalent JSON file)
- pt.csv (or the equivalent JSON file)
- tl.csv (or the equivalent JSON file)

In [1]:
import json
# from google.colab import files
import pandas as pd
import glob
import numpy as np
import nltk
import os
import csv

# Implementation

## Part 1
Implement the function *preprocess* that iterates over all the data files and creates a single vocabulary, containing all the tokens in the data. Our token definition is a single UTF-8 encoded character. So, the vocabulary list is a simple Python list of all the characters that you see at least once in the data.

Note - do NOT lowercase the sentences in whi HW.

In [77]:
SOS = "ה"
EOS = "ס"
UNK = "ל"

def preprocess() -> list[str]:
    '''
    Return a list of characters, representing the shared vocabulary of all languages
    '''
    # Initialize an empty set to store unique tokens
    vocabulary = set()

    # Path to the data folder
    data_folder = 'assignment_1/data/'

    # Iterate over all files in the data folder
    for filename in os.listdir(data_folder):
        # Check if the file is a CSV or JSON file
        if filename.endswith('.csv'):
            with open(os.path.join(data_folder, filename), 'r', encoding='utf-8') as file:
                reader = csv.reader(file)
                # Skip the header row
                next(reader)
                # Read tweet texts and extract tokens
                for row in reader:
                    tweet_text = row[1]  # Assuming tweet text is in the second column
                    vocabulary.update(tweet_text)
    # Convert the set to a sorted list of characters
    # vocabulary.add(UNK)
    # vocabulary.add(SOS)
    # vocabulary.add(EOS)
    vocabulary_list = sorted(list(vocabulary))
    
    return vocabulary_list

# Get the shared vocabulary of all languages
vocabulary = preprocess()
print(vocabulary)
print(len(vocabulary))

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x80', '\x91', '\x92', '\x9d', '¡', '£', '¤', '¥', '§', '¨', '©', 'ª', '«', '\xad', '®', '¯', '°', '²', '´', '¶', '·', '¸', 'º', '»', '½', '¿', 'À', 'Á', 'Â', 'Ã', 'Å', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Ì', 'Í', 'Î', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', '×', 'Ù', 'Ú', 'Ü', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ė', 'Ğ', 'ğ', 'İ', 'ı', 'ń', 'ō', 'Œ', 'œ', 'Ş', 'ş', 'Š', 'Ÿ', 'ƒ', 'ʔ', 'ʕ', 'ʖ', 'ʰ', 'ʳ', 'ʷ', 'ʸ', '˖', '˘', '˚',

## Part 2
Implement the function *lm* that generates a language model from a textual corpus. The function should return a dictionary (representing a model) where the keys are all the relevant *n*-1 sequences, and the values are dictionaries with the *n*_th tokens and their corresponding probabilities to occur. For example, for a trigram model (tokens are characters), it should look something like:

{ "ab":{"c":0.5, "b":0.25, "d":0.25}, "ca":{"a":0.2, "b":0.7, "d":0.1} }

which means for example that after the sequence "ab", there is a 0.5 chance that "c" will appear, 0.25 for "b" to appear and 0.25 for "d" to appear.

Note - You should think how to add the add_one smoothing information to the dictionary and implement it.

In [46]:
def lm(lang: str, n: int, smoothed: bool = False) -> dict[str, dict[str, float]]:
    '''
    Return a language model for the given lang and n-gram (n), with an option for smoothing.
    :param lang: the language of the model
    :param n: the n_gram value
    :param smoothed: boolean indicating whether to apply smoothing
    :return: a dictionary where the keys are n-1 grams and the values are dictionaries
    '''
    # Initialize an empty dictionary to store the language model
    language_model = {}

    # Path to the data file for the specified language
    data_file = 'assignment_1/data/' + lang + '.csv'  # Assuming the data file follows the naming convention
    
    # Read the CSV file using pandas
    df = pd.read_csv(data_file)

    # Iterate over the rows of the dataframe
    for index, row in df.iterrows():
        tweet_text = SOS + row['tweet_text'] + EOS  # Assuming 'tweet_text' is the column name
        # tweet_text = row['tweet_text']  # Assuming 'tweet_text' is the column name
        # Iterate over the text to build n-gram model
        for i in range(len(tweet_text) - n + 1):
            # Extract n-gram
            ngram = tweet_text[i:i+n]
            # Extract (n-1)-gram
            context = ngram[:n-1]
            # Extract token
            token = ngram[-1]
            # Update language model
            if context not in language_model:
                language_model[context] = {}
            if token not in language_model[context]:
                language_model[context][token] = 0
            language_model[context][token] += 1

    # Apply add-one smoothing if specified
    if smoothed:
        # Iterate over language model to apply smoothing
        for context in language_model:
            # Calculate total count of tokens for the current context
            total_count = sum(language_model[context].values())
            # Calculate vocabulary size
            vocabulary_size = len(language_model[context])
            # Apply add-one smoothing
            for token in language_model[context]:
                language_model[context][token] = (language_model[context][token] + 1) / (total_count + vocabulary_size)

    return language_model


# Get the language model for English with n=2 and no smoothing
lm_en2 = lm('en', 2, True)
print(lm_en2)
print(len(lm_en2))
lm_en3 = lm('en', 3, False)
print(lm_en3)
print(len(lm_en3))
lm_fr = lm('fr', 3, False)
print(lm_fr)
print(len(lm_fr))
lm_es = lm('es', 3, False)
print(lm_es)
print(len(lm_es))

{'ה': {'R': 0.5237414816443174, 'H': 0.012640140690261596, '@': 0.14387777533523852, '#': 0.017146625632007036, 'B': 0.008463398549131678, 'w': 0.002528028138052319, 'E': 0.004726313475489119, 'T': 0.02725873818421631, 'M': 0.010881512420312156, 'G': 0.006265113211694878, 'I': 0.03616179380083535, 'N': 0.010222026819081118, 'Y': 0.008903055616619037, 'P': 0.007144427346669598, 'A': 0.014948340294570235, '.': 0.0018685425368212794, 'S': 0.019344910969443834, 'i': 0.006704770279182238, '＠': 0.0009892284018465597, 'W': 0.015497911628929434, 'r': 0.0007693998681028798, 'D': 0.008793141349747197, 'O': 0.007254341613541437, '🙌': 0.0007693998681028798, 's': 0.002088371070564959, 'K': 0.0013189712024620796, 'C': 0.012310397889646076, '"': 0.005056056276104638, 'V': 0.0019784568036931194, '“': 0.0009892284018465597, 'o': 0.0034073422730270387, 'U': 0.0024181138711804793, 'L': 0.010881512420312156, 'F': 0.007803912947900638, '1': 0.003297428006155199, 'f': 0.0010991426687183997, '2': 0.002308199

## Part 3
Implement the function *eval* that returns the perplexity of a model (dictionary) running over the data file of the given target language.

In [76]:
def eval(model: dict, target_lang: str) -> float:
    '''
    Return the perplexity value calculated over applying the model on the text file
    of the target_lang language.
    :param model: the language model
    :param target_lang: the target language
    :return: the perplexity value
    '''
    # Path to the data file for the target language
    data_file = 'assignment_1/data/' + target_lang + '.csv'  # Assuming the data file follows the naming convention

    # Read the CSV file using pandas
    df = pd.read_csv(data_file)

    # Initialize variables to store total log probability and total number of tokens
    total_log_probability = 0
    total_tokens = 0

    # Iterate over the rows of the dataframe
    for index, row in df.iterrows():
        tweet_text = row['tweet_text']  # Assuming 'tweet_text' is the column name
        # Iterate over the text to calculate log probability
        total_log_probability += calculate_perplexity(tweet_text, model)
        # total_tokens += len(tweet_text)
        # print(total_log_probability)
    # Calculate perplexity
    perplexity_value = np.exp(-total_log_probability / total_tokens)
    print(perplexity_value)
    

    return perplexity_value

def calculate_perplexity(tweet_text: str, model: dict) -> float:
    '''
    Calculate the total log probability of applying the model on the given text.
    :param tweet_text: the text to apply the model on
    :param model: the language model
    :return: the total log probability
    '''
    # Initialize total log probability
    total_log_probability = 0
    n = len(next(iter((model))))

    # Iterate over the text to calculate log probability
    for i in range(len(tweet_text) - n + 1):
        # Extract n-gram
        ngram = tweet_text[i:i+len(model)]
        # Extract context (n-1 gram)
        context = ngram[:-1]
        # Extract token
        token = ngram[-1]
        # Check if the context exists in the model
        if context in model:
            # Check if the token exists in the context
            if token in model[context]:
                # Calculate log probability and add it to total_log_probability
                total_log_probability += model[context][token]
            else:
                # If the token doesn't exist in the context, add log probability of a small value
                total_log_probability += np.log(1e-10)
        else:
            # If the context doesn't exist in the model, add log probability of a small value
            total_log_probability += np.log(1e-10)
    print("total_log_probability", total_log_probability)
    perp = np.exp(-total_log_probability)
    print(perp)
    perp = 1/perp
    perp = perp**(1/len(tweet_text))
    print(perp)

    return perp


# Evaluate the English language model on the English text file
perplexity_en2 = eval(lm_en2, 'en')
# print(perplexity_en2)
# perplexity_en3 = eval(lm_en3, 'en')
# print(perplexity_en3)


total_log_probability -1576.5060828185697
inf
0.0
total_log_probability -932.5415107257147
inf
0.0
total_log_probability -1139.7764848608122
inf
0.0
total_log_probability -1162.8006613787088
inf
0.0
total_log_probability -518.0463975195528
9.653655856946301e+224
1.2853708269805517e-05
total_log_probability -840.4245372036952
inf
0.0
total_log_probability -1013.1298024146419
inf
0.0
total_log_probability -990.1089545555199
inf
0.0
total_log_probability -472.018202380805
9.883269815766264e+204
1.3157368315189567e-05
total_log_probability -1312.45741920588
inf
0.0
total_log_probability -1496.4277475383951
inf
0.0
total_log_probability -587.1565809124345
9.97385622406046e+254
1.2478882904901695e-05
total_log_probability -1093.7182980056039
inf
0.0
total_log_probability -356.4006894140772
6.065306597127007e+154
1.455579234594107e-05
total_log_probability -1565.7491513504513
inf
0.0
total_log_probability -1254.8993027804288
inf
0.0
total_log_probability -863.4688591513209
inf
0.0
total_log_p

  perp = np.exp(-total_log_probability)


total_log_probability -1358.524612722267
inf
0.0
total_log_probability -1600.2949743519289
inf
0.0
total_log_probability -575.5910705128006
9.462932810639961e+249
1.2546116006750247e-05
total_log_probability -1358.5134931547752
inf
0.0
total_log_probability -1059.173592681897
inf
0.0
total_log_probability -1600.277430265267
inf
0.0
total_log_probability -1600.2941382908596
inf
0.0
total_log_probability -414.38688638549803
9.245664505710747e+179
1.3679043360727744e-05
total_log_probability -863.3887445921007
inf
0.0
total_log_probability -909.507120569171
inf
0.0
total_log_probability -506.3687204586903
8.187307530781786e+219
1.2973026607132124e-05
total_log_probability -1588.780826913114
inf
0.0
total_log_probability -817.4140416517487
inf
0.0
total_log_probability -1381.5481775069538
inf
0.0
total_log_probability -1082.1846556790308
inf
0.0
total_log_probability -1600.2918818453961
inf
0.0
total_log_probability -1600.2945066077878
inf
0.0
total_log_probability -1151.244209698685
inf
0

  perplexity_value = np.exp(-total_log_probability / total_tokens)


## Part 4
Implement the *match* function that calls *eval* using a specific value of *n* for every possible language pair among the languages we have data for. You should call *eval* for every language pair four times, with each call assign a different value for *n* (1-4). Each language pair is composed of the source language and the target language. Before you make the call, you need to call the *lm* function to create the language model for the source language. Then you can call *eval* with the language model and the target language. The function should return a pandas DataFrame with the following four columns: *source_lang*, *target_lang*, *n*, *perplexity*. The values for the first two columns are the two-letter language codes. The value for *n* is the *n* you use for generating the specific perplexity values which you should store in the forth column.

In [7]:
def match() -> pd.DataFrame:
  '''
  Return a DataFrame containing one line per every language pair and n_gram.
  Each line will contain the perplexity calculated when applying the language model
  of the source language on the text of the target language.
  :return: a DataFrame containing the perplexity values
  '''
  return pd.DataFrame()

## Part 5
Implement the *generate* function which takes a language code, *n*, the prompt (the starting text), the number of tokens to generate, and *r*, which is the random seed for any randomized action you plan to take in your implementation. The function should start generating tokens, one by one, using the language model of the given source language and *n*. The prompt should be used as a starting point for aligning on the probabilities to be used for generating the next token.

Note - The generation of the next token should be from the LM's distribution.

In [8]:
def generate(lang: str, n: int, prompt: str, number_of_tokens: int, r: int) -> str:
  '''
  Generate text in the given language using the given parameters.
  :param lang: the language of the model
  :param n: the n_gram value
  :param prompt: the prompt to start the generation
  :param number_of_tokens: the number of tokens to generate
  :param r: the random seed to use
  '''
  return ""

## Part 6
Play with your generate function, try to generate different texts in different language and various values of *n*. No need to submit anything of that.

# Testing

Copy the content of the **tests.py** file from the repo and paste below. This will create the results.json file and download it to your machine.

In [9]:
####################
# PLACE TESTS HERE #

# Create tests
def test_preprocess():
    return {
        'vocab_length': len(preprocess()),
    }

def test_lm():
    return {
        'english_2_gram_length': len(lm('en', 2)),
        'english_3_gram_length': len(lm('en', 3)),
        'french_3_gram_length': len(lm('fr', 3)),
        'spanish_3_gram_length': len(lm('es', 3)),
    }

def test_eval():
    return {
        'english_on_english': round(eval(lm('en', 3), 'en'), 2),
        'english_on_french': round(eval(lm('en', 3), 'fr'), 2),
        'english_on_spanish': round(eval(lm('en', 3), 'es'), 2),
    }

def test_match():
    df = match()
    return {
        'df_shape': df.shape,
        'en_en_1': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 1)]['perplexity'].values[0],
        'tl_tl_1': df[(df['source'] == 'tl') & (df['target'] == 'tl') & (df['n'] == 1)]['perplexity'].values[0],
        'tl_nl_4': df[(df['source'] == 'tl') & (df['target'] == 'nl') & (df['n'] == 4)]['perplexity'].values[0],
    }

def test_generate():
    return {
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

TESTS = [test_preprocess, test_lm, test_eval, test_match, test_generate]

# Run tests and save results
res = {}
for test in TESTS:
    print(f'Running test: {test.__name__}')
    try:
        cur_res = test()
        res.update({test.__name__: cur_res})
    except Exception as e:
        res.update({test.__name__: repr(e)})


# save the result file as results.json
with open('results.json', 'w') as f:
    json.dump(res, f, indent=2)


# with open('results.json', 'w') as f:
#     json.dump(res, f, indent=2)

# Download the results.json file
# files.download('results.json')

####################

In [10]:
# Show the local files, results.json should be there now and
# also downloaded to your local machine
!ls -l

'ls' is not recognized as an internal or external command,
operable program or batch file.
