# Set up

In [15]:
!git clone https://github.com/NLP-Reichman/assignment_1.git
!mv assignment_1/data data
!rm assignment_1/ -r

fatal: destination path 'assignment_1' already exists and is not an empty directory.
'mv' is not recognized as an internal or external command,
operable program or batch file.
'rm' is not recognized as an internal or external command,
operable program or batch file.


# Introduction
In this assignment you will be creating tools for learning and testing language models. The corpora that you will be working with are lists of tweets in 8 different languages that use the Latin script. The data is provided either formatted as CSV or as JSON, for your convenience. The end goal is to write a set of tools that can detect the language of a given tweet.
The relevant files are under the data folder:

- en.csv (or the equivalent JSON file)
- es.csv (or the equivalent JSON file)
- fr.csv (or the equivalent JSON file)
- in.csv (or the equivalent JSON file)
- it.csv (or the equivalent JSON file)
- nl.csv (or the equivalent JSON file)
- pt.csv (or the equivalent JSON file)
- tl.csv (or the equivalent JSON file)

In [16]:
import json
# from google.colab import files
import pandas as pd
import numpy as np
import glob
import os
import math
from collections import Counter


# Implementation

## Part 1
Implement the function *preprocess* that iterates over all the data files and creates a single vocabulary, containing all the tokens in the data. Our token definition is a single UTF-8 encoded character. So, the vocabulary list is a simple Python list of all the characters that you see at least once in the data.

Note - do NOT lowercase the sentences in whi HW.

In [17]:
# # load the csv file
# df = pd.read_csv('assignment_1/data/en.csv')
# df.head()
# np.unique(df['tweet_text'].str.cat(sep=''))
# len(''.join(set(df['tweet_text'].str.cat(sep=''))))



In [24]:
SOS = "<start> "
EOS = "</end>"
UNK = "<UNK>"

def preprocess() -> list[str]:
  '''
  Return a list of characters, representing the shared vocabulary of all languages
  '''
  df = pd.concat(map(pd.read_csv, glob.glob('assignment_1/data/*.csv')))
  # add SOS to the beginning of each tweet and EOS to the end
  df['tweet_text'] = SOS + df['tweet_text'] + EOS
  print(df.head())
  unique_chars = list(''.join(set(df['tweet_text'].str.cat(sep=''))))

  unique_chars = [SOS, EOS, UNK] + unique_chars
  return unique_chars

unique_chars = preprocess()
V = len(unique_chars)
V


             tweet_id                                         tweet_text
0  845395018743459840  <start> RT @ONHERPERlOD: Boyfriends that take ...
1  845395017917173760  <start> He got his surgery done today but he's...
2  845395018760306693  <start> @levi_a1998 @mcluber29 I'm doing so mu...
3  845395018336649216  <start> RT @Rt_YourFavBands: #BandsTournament2...
4  845395018751856642  <start> #Merlin oh no she wanted to enchant hi...


1806

## Part 2
Implement the function *lm* that generates a language model from a textual corpus. The function should return a dictionary (representing a model) where the keys are all the relevant *n*-1 sequences, and the values are dictionaries with the *n*_th tokens and their corresponding probabilities to occur. For example, for a trigram model (tokens are characters), it should look something like:

{ "ab":{"c":0.5, "b":0.25, "d":0.25}, "ca":{"a":0.2, "b":0.7, "d":0.1} }

which means for example that after the sequence "ab", there is a 0.5 chance that "c" will appear, 0.25 for "b" to appear and 0.25 for "d" to appear.

Note - You should think how to add the add_one smoothing information to the dictionary and implement it.

In [30]:
def lm(lang: str, n: int) -> dict[str, dict[str, float]]:
    '''
    Return a language model for the given lang and n_gram (n)
    :param lang: the language of the model
    :param n: the n_gram value
    :return: a dictionary where the keys are n_grams and the values are dictionaries
    '''
    df = pd.read_csv('assignment_1/data/' + lang + '.csv')
    # print(df.head())
    text = ''.join(df['tweet_text'].str.cat(sep=''))

    lm = {}

    for i in range(len(text)-n):
        ngram = text[i:i+n-1]
        suffix = text[i+n-1]

        if ngram not in lm:
            lm[ngram] = {}
            lm[ngram][suffix] = 1

        else:
            if suffix in lm[ngram]:
                lm[ngram][suffix] += 1
            else:
                lm[ngram][suffix] = 1

    # print(lm)

    for ngram in lm:
        sum_ngram = sum(lm[ngram].values())
        for suffix in lm[ngram]:
            lm[ngram][suffix] = lm[ngram][suffix] / sum_ngram

    # print(lm)

    return lm


ngrams = lm('en', 3)


In [20]:
# test_model = lm('en', 3)
# test_prompt = 'how are you'
# print(sorted(test_model[test_prompt[-2:]].items(), key=lambda x: x[1], reverse=True))
# word = np.random.choice(list(test_model[test_prompt[-2:]].keys()), 10, list(test_model[test_prompt[-2:]].values()))
# print(word)
# freq_dict = {}
# for i in range(10000):
#     word = np.random.choice(list(test_model[test_prompt[-2:]].keys()), 1, list(test_model[test_prompt[-2:]].values()))
#     word = str(word[0])
#     if word not in freq_dict:
#         freq_dict[word] = 1
#     else: freq_dict[word] += 1

# print(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True))

# print(type(str(word[0])))


In [21]:

# freq_dict = {}
# for i in range(10000):
#     word = random_from_distribution_dict(test_model[test_prompt[-2:]])
#     if word not in freq_dict:
#         freq_dict[word] = 1
#     else: freq_dict[word] += 1

# print(sorted(freq_dict.items(), key=lambda x: x[1], reverse=True))


In [28]:
math.log(1e-100)+math.log(1e-100)

-460.51701859880916

In [29]:
math.exp(-460)

1.6770203186015345e-200

## Part 3
Implement the function *eval* that returns the perplexity of a model (dictionary) running over the data file of the given target language.

In [22]:
def eval(model: dict, target_lang: str) -> float:
  '''
  Return the perplexity value calculated over applying the model on the text file
  of the target_lang language.
  :param model: the language model
  :param target_lang: the target language
  :return: the perplexity value
  '''
  df = pd.read_csv('assignment_1/data/' + target_lang + '.csv')
  # for row in df.iterrows():
  #   print(row)
  sum_perplexity = 0
  for index, row in df.iterrows():
    # print(row['tweet_text'])
    sum_perplexity += perplexity3(row['tweet_text'], model)
  
  sum_perplexity /= (len(df) - 1)
  return sum_perplexity

def perplexity(sentence, model):
  n = len(next(iter((model))))
  length = len(sentence)
  perp = 1

  for i in range(len(sentence)-n):
    ngram = sentence[i:i+n]
    suffix = sentence[i+n]
    # print(ngram, suffix)

    if ngram in model:
      if suffix in model[ngram]:
        perp *= model[ngram][suffix]
      else: perp *= 1/V
    else: perp *= 1/V
  print(sentence)
  # if perp == 0: return 1
  perp = 1/perp
  perp = perp**(1/length)
  return perp

def perplexity2(sentence, model):
  n = len(next(iter((model))))
  length = len(sentence)
  min_prob = math.log(1/V)
  perp = 0.0

  for i in range(len(sentence)-n):
    ngram = sentence[i:i+n]
    suffix = sentence[i+n]
    # print(ngram, suffix)

    if ngram in model:
      if suffix in model[ngram]:
        perp += np.log(model[ngram][suffix])
      else: perp += min_prob
    else: perp += min_prob
  perp = np.exp(perp)
  perp = perp/length
  return perp
  
def perplexity3(sentence, ngram_model):
    n = len(list(ngram_model.keys())[0])  # Get the value of n from the ngram model
    sentence = '<' + sentence + '>'  # Add < and > as start and end symbols

    # Initialize variables to store the total log probability and count of n-grams
    log_probability_sum = 0
    ngram_count = 0

    # Iterate over the sentence to calculate the perplexity
    for i in range(len(sentence) - n + 1):
        # Get the n-gram from the sentence
        ngram = sentence[i:i+n]
        
        # Check if the n-gram exists in the model
        if ngram in ngram_model:
            # Get the probability distribution for the n-gram
            probabilities = ngram_model[ngram]
            
            # Check if the next character exists in the probability distribution
            next_char = sentence[i+n]
            if next_char in probabilities:
                # Calculate the log probability of the n-gram
                log_probability = math.log(probabilities[next_char])
                log_probability_sum += log_probability
                ngram_count += 1
            else:
                # If the next character is not in the distribution, add a small value to avoid log(0)
                log_probability_sum += math.log(1e-10)
                ngram_count += 1
        else:
            # If the n-gram is not in the model, add a small value to avoid log(0)
            log_probability_sum += math.log(1e-10)
            ngram_count += 1

    # Calculate the average log probability
    average_log_probability = log_probability_sum / ngram_count

    # Calculate perplexity
    perplexity = math.exp(-average_log_probability)

    return perplexity


# perplexity2('RT @_xM_G_W_Vx_: ＲＥＴＷＥＥＴ  ＯＮＬＹ  ＩＦ   ＹＯＵ  ✨Ｆ✨ 〡✨Ｏ✨ 〡〡✨Ｌ✨ 〡〡〡✨Ｌ✨ 〡〡✨Ｏ✨ 〡✨Ｗ✨ ✨Ｂ✨ 〡✨Ａ✨ 〡〡✨Ｃ✨ 〡〡〡✨Ｋ✨  #F4F #MGWV #RT2GAIN #FOLLOWTRICK  #FOLLOW…', ngrams)
# perplexity2('RT @היי היי', ngrams)
perplexity2('hello there friend', lm)
# print("eval(ngrams, 'en') ", eval(lm('en', 3), 'en'))
# print("eval(ngrams, 'es') ", eval(lm('es', 3), 'es'))
# print("eval(ngrams, 'fr') ", eval(lm('fr', 3), 'fr'))
# print("eval(ngrams, 'pt') ", eval(lm('pt', 3), 'pt'))
# print("eval(ngrams, 'it') ", eval(lm('it', 3), 'it'))
# print()
# print("eval(ngram_false, 'en') ", eval(lm('en', 3), 'en'))
# print("eval(ngram_false, 'en') ", eval(lm('en', 3), 'es'))
# print("eval(ngram_false, 'en') ", eval(lm('en', 3), 'fr'))
# print("eval(ngram_false, 'en') ", eval(lm('en', 3), 'pt'))
# print("eval(ngram_false, 'en') ", eval(lm('en', 3), 'it'))
# print()
# print("eval(ngram_false, 'es') ", eval(lm('es', 3), 'es'))
# print("eval(ngram_false, 'es') ", eval(lm('es', 3), 'en'))
# print("eval(ngram_false, 'es') ", eval(lm('es', 3), 'fr'))
# print("eval(ngram_false, 'es') ", eval(lm('es', 3), 'pt'))
# print("eval(ngram_false, 'es') ", eval(lm('es', 3), 'it'))




7.162437156195358

## Part 4
Implement the *match* function that calls *eval* using a specific value of *n* for every possible language pair among the languages we have data for. You should call *eval* for every language pair four times, with each call assign a different value for *n* (1-4). Each language pair is composed of the source language and the target language. Before you make the call, you need to call the *lm* function to create the language model for the source language. Then you can call *eval* with the language model and the target language. The function should return a pandas DataFrame with the following four columns: *source_lang*, *target_lang*, *n*, *perplexity*. The values for the first two columns are the two-letter language codes. The value for *n* is the *n* you use for generating the specific perplexity values which you should store in the forth column.

In [23]:
def match() -> pd.DataFrame:
  '''
  Return a DataFrame containing one line per every language pair and n_gram.
  Each line will contain the perplexity calculated when applying the language model
  of the source language on the text of the target language.
  :return: a DataFrame containing the perplexity values
  '''

  languages = ['en', 'es', 'fr', 'in', 'it', 'nl', 'pt', 'tl']
  n_grams = [1, 2, 3, 4]
  # languages = languages[:2]
  # n_grams = n_grams[:2]

  data = []
  for source_lang in languages:
    for n in n_grams:
      model = lm(source_lang, n)
      for target_lang in languages:
        print(source_lang, n, target_lang)
        data.append([source_lang, target_lang, n, eval(model, target_lang)])

  return pd.DataFrame(data, columns=['source', 'target', 'n', 'perplexity'])

match1 = match()

IndexError: string index out of range

In [None]:
# match1.to_csv('match1.csv', index=False)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
    print(match1.loc[match1['n_gram'] == 3].sort_values(by='perplexity', ascending=False))


## Part 5
Implement the *generate* function which takes a language code, *n*, the prompt (the starting text), the number of tokens to generate, and *r*, which is the random seed for any randomized action you plan to take in your implementation. The function should start generating tokens, one by one, using the language model of the given source language and *n*. The prompt should be used as a starting point for aligning on the probabilities to be used for generating the next token.

Note - The generation of the next token should be from the LM's distribution.

In [None]:
def generate(lang: str, n: int, prompt: str, number_of_tokens: int, r: int) -> str:
  '''
  Generate text in the given language using the given parameters.
  :param lang: the language of the model
  :param n: the n_gram value
  :param prompt: the prompt to start the generation
  :param number_of_tokens: the number of tokens to generate
  :param r: the random seed to use
  '''
  model = lm(lang, n)
  length = len(prompt)
  print(prompt)
  for i in range(number_of_tokens):
    print(i)
    ngram = prompt[-n+1:]
    print(ngram)
    # print(model[ngram])
    prompt += random_from_distribution_dict(model[ngram])
    # word = np.random.choice(list(model[ngram].keys()), 1, 
                # list(model[ngram].values()))
    # print(word)
    # print(type(str(word[0])))
    # prompt += str(word[0])
    print(prompt)

  return prompt

def random_from_distribution_dict(dct):
    rand_val = np.random.rand()
    total = 0
    for k, v in dct.items():
        total += v
        if rand_val <= total:
            return k
    assert False, 'unreachable'



prompt = 'how are you friend?'
len(generate('en', 5, prompt, 200, 1))

In [None]:
r = 42
np.random.seed(r)

print(np.random.rand())
np.random.rand()

## Part 6
Play with your generate function, try to generate different texts in different language and various values of *n*. No need to submit anything of that.

In [None]:
# print("eval(ngrams, 'en') ", eval(lm('en', 4), 'en'))
# print(len(lm('en', 3)))
print(match1.shape)
# get results for the source language 'en' and n_gram 1
print(match1.loc[(match1['source_lang'] == 'en') & (match1['source_lang'] == 'en') & (match1['n_gram'] == 1)].sort_values(by='perplexity'))
print(match1.loc[(match1['source_lang'] == 'tl') & (match1['source_lang'] == 'tl') & (match1['n_gram'] == 1)].sort_values(by='perplexity'))
print(match1.loc[(match1['source_lang'] == 'tl') & (match1['source_lang'] == 'nl') & (match1['n_gram'] == 4)].sort_values(by='perplexity'))
print(match1.loc[(match1['source_lang'] == 'tl')].sort_values(by='perplexity'))




# Testing

Copy the content of the **tests.py** file from the repo and paste below. This will create the results.json file and download it to your machine.

In [None]:
####################
# PLACE TESTS HERE #

# Create tests
def test_preprocess():
    return {
        'vocab_length': len(preprocess()),
    }

def test_lm():
    return {
        'english_2_gram_length': len(lm('en', 2)),
        'english_3_gram_length': len(lm('en', 3)),
        'french_3_gram_length': len(lm('fr', 3)),
        'spanish_3_gram_length': len(lm('es', 3)),
    }

def test_eval():
    return {
        'english_on_english': round(eval(lm('en', 3), 'en'), 2),
        'english_on_french': round(eval(lm('en', 3), 'fr'), 2),
        'english_on_spanish': round(eval(lm('en', 3), 'es'), 2),
    }

def test_match():
    df = match()
    return {
        'df_shape': df.shape,
        'en_en_1': df[(df['source'] == 'en') & (df['target'] == 'en') & (df['n'] == 1)]['perplexity'].values[0],
        'tl_tl_1': df[(df['source'] == 'tl') & (df['target'] == 'tl') & (df['n'] == 1)]['perplexity'].values[0],
        'tl_nl_4': df[(df['source'] == 'tl') & (df['target'] == 'nl') & (df['n'] == 4)]['perplexity'].values[0],
    }

def test_generate():
    return {
        'english_2_gram': generate('en', 2, "I am", 20, 5),
        'english_3_gram': generate('en', 3, "I am", 20, 5),
        'english_4_gram': generate('en', 4, "I Love", 20, 5),
        'spanish_2_gram': generate('es', 2, "Soy", 20, 5),
        'spanish_3_gram': generate('es', 3, "Soy", 20, 5),
        'french_2_gram': generate('fr', 2, "Je suis", 20, 5),
        'french_3_gram': generate('fr', 3, "Je suis", 20, 5),
    }

TESTS = [test_preprocess, test_lm, test_eval, test_match, test_generate]

# Run tests and save results
res = {}
for test in TESTS:
    print(f'Running test: {test.__name__}')
    try:
        cur_res = test()
        res.update({test.__name__: cur_res})
    except Exception as e:
        res.update({test.__name__: repr(e)})


# save the result file as results.json
with open('results_test.json', 'w') as f:
    json.dump(res, f, indent=2)


# with open('results.json', 'w') as f:
#     json.dump(res, f, indent=2)

# Download the results.json file
# files.download('results.json')

####################

In [None]:
# Show the local files, results.json should be there now and
# also downloaded to your local machine
!ls -l