# SI7003 NLP - SI7016 Applied NLP
# Language Model n-gram
# KenLM
# this notebook can run on google colab
# example with big datasets
# Tatoeba Corpus - short sentences in several languages (eng, spa, fre, ita, etc)
## https://downloads.tatoeba.org/exports/sentences.tar.bz2
## https://downloads.tatoeba.org/exports/links.tar.bz2

## challenges:
## 1. picking the best-structured sentence among several combinations: improve composition, writing, language (English) learning
## 2. grammar correction (I has an dreem -> I have a dream)
## 3. classification (n models (classes), one new document, what class?)
## 4. language detection
## 5. word prediction or autocomplete

In [None]:
# Build KenLM from source on the local machine. This needs Ubuntu; for Mac or
# Windows read the platform-specific instructions. Without a native Linux
# install, Docker can be used to run Ubuntu; the Dockerfile and docker-compose
# files are provided for that.
#
# This demo also runs on Google Colab.
#
# Toolchain and libraries needed to compile KenLM.
!apt-get install -y build-essential cmake libboost-all-dev zlib1g-dev
# Clone only when the checkout is missing, so re-running the cell does not fail.
!test -d kenlm || git clone https://github.com/kpu/kenlm.git
# `mkdir -p` tolerates an existing build directory; a plain `mkdir` would fail
# on a re-run and abort the && chain before cmake/make.
!cd kenlm && mkdir -p build && cd build && cmake .. && make -j4

In [None]:
# Download the Tatoeba "links" export, then extract the bzip2-compressed
# tarball (-x extract, -j bzip2, -f archive file) into the current directory.
!wget https://downloads.tatoeba.org/exports/links.tar.bz2
!tar -xjf links.tar.bz2

In [None]:
# Download the Tatoeba "sentences" export and extract it into the current
# directory. Later cells read sentences.csv, presumably unpacked from this
# archive -- TODO confirm the extracted file name.
!wget https://downloads.tatoeba.org/exports/sentences.tar.bz2
!tar -xjf sentences.tar.bz2


In [None]:
# Split the export by language. grep -P keeps the rows that contain the
# language code surrounded by tabs; cut -f3 then keeps only the third
# tab-separated field (assumed to be the sentence text -- TODO confirm the
# column layout of sentences.csv).
# Spanish: filtered rows -> tatoeba_spa.tsv, text only -> corpus_spa.txt
!grep -P "\tspa\t" sentences.csv > tatoeba_spa.tsv
!cut -f3 tatoeba_spa.tsv > corpus_spa.txt
# English: filtered rows -> tatoeba_eng.tsv, text only -> corpus_eng.txt
!grep -P "\teng\t" sentences.csv > tatoeba_eng.tsv
!cut -f3 tatoeba_eng.tsv > corpus_eng.txt

In [None]:
# Normalize both corpora: lowercase every line, then delete every character
# that is not a-z, a Spanish accented letter (á é í ó ú ü ñ ç), ¿, ¡ or a space.
#
# Lowercasing uses GNU sed's \L rather than `tr '[:upper:]' '[:lower:]'`:
# GNU tr works byte by byte, so multi-byte UTF-8 capitals such as Á/É/Ñ were
# left uppercase and then deleted by the lowercase-only whitelist
# (e.g. "Él" was reduced to "l").
# NOTE(review): assumes GNU sed running under a UTF-8 locale -- confirm on the
# target machine (Colab / the Ubuntu Docker image).
!sed -e 's/.*/\L&/' -e 's/[^a-záéíóúüñç¿¡ ]//g' corpus_spa.txt > corpus_clean_spa.txt

# Same normalization (same whitelist) for the English corpus.
!sed -e 's/.*/\L&/' -e 's/[^a-záéíóúüñç¿¡ ]//g' corpus_eng.txt > corpus_clean_eng.txt


In [None]:
# English word-frequency list: take the English rows, keep the third
# tab-separated field, lowercase, strip punctuation, put one token per line
# (split on single spaces), then count identical tokens and sort by descending
# count into vocab_eng.txt.
!grep -P "\teng\t" sentences.csv | cut -f3 | \
tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]' | \
tr ' ' '\n' | sort | uniq -c | sort -nr > vocab_eng.txt

In [None]:
# Spanish word-frequency list, built the same way as the English one: take the
# Spanish rows, keep the third tab-separated field, lowercase, strip
# punctuation, put one token per line, then count identical tokens and sort by
# descending count into vocab_spa.txt.
#
# The language filter must be "\tspa\t". The earlier "\spa\t" was parsed by
# grep -P as whitespace + "pa" + tab, which never matches the "spa" column.
!grep -P "\tspa\t" sentences.csv | cut -f3 | \
tr '[:upper:]' '[:lower:]' | tr -d '[:punct:]' | \
tr ' ' '\n' | sort | uniq -c | sort -nr > vocab_spa.txt

In [None]:
# Train a Spanish n-gram model of order 3 (-o 3) on the cleaned corpus and
# write it in ARPA format. The order was previously 5, contradicting the n=3
# label and duplicating the 5-gram training cell.
# Note: the 5-gram cell writes to the same tatoeba_spa.arpa, so running it
# afterwards replaces this trigram model.
!kenlm/build/bin/lmplz -o 3 < corpus_clean_spa.txt > tatoeba_spa.arpa

In [None]:
# Train a Spanish n-gram model of order 5 (-o 5): lmplz reads the cleaned
# corpus on stdin and its stdout is redirected to tatoeba_spa.arpa.
!kenlm/build/bin/lmplz -o 5 < corpus_clean_spa.txt > tatoeba_spa.arpa

In [None]:
# Train an English n-gram model of order 5 (-o 5): lmplz reads the cleaned
# corpus on stdin and its stdout is redirected to tatoeba_eng.arpa.
!kenlm/build/bin/lmplz -o 5 < corpus_clean_eng.txt > tatoeba_eng.arpa

In [None]:
# Convert the Spanish ARPA model to KenLM's binary format (optional, but more
# efficient). Output file: tatoeba_spa.binary.
!kenlm/build/bin/build_binary tatoeba_spa.arpa tatoeba_spa.binary

In [None]:
# Convert the English ARPA model to KenLM's binary format.
# Output file: tatoeba_eng.binary.
!kenlm/build/bin/build_binary tatoeba_eng.arpa tatoeba_eng.binary

In [None]:
# Install the kenlm Python package (the bindings imported by the cells below).
!pip3 install kenlm

In [None]:
import kenlm

# Load the binary models that the build_binary cells write into the working
# directory (tatoeba_eng.binary / tatoeba_spa.binary). The '../../*.bin' paths
# used before are not produced anywhere in this notebook.
model_eng = kenlm.Model('tatoeba_eng.binary')
model_spa = kenlm.Model('tatoeba_spa.binary')

# NOTE(review): the training corpora were lowercased and stripped of
# punctuation, while this query is scored as typed -- presumably the capital
# "I" is treated as an unknown word; confirm whether queries should be
# normalized the same way.
sentence = "I have a dream"

# English model on an English sentence.
print("Log probability:", model_eng.score(sentence))
print("Perplexity:", model_eng.perplexity(sentence))

# Spanish model on the same English sentence, for comparison.
print("Log probability:", model_spa.score(sentence))
print("Perplexity:", model_spa.perplexity(sentence))

In [None]:
# Compare several candidate phrasings by scoring each one with the English
# model and printing its log probability and perplexity.
sentences = [
    "she be a dream",
    "she are an dream",
    "she is a dream",
]

for s in sentences:
    log_prob = model_eng.score(s)
    ppl = model_eng.perplexity(s)
    print(f"{s}\n  LogProb: {log_prob:.2f}  Perplexity: {ppl:.2f}\n")

In [None]:
# Query the Spanish model from Python with an English sentence; compare the
# output with what the English model gives.
import kenlm

model = kenlm.Model('tatoeba_spa.binary')

sentence = "I am John"
# Print each metric as "<label> <value>", log probability first.
for label, metric in (
    ("Log probability:", model.score),
    ("Perplexity:", model.perplexity),
):
    print(label, metric(sentence))