<a href="https://colab.research.google.com/github/kyunghyuncho/ammi-2019-nlp/blob/master/01-day-LM/ken_lm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KenLM Framework for Language Modeling


## Install KenLM

#### git clone https://github.com/vchahun/kenlm.git
#### pushd kenlm
#### ./bjam
#### python setup.py install
#### popd

### Note: to use n-gram orders above 6, change `-DKENLM_MAX_ORDER` in `ARGS` in `setup.py` before running `python setup.py install`. Set it to the maximum order you need. We will use 11 in this case.

#### Other Useful Links:
Download stable release and unzip: http://kheafield.com/code/kenlm.tar.gz

Need Boost >= 1.42.0 and bjam
*   Ubuntu: sudo apt-get install libboost-all-dev
*   Mac: brew install boost; brew install bjam

Run within kenlm directory:
    
*  mkdir -p build
  *  cd build
  *  cmake .. -DKENLM_MAX_ORDER=10
  *  make -j 4
 
pip install https://github.com/kpu/kenlm/archive/master.zip

For more information on KenLM see: https://github.com/kpu/kenlm and http://kheafield.com/code/kenlm/


In [1]:
import sys
# Add the utils/ directory (relative to the current working directory) to the
# module search path.
sys.path.append('utils/')

In [2]:
import kenlm
import os
import re
import utils.ngram_utils as ngram_utils


In [3]:
# Read data from .txt files and create lists of reviews

def _load_reviews(path):
    """Return the non-empty, newline-separated reviews stored in ``path``."""
    with open(path, 'r') as handle:
        raw_text = handle.read()
    return [line for line in raw_text.split('\n') if line]


# One list entry per review, for each split.
train_data = _load_reviews('../data/amazon_train.txt')
valid_data = _load_reviews('../data/amazon_valid.txt')
    

In [4]:
# Tokenize the Datasets
# Each call is unpacked into two values; judging by the names these are the
# per-review token lists and a collection of all tokens in the split
# (presumably -- confirm against utils/ngram_utils.py).
# TODO: this takes a really long time !! why?
train_data_tokenized, all_tokens_train = ngram_utils.tokenize_dataset(train_data)
valid_data_tokenized, all_tokens_valid = ngram_utils.tokenize_dataset(valid_data)


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Rebuild each training review as a single space-delimited string of its tokens.
train_data = [' '.join(tokens) for tokens in train_data_tokenized]
train_data[:3]

['this is a great tutu and at a really great price .',
 "it doesn ' t look cheap at all .",
 "i ' m so glad i looked on amazon and found such an affordable tutu that isn ' t made poorly ."]

In [None]:
# Rebuild each validation review as a single space-delimited string of its tokens.
valid_data = [' '.join(tokens) for tokens in valid_data_tokenized]
valid_data[:3]

['these are not sized right .',
 'a 3x is always big on me and these r cut wrong !',
 "i ' m returning them ."]

In [None]:
# Number of sentences in the (train, valid) splits.
tuple(map(len, (train_data, valid_data)))

(107790, 15172)

In [None]:
# Change directory to where you have the data.
# The location can be overridden with the AMMI_DATA_DIR environment variable so
# the notebook is not tied to one user's home directory; the original path is
# kept as the default. An absolute path keeps this cell safe to re-run.
path = os.environ.get('AMMI_DATA_DIR', '/home/roberta/ammi-2019-nlp/data/')
os.chdir(path)


## 3-gram model with KenLM

In [None]:
cat train.txt | /home/roberta/kenlm/bin/lmplz -o 3 > amazonLM3.arpa

=== 1/5 Counting and sorting n-grams ===
File stdin isn't normal.  Using slower read() instead of mmap().  No progress bar.
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:860352 2:75230912512 3:141057966080
Statistics:
1 71696 D1=0.690098 D2=0.962667 D3+=1.22676
2 1239185 D1=0.712943 D2=1.05296 D3+=1.36242
3 4834597 D1=0.772513 D2=1.0869 D3+=1.33918
Memory estimate for binary LM:
type     MB
probing 113 assuming -p 1.5
probing 120 assuming -r models -p 1.5
trie     44 without quantization
trie     24 assuming -q 8 -b 8 quantization 
trie     42 assuming -a 22 array pointer compression
trie     22 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:860352 2:19826960 3:96691940
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
####################################################################################################
=== 

In [None]:
# Compile the ARPA file into the binary model file amazonLM3.klm.
!/home/roberta/kenlm/bin/build_binary amazonLM3.arpa amazonLM3.klm

Reading amazonLM3.arpa
----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
****************************************************************************************************
SUCCESS


In [None]:
# Load the binary 3-gram model through the kenlm Python bindings.
model_3n = kenlm.LanguageModel('amazonLM3.klm')


## 5-gram KenLM

In [None]:
cat train.txt | /home/roberta/kenlm/bin/lmplz -o 5 > amazonLM5.arpa

=== 1/5 Counting and sorting n-grams ===
File stdin isn't normal.  Using slower read() instead of mmap().  No progress bar.
=== 2/5 Calculating and sorting adjusted counts ===
Chain sizes: 1:860352 2:21101352960 3:39565037568 4:63304056832 5:92318425088
Statistics:
1 71696 D1=0.690098 D2=0.962667 D3+=1.22676
2 1239185 D1=0.712943 D2=1.05296 D3+=1.36242
3 4834597 D1=0.796199 D2=1.09701 D3+=1.35908
4 9215190 D1=0.868874 D2=1.16401 D3+=1.3733
5 12376562 D1=0.898907 D2=1.2197 D3+=1.36975
Memory estimate for binary LM:
type     MB
probing 564 assuming -p 1.5
probing 651 assuming -r models -p 1.5
trie    261 without quantization
trie    142 assuming -q 8 -b 8 quantization 
trie    232 assuming -a 22 array pointer compression
trie    112 assuming -a 22 -q 8 -b 8 array pointer compression and quantization
=== 3/5 Calculating and sorting initial probabilities ===
Chain sizes: 1:860352 2:19826960 3:96691940 4:221164560 5:346543736
----5---10---15---20---25---30---35---40---45---50---55---60---65

In [None]:
# Compile the ARPA file into the binary model file amazonLM5.klm.
!/home/roberta/kenlm/bin/build_binary amazonLM5.arpa amazonLM5.klm


In [None]:
# Load the binary 5-gram model through the kenlm Python bindings.
model_5n = kenlm.LanguageModel('amazonLM5.klm')

## 7-gram KenLM


In [None]:
cat train.txt | /home/roberta/kenlm/bin/lmplz -o 7 > amazonLM7.arpa


In [None]:
# Compile the ARPA file into the binary model file amazonLM7.klm.
!/home/roberta/kenlm/bin/build_binary amazonLM7.arpa amazonLM7.klm


In [None]:
# Load the binary 7-gram model through the kenlm Python bindings.
model_7n = kenlm.LanguageModel('amazonLM7.klm')


## 10-gram KenLM


In [None]:
cat train.txt | /home/roberta/kenlm/bin/lmplz -o 10 > amazonLM10.arpa


In [None]:
# Compile the ARPA file into the binary model file amazonLM10.klm.
!/home/roberta/kenlm/bin/build_binary amazonLM10.arpa amazonLM10.klm


In [None]:
# Load the binary 10-gram model through the kenlm Python bindings.
model_10n = kenlm.LanguageModel('amazonLM10.klm')


### The KenLM model reports the log-likelihood (base-10 log probability) of a sentence, not perplexity, so we convert the summed scores and report the overall perplexity. The following function calculates the perplexity.

### Perplexity is defined as follows, $$ PPL = b^{- \frac{1}{N} \sum_{i=1}^N \log_b q(x_i)} $$ 

### All probabilities here are in log base 10, so to convert to perplexity we do the following:

### $$PPL = 10^{-\log_{10}(P) / N} $$ 

### where $\log_{10}(P)$ is the total log-probability (the sum of the KenLM scores; its negation is the total NLL), and $N$ is the word count.

In [None]:
def get_ppl(lm, sentences):
    """
    Compute the corpus-level perplexity of ``sentences`` under ``lm``.

    Each sentence is first re-spaced so that runs of punctuation are separated
    from runs of word characters, then scored with
    ``lm.score(sentence, bos=False, eos=False)`` (no sentence-boundary tokens).
    Scores and whitespace-delimited word counts are accumulated over the whole
    collection before converting to perplexity.

    Parameters
    ----------
    lm : object exposing ``score(sentence, bos=..., eos=...)``
        For example a ``kenlm.LanguageModel``. The returned score is treated
        as a base-10 log probability (a non-positive number); that is what the
        ``10 ** -(...)`` conversion below assumes.
    sentences : iterable of str
        Space-delimited sentences.

    Returns
    -------
    float
        ``10 ** -(sum of scores / total word count)``.

    Raises
    ------
    ValueError
        If ``sentences`` contains no words at all (empty collection or only
        blank strings), since perplexity is undefined for zero words.
    """
    # Group 1 is either a run of "word-ish" characters (including whitespace)
    # or a run of everything else; re-emitting it followed by one space splits
    # punctuation off the neighbouring words.
    respace = re.compile(r"([\w/'+$\s-]+|[^\w/'+$\s-]+)\s*")

    # Note: this is a sum of log10 probabilities, i.e. the *negated* NLL.
    total_log10_prob = 0
    total_wc = 0
    for sent in sentences:
        sent = respace.sub(r"\1 ", sent)
        total_wc += len(sent.strip().split())
        total_log10_prob += lm.score(sent, bos=False, eos=False)

    if total_wc == 0:
        raise ValueError("cannot compute perplexity: `sentences` contains no words")

    return 10 ** -(total_log10_prob / total_wc)


In [None]:
# 3-gram
train_ppl = get_ppl(model_3n, train_data)
valid_ppl = get_ppl(model_3n, valid_data)
train_ppl, valid_ppl

In [None]:
# 5-gram
train_ppl = get_ppl(model_5n, train_data)
valid_ppl = get_ppl(model_5n, valid_data)
train_ppl, valid_ppl

In [None]:
# 7-gram
train_ppl = get_ppl(model_7n, train_data)
valid_ppl = get_ppl(model_7n, valid_data)
train_ppl, valid_ppl

In [None]:
# 10-gram
train_ppl = get_ppl(model_10n, train_data)
valid_ppl = get_ppl(model_10n, valid_data)
train_ppl, valid_ppl

### Score Sentences

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['i like this product very much .']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['i like pandas']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

Function for loading the data

In [None]:
# Perplexity of one example sentence under each n-gram order.
# (The string literal was missing its closing quote, which made the whole
# cell a SyntaxError.)
sentences = ['this color is very ugly']
ppl3 = get_ppl(model_3n, sentences)
ppl5 = get_ppl(model_5n, sentences)
ppl7 = get_ppl(model_7n, sentences)
ppl10 = get_ppl(model_10n, sentences)
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['kigali is an awesome city !']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['i want to get a refund']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['this watch is not what i expected']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['this dress fits me perfectly !']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10

In [None]:
# Perplexity of one example sentence under each n-gram order.
sentences = ['my wife loves this ring']
ppl3, ppl5, ppl7, ppl10 = [
    get_ppl(lm, sentences) for lm in (model_3n, model_5n, model_7n, model_10n)
]
ppl3, ppl5, ppl7, ppl10