# Metrics
1. [Vocabulary Size](#Vocabulary-Size)
2. [Instance Characteristics](#Instance-Characteristics)
3. [Perplexity](#Perplexity-based-on-Wikipedia)

In [1]:
# Making sure the less common dependencies in this notebook are available for a user.
import sys
!{sys.executable} -m pip install sentencepiece
!{sys.executable} -m pip install ipywidgets





In [2]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import re

In [3]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.stem import WordNetLemmatizer

In [4]:
# NLTK resources required by later cells:
#   - 'stopwords': stop-word lists used in the vocabulary statistics.
#   - 'wordnet':   data used by WordNetLemmatizer in the vocabulary statistics.
#   - 'punkt':     tokenizer models used by sent_tokenize / word_tokenize in the
#                  perplexity cells; without it those calls raise LookupError
#                  on a fresh environment.
#                  NOTE(review): recent NLTK releases look for 'punkt_tab'
#                  instead -- confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/margaretmitchell/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/margaretmitchell/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [6]:
import sentencepiece, statistics

In [7]:
# Shared helpers used by the metric functions defined below.
# Tokens are the matches of r"\w+", i.e. runs of word characters; punctuation
# and whitespace are therefore not counted as tokens.
tokenizer = RegexpTokenizer(r"\w+")
# Lemmatizer used when counting the lemmatized vocabulary.
wnl = WordNetLemmatizer()

# Calculating metrics on different datasets 

In [8]:
# A 'Preprocessing' step -- Preprocessing should be in its own module
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not match a newline, so a span
    is only removed when its ``<`` and ``>`` are on the same line. Any
    ``<...>`` text is treated as a tag, whether or not it is real HTML.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)

## Dataset Characteristics

In [9]:
def print_data_basics(input_data, label_column, json_column=False, label_type='discrete'):
    """Print a quick overview of a dataset: a peek, its shape and its label distribution.

    Args:
        input_data: Anything accepted by ``pd.DataFrame.from_dict``.
        label_column: Name of the column holding the labels.
        json_column: Optional name of a column whose values are nested
            records; when given, that column is flattened with
            ``pd.json_normalize`` and the flattened frame is described instead.
        label_type: ``"discrete"`` prints value counts, ``"real"`` prints
            min / max / mean / variance. Any other value prints nothing
            after the "Label distribution" header.
    """
    frame = pd.DataFrame.from_dict(input_data)
    print("* Peek at data:")
    print(frame.head())

    if json_column:
        # Replace the frame by the flattened nested records and peek again.
        frame = pd.json_normalize(frame[json_column])
        print("\n* Peek at data:")
        print(frame.head())

    n_rows, n_cols = frame.shape
    print("\nNumber of rows: %s" % n_rows)
    print("\nNumber of columns: %s" % n_cols)

    print("\n* Label distribution:")
    # The label column is only looked up inside the branch that needs it.
    if label_type == "discrete":
        print(frame[label_column].value_counts())
        return
    if label_type == "real":
        values = np.array(frame[label_column])
        summaries = (
            ("Min:", values.min),
            ("Max:", values.max),
            ("Mean:", values.mean),
            ("Variance:", values.var),
        )
        for caption, compute in summaries:
            print(caption, compute())

## Vocabulary Size

In [10]:
def print_count_vocab(input_data, lower=True, language="english"):
    """Print three vocabulary sizes for a text.

    The sizes are the number of distinct tokens (1) overall, (2) after
    removing stop words, and (3) after removing stop words and lemmatizing.

    Args:
        input_data: The text to analyse, as a single string.
        lower: If True, every token is lowercased before counting and before
            the stop-word check. If False, tokens are compared as-is, so the
            stop-word check is case-sensitive.
            NOTE(review): the NLTK stop-word lists appear to be lowercase, in
            which case capitalised stop words survive when lower=False -- confirm.
        language: Name of the NLTK stop-word list to use.
    """
    tokens = tokenizer.tokenize(input_data)
    if lower:
        tokens = [token.lower() for token in tokens]

    # A set makes the per-token membership test constant-time.
    language_stopwords = set(stopwords.words(language))
    content_tokens = [token for token in tokens if token not in language_stopwords]

    vocab = FreqDist(tokens)
    filtered_vocab = FreqDist(content_tokens)
    # Lemmatize one token at a time. The lower=False branch used to pass a
    # whole generator to lemmatize() instead of lemmatizing each word.
    lem_vocab = FreqDist(wnl.lemmatize(token) for token in content_tokens)

    print("There are " + str(len(vocab)) + " words including stop words")
    print("There are " + str(len(filtered_vocab)) + " words after removing stop words")
    print("There are " + str(len(lem_vocab)) + " words after removing stop words and lemmatizing")

## Instance Characteristics

In [11]:
def print_text_stats(text_list):
    """Print the average, mean and median token count of the given texts.

    Each text is split with the notebook-level ``tokenizer`` and its number
    of tokens is taken as its length.

    Args:
        text_list: Iterable of strings (e.g. sentences or documents).
    """
    lengths = [len(tokenizer.tokenize(text)) for text in text_list]
    if not lengths:
        # No instances: report it instead of failing on a division by zero.
        print("No text instances to compute statistics for.")
        return
    # Divide by the number of instances. The previous code divided by the
    # last loop index (n - 1), which inflated the average and failed for a
    # single instance.
    avg_sent_len = sum(lengths) / len(lengths)
    print("The average sentence length is: " + str(round(avg_sent_len,4)) + " words.")
    print("The mean sentence length is: " + str(statistics.mean(lengths)) + " words.")
    # This line reports the median; it was previously mislabelled as "mean".
    print("The median sentence length is: " + str(statistics.median(lengths)) + " words.")

## Per-Label Characteristics

In [12]:
# TBD. Sasha had focused on imdb: most frequent words for each label, 
# and words only present in the top 10,000 most common positive/negative words

## Dataset: glue-ax
A manually-curated evaluation dataset for fine-grained analysis of system performance on a broad range of linguistic phenomena. This dataset evaluates sentence understanding through Natural Language Inference (NLI) problems. Use a model trained on MultiNLI to produce predictions for this dataset.

In [13]:
# Load the "ax" configuration of the GLUE benchmark (served from the local cache when available).
dataset = load_dataset("glue", "ax")

Reusing dataset glue (/Users/margaretmitchell/.cache/huggingface/datasets/glue/ax/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [14]:
# The records are nested under the "test" key (see the first peek in the output),
# so they are flattened via json_column before the statistics are printed.
print_data_basics(dataset, label_column="label", json_column="test")

* Peek at data:
                                                test
0  {'premise': 'The cat sat on the mat.', 'hypoth...
1  {'premise': 'The cat did not sit on the mat.',...
2  {'premise': 'When you've got no snow, it's rea...
3  {'premise': 'When you've got snow, it's really...
4  {'premise': 'Out of the box, Ouya supports med...

* Peek at data:
                                             premise  \
0                            The cat sat on the mat.   
1                    The cat did not sit on the mat.   
2  When you've got no snow, it's really hard to l...   
3  When you've got snow, it's really hard to lear...   
4  Out of the box, Ouya supports media apps such ...   

                                          hypothesis  label  idx  
0                    The cat did not sit on the mat.     -1    0  
1                            The cat sat on the mat.     -1    1  
2  When you've got snow, it's really hard to lear...     -1    2  
3  When you've got no snow, it's really hard

## Dataset: Asset

In [15]:
# Load the "ratings" configuration of the ASSET dataset.
asset = load_dataset("asset", "ratings")

Reusing dataset asset (/Users/margaretmitchell/.cache/huggingface/datasets/asset/ratings/1.0.0/62758c1bd7c109dfcf3d963fe61bc31625ce223c45bbe0df4ad72b9f5ce4f3ae)


In [16]:
# Records are nested under "full"; label_type="real" makes the function
# summarise the ratings by min / max / mean / variance instead of value counts.
print_data_basics(asset, label_column="rating", json_column="full", label_type="real")

* Peek at data:
                                                full
0  {'original': 'Since 2000, the recipient of the...
1  {'original': 'Since 2000, the recipient of the...
2  {'original': 'Since 2000, the recipient of the...
3  {'original': 'Since 2000, the recipient of the...
4  {'original': 'Since 2000, the recipient of the...

* Peek at data:
                                            original  \
0  Since 2000, the recipient of the Kate Greenawa...   
1  Since 2000, the recipient of the Kate Greenawa...   
2  Since 2000, the recipient of the Kate Greenawa...   
3  Since 2000, the recipient of the Kate Greenawa...   
4  Since 2000, the recipient of the Kate Greenawa...   

                                      simplification  original_sentence_id  \
0  Since 2000, the winner of the Kate Greenaway m...                     7   
1  Since 2000, the winner of the Kate Greenaway m...                     7   
2  Since 2000, the winner of the Kate Greenaway m...                     7   


## Dataset: IMDB 

In [17]:
# Load the IMDB dataset with its default configuration.
imdb = load_dataset("imdb")

Reusing dataset imdb (/Users/margaretmitchell/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


In [18]:
# Only the training split is analysed in the cells below.
imdb_train = imdb['train']

In [19]:
# The split already has flat "text" / "label" columns (see the output), so no json_column is needed.
print_data_basics(imdb_train, label_column="label")

* Peek at data:
                                                text  label
0  Bromwell High is a cartoon comedy. It ran at t...      1
1  Homelessness (or Houselessness as George Carli...      1
2  Brilliant over-acting by Lesley Ann Warren. Be...      1
3  This is easily the most underrated film inn th...      1
4  This is not the typical Mel Brooks film. It wa...      1

Number of rows: 25000

Number of columns: 2

* Label distribution:
1    12500
0    12500
Name: label, dtype: int64


In [20]:
# Preprocessing for IMDB: strip HTML tags from every review, then merge all
# reviews into one space-separated string for corpus-level vocabulary counts.
alllist = list(map(cleanhtml, imdb_train["text"]))
imdb_text = ' '.join(alllist)

In [21]:
# Vocabulary sizes over the whole concatenated training text, with lowercase normalization (lower=True).
print_count_vocab(imdb_text, True)

There are 75949 words including stop words
There are 75796 words after removing stop words
There are 68187 words after removing stop words and lemmatizing


### Perplexity based on Wikipedia 

#### using the pretrained model from CCNet https://github.com/facebookresearch/cc_net/

In [22]:
# Second cleaned review; used as the sample document scored in the perplexity cells below.
test=alllist[1]

In [23]:
# Load the SentencePiece model used to split sentences into pieces before scoring.
# NOTE(review): 'en.sp.model' is resolved relative to the working directory and the
# recorded output shows it was not found -- presumably it is the English model
# distributed with cc_net; confirm and document how to obtain it.
sp_model = sentencepiece.SentencePieceProcessor('en.sp.model')

OSError: Not found: "en.sp.model": No such file or directory Error #2

In [None]:
# TBD. Issue with accessing kenlm.
# NOTE(review): `kenlm` is not imported in any of the cells visible here, so this
# line raises NameError on a fresh kernel until the package is installed and imported.
# NOTE(review): the path is absolute and machine-specific; presumably it points to
# the English language model shipped with cc_net -- confirm and make it configurable.
model= kenlm.Model('/home/sasha/Documents/MilaPostDoc/Python/cc_net/data/lm_sp/en.arpa.bin')

In [None]:
# Score the sample document sentence by sentence with the KenLM model and
# turn the accumulated score into a document-level perplexity.
score=0
doc_length=0
for sentence in sent_tokenize(test):
    # Split the sentence into SentencePiece pieces; the model scores the
    # space-joined pieces.
    sentence = sp_model.encode_as_pieces(sentence)
    score += model.score(" ".join(sentence))
    # One extra unit per sentence -- presumably the end-of-sentence token
    # that the language model also scores; confirm against the kenlm docs.
    doc_length += len(sentence) + 1
print("Final score: " + str(score))
# doc_length was accumulated but never used. Derive the perplexity from it,
# assuming model.score returns a log10 probability (as kenlm documents) and
# following the cc_net convention 10 ** (-log_score / length).
if doc_length:
    perplexity = 10.0 ** (-score / doc_length)
    print("Perplexity: " + str(round(perplexity, 1)))

In [None]:
# Toy example of computing perplexity with an NLTK n-gram model, adapted
# from https://stackoverflow.com/questions/54941966/how-can-i-calculate-perplexity-using-nltk/55043954
# NOTE(review): this cell reassigns `model` and `test`, which earlier cells use
# for the kenlm model and the sample IMDB document; re-run those cells before
# scoring with kenlm again, or rename the variables here.

# Training corpus: each sentence is word-tokenized and lowercased.
train_sentences = ['an apple', 'an orange']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) 
                for sent in train_sentences]
# Model order: n = 1 means a unigram model.
n = 1
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(train_data, padded_vocab)

# Test corpus, tokenized the same way. 'ant' does not occur in the training
# sentences, so presumably the unsmoothed MLE model gives it probability 0 -- verify.
test_sentences = ['an apple', 'an ant']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) 
                for sent in test_sentences]

# Per-ngram estimates: score of the last word given the preceding context.
test_data, _ = padded_everygram_pipeline(n, tokenized_text)
for test in test_data:
    print ("MLE Estimates:", [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])

# The pipeline output is rebuilt here, presumably because the iterators
# returned above were consumed by the previous loop -- confirm against the NLTK docs.
test_data, _ = padded_everygram_pipeline(n, tokenized_text)

# Perplexity of each test sentence under the trained model.
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))