# Metrics
1. [Vocabulary Size](#Vocabulary-Size)
2. [Avg/Mean/Median Text length](#Avg/Mean/Median-Text-length)

In [None]:
# Making sure the less common dependencies in this notebook are available for a user.
# Running pip through sys.executable installs into the interpreter that backs this kernel.
# NOTE(review): kenlm, which the perplexity section calls, is not installed by this cell.
import sys
!{sys.executable} -m pip install sentencepiece
!{sys.executable} -m pip install ipywidgets

In [None]:
from datasets import load_dataset
import pandas as pd
import re

In [None]:
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [None]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.stem import WordNetLemmatizer

In [None]:
# NLTK resources the notebook relies on; without them a fresh environment
# fails with LookupError in the cells further down.
#   stopwords        -> stop-word filtering in the vocabulary statistics
#   wordnet, omw-1.4 -> WordNetLemmatizer.lemmatize()
#   punkt, punkt_tab -> sent_tokenize() / word_tokenize(); which of the two is
#                       looked up depends on the installed NLTK version
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:
# Tokenizer that keeps runs of word characters (\w+), so punctuation is dropped.
tokenizer = RegexpTokenizer(r"\w+")
# Lemmatizer used for the "after removing stop words and lemmatizing" vocabulary count.
wnl = WordNetLemmatizer()

In [None]:
import sentencepiece, statistics

# Calculating metrics on different datasets 

## glue-ax
A manually-curated evaluation dataset for fine-grained analysis of system performance on a broad range of linguistic phenomena. This dataset evaluates sentence understanding through Natural Language Inference (NLI) problems. Use a model trained on MultiNLI to produce predictions for this dataset.

In [None]:
dataset = load_dataset("glue", "ax")

In [None]:
# Wrap the loaded dataset in a frame, then flatten its "test" column with json_normalize.
rawdata = pd.DataFrame.from_dict(dataset)
axdata = pd.json_normalize(rawdata["test"])

In [None]:
axdata.head()

In [None]:
axdata.label.value_counts()

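Why are there only negative labels? Presumably because the gold labels of the GLUE test splits are withheld and stored as -1 (to be confirmed).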

## Asset

In [None]:
asset = load_dataset("asset", "ratings")

In [None]:
rawassetdata= pd.DataFrame.from_dict(asset)

In [None]:
# Flatten the "full" column into one column per field, then show (rows, columns).
assetdata = pd.json_normalize(rawassetdata["full"])
assetdata.shape

In [None]:
assetdata.head()

In [None]:
assetdata.rating.value_counts()

## IMDB 

In [None]:
imdb = load_dataset("imdb")

In [None]:
imdb['train']

In [None]:
rawimdb= pd.DataFrame.from_dict(imdb['train'])

In [None]:
rawimdb.shape

### Label Distribution

In [None]:
rawimdb.label.value_counts()

### Pre-processing needed: removing HTML at the least; also removing punctuation and stop words if needed

In [None]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy, so each tag is stripped on its own. ``.`` does
    not match a newline, so a ``<`` whose closing ``>`` sits on another line
    is left in place.
    """
    return re.sub('<.*?>', '', raw_html)

### Vocabulary Size

In [None]:
# Strip markup from every training review, then glue the reviews into one corpus string.
alllist = list(map(cleanhtml, rawimdb.text))
corp = ' '.join(alllist)

In [None]:
# Case-insensitive token counts over the whole corpus.
vocab = FreqDist(map(str.lower, tokenizer.tokenize(corp)))

In [None]:
# Number of distinct lowercased tokens.
vocab_size = len(vocab)
print("There are " + str(vocab_size) + " words including stop words")

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# filter rebuilt the list and scanned it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
nostopvocab = FreqDist(
    token
    for token in map(str.lower, tokenizer.tokenize(corp))
    if token not in english_stopwords
)

In [None]:
# Number of distinct lowercased tokens left once stop words are filtered out.
nostop_vocab_size = len(nostopvocab)
print("There are " + str(nostop_vocab_size) + " words after removing stop words")

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# filter rebuilt the list and scanned it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
# Stop words are filtered on the lowercased token; only the survivors are lemmatized.
lemvocab = FreqDist(
    wnl.lemmatize(token)
    for token in map(str.lower, tokenizer.tokenize(corp))
    if token not in english_stopwords
)

In [None]:
# Number of distinct lemmas after stop-word removal.
lem_vocab_size = len(lemvocab)
print("There are " + str(lem_vocab_size) + " words after removing stop words and lemmatizing")

### Avg/Mean/Median Text length

In [None]:
# Token count of every cleaned review plus a running total.
# The index `i` is kept because the following cell reads it after the loop.
alllengths = []
total_lens = 0
for i, review in enumerate(alllist):
    n_tokens = len(tokenizer.tokenize(review))
    total_lens += n_tokens
    alllengths.append(n_tokens)

In [None]:
# Divide by the number of texts. The leftover loop index `i` is the last
# position (count - 1), so dividing by it overstated the average.
avg_sent_len = total_lens / len(alllengths)
print("The average sentence length is: " + str(round(avg_sent_len,4)) + " words.")

In [None]:
# Arithmetic mean of the per-text token counts.
mean_len = statistics.mean(alllengths)
print("The mean sentence length is: " + str(mean_len) + " words.")

In [None]:
# Median of the per-text token counts.
median_len = statistics.median(alllengths)
print("The median sentence length is: " + str(median_len) + " words.")

#### Count most frequent words for each label

In [None]:
# Cleaned reviews whose label is 1, and the same reviews joined into one string.
poslist = [cleanhtml(review) for review in rawimdb.loc[rawimdb.label == 1, "text"]]
poscorp = ' '.join(poslist)

In [None]:
# Cleaned reviews whose label is 0, and the same reviews joined into one string.
neglist = [cleanhtml(review) for review in rawimdb.loc[rawimdb.label == 0, "text"]]
negcorp = ' '.join(neglist)

In [None]:
# Build the stop-word set once. Calling stopwords.words('english') inside the
# filter rebuilt the list and scanned it linearly for every single token.
english_stopwords = set(stopwords.words('english'))
# Counts of lowercased, non-stop-word tokens in the label-1 corpus.
positive = FreqDist(
    token
    for token in map(str.lower, tokenizer.tokenize(poscorp))
    if token not in english_stopwords
)

In [None]:
positive.most_common(10)

In [None]:
# First 10,000 keys of `positive`, each cut at its first comma.
# NOTE(review): this relies on FreqDist iterating most-frequent-first; confirm
# for the installed NLTK version.
posword = [entry.split(',')[0] for entry in positive][:10000]

In [None]:
negword=[]
for n in negative:
    negword.append(n.split(',')[0])
negword=negword[:10000]

In [None]:
negative = FreqDist(word.lower() for word in tokenizer.tokenize(negcorp) if word.lower() not in stopwords.words('english'))

In [None]:
negative.most_common(10)

#### Words only present in the top 10,000 most common positive words

In [None]:
# Look words up in a set: testing membership against the 10,000-item list
# rescanned it for every positive word. Order of `posword` is preserved.
negword_lookup = set(negword)
onlypos = [w for w in posword if w not in negword_lookup]
len(onlypos)

In [None]:
onlypos[:10]

#### Words only present in the top 10,000 most common negative words

In [None]:
# Look words up in a set: testing membership against the 10,000-item list
# rescanned it for every negative word. Order of `negword` is preserved.
posword_lookup = set(posword)
onlyneg = [w for w in negword if w not in posword_lookup]
len(onlyneg)

In [None]:
onlyneg[:10]

### Perplexity based on Wikipedia 

#### using the pretrained model from CCNet https://github.com/facebookresearch/cc_net/

In [None]:
test=alllist[1]

In [None]:
sp_model = sentencepiece.SentencePieceProcessor('en.sp.model')

In [None]:
# kenlm is not imported by any of the notebook's import cells, so it is
# brought into scope here to avoid a NameError.
import os
import kenlm

# The model location is machine-specific: set KENLM_MODEL_PATH to override it.
# The original absolute path remains the fallback.
kenlm_model_path = os.environ.get(
    'KENLM_MODEL_PATH',
    '/home/sasha/Documents/MilaPostDoc/Python/cc_net/data/lm_sp/en.arpa.bin',
)
model = kenlm.Model(kenlm_model_path)

In [None]:
# Score the document sentence by sentence with the KenLM model.
# NOTE(review): model.score() is taken to return a log10 probability, and the
# +1 per sentence is presumably the end-of-sentence token; confirm against the
# kenlm documentation.
score = 0
doc_length = 0
for sentence in sent_tokenize(test):
    pieces = sp_model.encode_as_pieces(sentence)
    score += model.score(" ".join(pieces))
    doc_length += len(pieces) + 1
print("Final score: " + str(score))
# doc_length was accumulated but never used: turn the summed log score into
# the per-token perplexity this section is about.
if doc_length:
    perplexity = 10.0 ** (-score / doc_length)
    print("Perplexity: " + str(round(perplexity, 1)))

In [None]:
#from  https://stackoverflow.com/questions/54941966/how-can-i-calculate-perplexity-using-nltk/55043954

def _lower_tokens(sentences):
    """Word-tokenize each sentence and lowercase every token."""
    return [[tok.lower() for tok in nltk.tokenize.word_tokenize(s)] for s in sentences]

# Fit a maximum-likelihood model of order n on two toy sentences.
train_sentences = ['an apple', 'an orange']
tokenized_text = _lower_tokens(train_sentences)
n = 1
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(train_data, padded_vocab)

test_sentences = ['an apple', 'an ant']
tokenized_text = _lower_tokens(test_sentences)

# First pass: the model's score for each (word, context) pair of every test sentence.
test_data, _ = padded_everygram_pipeline(n, tokenized_text)
for test in test_data:
    estimates = [((ngram[-1], ngram[:-1]), model.score(ngram[-1], ngram[:-1])) for ngram in test]
    print ("MLE Estimates:", estimates)

# Second pass: the pipeline is rebuilt because the loop above consumed its output.
test_data, _ = padded_everygram_pipeline(n, tokenized_text)
for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))