In [47]:
from datasets import load_dataset
import pandas as pd
import re

In [138]:
from nltk.tokenize import word_tokenize, RegexpTokenizer, sent_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [180]:
tokenizer = RegexpTokenizer(r"\w+")
wnl = WordNetLemmatizer()

In [179]:
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.stem import WordNetLemmatizer


In [171]:
import sentencepiece, kenlm, statistics

# Calculating metrics on different datasets 

## glue-ax
A manually-curated evaluation dataset for fine-grained analysis of system performance on a broad range of linguistic phenomena. This dataset evaluates sentence understanding through Natural Language Inference (NLI) problems. Use a model trained on MulitNLI to produce predictions for this dataset.

In [None]:
dataset = load_dataset("glue", "ax")

In [30]:
rawdata= pd.DataFrame.from_dict(dataset)
axdata= pd.json_normalize(rawdata.test)

In [37]:
axdata.head()

Unnamed: 0,premise,hypothesis,label,idx
0,The cat sat on the mat.,The cat did not sit on the mat.,-1,0
1,The cat did not sit on the mat.,The cat sat on the mat.,-1,1
2,"When you've got no snow, it's really hard to l...","When you've got snow, it's really hard to lear...",-1,2
3,"When you've got snow, it's really hard to lear...","When you've got no snow, it's really hard to l...",-1,3
4,"Out of the box, Ouya supports media apps such ...","Out of the box, Ouya doesn't support media app...",-1,4


In [38]:
axdata.label.value_counts()

-1    1104
Name: label, dtype: int64

Why is it only negative labels?...

## Asset

In [39]:
asset = load_dataset("asset", "ratings")

Downloading:   0%|          | 0.00/2.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading and preparing dataset asset/ratings (download: 3.47 MiB, generated: 1012.55 KiB, post-processed: Unknown size, total: 4.46 MiB) to /home/sasha/.cache/huggingface/datasets/asset/ratings/1.0.0/62758c1bd7c109dfcf3d963fe61bc31625ce223c45bbe0df4ad72b9f5ce4f3ae...


0 examples [00:00, ? examples/s]

Dataset asset downloaded and prepared to /home/sasha/.cache/huggingface/datasets/asset/ratings/1.0.0/62758c1bd7c109dfcf3d963fe61bc31625ce223c45bbe0df4ad72b9f5ce4f3ae. Subsequent calls will reuse this data.


In [40]:
rawassetdata= pd.DataFrame.from_dict(asset)

In [44]:
assetdata= pd.json_normalize(rawassetdata.full)
assetdata.shape

(4500, 6)

In [43]:
assetdata.head()

Unnamed: 0,original,simplification,original_sentence_id,aspect,worker_id,rating
0,"Since 2000, the recipient of the Kate Greenawa...","Since 2000, the winner of the Kate Greenaway m...",7,0,7,55
1,"Since 2000, the recipient of the Kate Greenawa...","Since 2000, the winner of the Kate Greenaway m...",7,0,5,59
2,"Since 2000, the recipient of the Kate Greenawa...","Since 2000, the winner of the Kate Greenaway m...",7,1,5,27
3,"Since 2000, the recipient of the Kate Greenawa...","Since 2000, the winner of the Kate Greenaway m...",7,1,3,100
4,"Since 2000, the recipient of the Kate Greenawa...","Since 2000, the winner of the Kate Greenaway m...",7,2,8,36


In [47]:
assetdata.rating.value_counts()

100    882
0      726
1       99
50      93
4       84
      ... 
68      11
54      10
63      10
62      10
57      10
Name: rating, Length: 101, dtype: int64

## IMDB 

In [2]:
imdb = load_dataset("imdb")

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /home/sasha/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /home/sasha/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


In [7]:
imdb['train']

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [8]:
rawimdb= pd.DataFrame.from_dict(imdb['train'])

In [10]:
rawimdb.shape

(25000, 2)

### Label Distribution

In [16]:
rawimdb.label.value_counts()

1    12500
0    12500
Name: label, dtype: int64

### Pre-processing needed : removing html at the least, also removing punctuation and stopwords if needed

In [48]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

### Vocabulary Size

In [94]:
alllist=[cleanhtml(sent) for sent in rawimdb.text]
corp= ' '. join(s for s in alllist)

In [95]:
vocab = FreqDist(word.lower() for word in tokenizer.tokenize(corp))

In [147]:
print("There are " + str(len(vocab)) + " words including stop words")

There are 75949 words including stop words


In [150]:
nostopvocab = FreqDist(word.lower() for word in tokenizer.tokenize(corp) if word.lower() not in stopwords.words('english'))

In [151]:
print("There are " + str(len(nostopvocab)) + " words after removing stop words")

There are 75796 words after removing stop words


In [None]:
lemvocab = FreqDist(wnl.lemmatize(word.lower()) for word in tokenizer.tokenize(corp) if word.lower() not in stopwords.words('english'))

In [None]:
print("There are " + str(len(lemvocab)) + " words after removing stop words and lemmatizing")

### Avg/Mean/Median Text length

In [173]:
total_lens = 0
alllengths=[]
for i, sent in enumerate(alllist):
    lent=len(tokenizer.tokenize(sent))
    alllengths.append(lent)
    total_lens += lent

In [178]:
avg_sent_len = total_lens / i
print("The average sentence length is: " + str(round(avg_sent_len,4)) + " words.")

The average sentence length is: 238.1554 words.


In [176]:
print("The mean sentence length is: " + str(statistics.mean(alllengths)) + " words.")

The mean sentence length is: 238.14588 words.


In [177]:
print("The median sentence length is: " + str(statistics.median(alllengths)) + " words.")

The median sentence length is: 178.0 words.


#### Count most frequent words for each label

In [49]:
poslist=[cleanhtml(sent) for sent in rawimdb.text.loc[rawimdb.label == 1]]
poscorp= ' '. join(s for s in poslist)

In [50]:
neglist=[cleanhtml(sent) for sent in rawimdb.text.loc[rawimdb.label == 0]]
negcorp= ' '. join(s for s in neglist)

In [152]:
positive = FreqDist(word.lower() for word in tokenizer.tokenize(poscorp) if word.lower() not in stopwords.words('english'))

In [153]:
positive.most_common(10)

[('film', 20933), ('movie', 19074), ('one', 13653), ('like', 9036), ('good', 7721), ('story', 6778), ('time', 6515), ('great', 6418), ('well', 6407), ('see', 6025)]

In [85]:
posword=[]
for p in positive:
    posword.append(p.split(',')[0])
posword=posword[:10000]

In [84]:
negword=[]
for n in negative:
    negword.append(n.split(',')[0])
negword=negword[:10000]

In [154]:
negative = FreqDist(word.lower() for word in tokenizer.tokenize(negcorp) if word.lower() not in stopwords.words('english'))

In [155]:
negative.most_common(10)

[('movie', 24955), ('film', 19211), ('one', 13135), ('like', 11238), ('even', 7684), ('good', 7419), ('bad', 7394), ('would', 7036), ('really', 6262), ('time', 6208)]

#### Words only present in the top 10,000 most common positive words

In [156]:
onlypos= [w for w in posword if w not in negword]
len(onlypos)

2138

In [157]:
onlypos[:10]

['matthau', 'perfection', 'astaire', 'paulie', 'felix', 'flawless', 'superbly', 'gandhi', 'mildred', 'edie']

#### Words only present in the top 10,000 most common negative words

In [158]:
onlyneg= [w for w in negword if w not in posword]
len(onlyneg)

2138

In [159]:
onlyneg[:10]

['blah', 'atrocious', 'seagal', 'mst3k', 'boll', 'wasting', 'incoherent', 'drivel', 'appalling', 'miserably']

### Perplexity based on Wikipedia 

#### using the pretrained model from CCNet https://github.com/facebookresearch/cc_net/

In [143]:
test=alllist[1]

In [109]:
sp_model = sentencepiece.SentencePieceProcessor('en.sp.model')

In [117]:
model= kenlm.Model('/home/sasha/Documents/MilaPostDoc/Python/cc_net/data/lm_sp/en.arpa.bin')

In [144]:
score=0
doc_length=0
for sentence in sent_tokenize(test):
    sentence = sp_model.encode_as_pieces(sentence)
    score += model.score(" ".join(sentence))
    doc_length += len(sentence) + 1
print("Final score: " + str(score))

Final score: -1898.3147411346436


In [101]:
#from  https://stackoverflow.com/questions/54941966/how-can-i-calculate-perplexity-using-nltk/55043954

train_sentences = ['an apple', 'an orange']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) 
                for sent in train_sentences]
n = 1
train_data, padded_vocab = padded_everygram_pipeline(n, tokenized_text)
model = MLE(n)
model.fit(train_data, padded_vocab)

test_sentences = ['an apple', 'an ant']
tokenized_text = [list(map(str.lower, nltk.tokenize.word_tokenize(sent))) 
                for sent in test_sentences]

test_data, _ = padded_everygram_pipeline(n, tokenized_text)
for test in test_data:
    print ("MLE Estimates:", [((ngram[-1], ngram[:-1]),model.score(ngram[-1], ngram[:-1])) for ngram in test])

test_data, _ = padded_everygram_pipeline(n, tokenized_text)

for i, test in enumerate(test_data):
    print("PP({0}):{1}".format(test_sentences[i], model.perplexity(test)))

MLE Estimates: [(('an', ()), 0.5), (('apple', ()), 0.25)]
MLE Estimates: [(('an', ()), 0.5), (('ant', ()), 0.0)]
PP(an apple):2.8284271247461903
PP(an ant):inf
