In [23]:
!pip install sentence_transformers
!pip install rouge_score


Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 3.0 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 8.6 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.11.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 19.5 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 49.6 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 507 kB/s 
Collecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x8

In [24]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import json
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import torch
import math

In [25]:

def computeSimilarity(originalTextList, paraphraseTextList, model=None):
    """Score aligned sentence pairs by cosine similarity of their embeddings.

    Both lists are encoded with a sentence-embedding model and the cosine
    similarity is computed pair by pair.

    Args:
        originalTextList: list of source sentences (str).
        paraphraseTextList: list of paraphrases, aligned index-by-index with
            ``originalTextList``.
        model: optional, already-loaded encoder exposing
            ``encode(list_of_str)`` (for example a ``SentenceTransformer``).
            When omitted, 'bert-base-nli-mean-tokens' is loaded inside this
            call; pass a model explicitly to avoid reloading it on every call.

    Returns:
        list of float, one cosine similarity per pair. Pairs are formed with
        ``zip``, so surplus items of the longer list are ignored.
    """
    # Nothing to score: skip the (expensive) model load entirely.
    if len(originalTextList) == 0 or len(paraphraseTextList) == 0:
        return []

    if model is None:
        model = SentenceTransformer('bert-base-nli-mean-tokens')
        #model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

    originalSentEmbeddings = model.encode(originalTextList)
    paraphraseSentEmbeddings = model.encode(paraphraseTextList)

    # cosine_similarity works on 2-D inputs and returns a 1x1 matrix here,
    # hence the wrapping lists and the [0][0] unpacking.
    return [
        cosine_similarity([originalSent], [paraphraseSent]).tolist()[0][0]
        for originalSent, paraphraseSent in zip(originalSentEmbeddings, paraphraseSentEmbeddings)
    ]


def computeBLEU(originalTextList, paraphraseTextList, weights=(1, 0, 0, 0)):
    """Compute sentence-level BLEU for each aligned sentence pair.

    Each original sentence is the single reference and the matching paraphrase
    is the hypothesis passed to ``sentence_bleu``.

    Args:
        originalTextList: list of reference sentences (str).
        paraphraseTextList: list of hypothesis sentences (str), aligned
            index-by-index with ``originalTextList``.
        weights: n-gram weights forwarded to ``sentence_bleu``. The default
            (1, 0, 0, 0) scores unigrams only.

    Returns:
        list of BLEU scores, one per pair. Pairs are formed with ``zip``, so
        surplus items of the longer list are ignored.
    """
    # str.split() with no argument splits on any whitespace run, so repeated
    # spaces or tabs do not create empty-string tokens that would be counted
    # as unigrams.
    return [
        sentence_bleu([originalSent.split()], paraphraseSent.split(), weights=weights)
        for originalSent, paraphraseSent in zip(originalTextList, paraphraseTextList)
    ]


def computeROUGE(originalTextList, paraphraseTextList):
    """Compute ROUGE-L precision for each aligned sentence pair.

    The original sentence is passed as the target and the paraphrase as the
    prediction of ``RougeScorer.score``. Stemming is disabled.

    Args:
        originalTextList: list of target sentences (str).
        paraphraseTextList: list of predicted sentences (str), aligned
            index-by-index with ``originalTextList``.

    Returns:
        list with the precision component of the 'rougeL' score of every pair.
        Pairs are formed with ``zip``, so surplus items of the longer list are
        ignored.
    """
    # The scorer's configuration does not depend on the pair being scored, so
    # it is built once instead of once per iteration.
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False)
    return [
        # Score is a (precision, recall, fmeasure) tuple; the named field
        # replaces the positional index 0.
        scorer.score(originalSent, paraphraseSent)['rougeL'].precision
        for originalSent, paraphraseSent in zip(originalTextList, paraphraseTextList)
    ]


def completeEvaluate(originalTextFilename, generatedTextFilename, reportFilename=None):
    """Print average BLEU, ROUGE and embedding similarity for two aligned files.

    The two files are read in parallel, line by line (stopping at the end of
    the shorter one). A pair is skipped when the generated line contains the
    marker '<ERROR>'; the remaining lines are stripped of surrounding
    whitespace before scoring.

    Args:
        originalTextFilename: path of the file with one source sentence per
            line.
        generatedTextFilename: path of the file with one generated paraphrase
            per line, aligned with the source file.
        reportFilename: optional path. When given, one line
            "BLEU<TAB>ROUGE<TAB>similarity" is written per scored pair.

    Returns:
        None. The number of kept pairs and the three averages are printed.
    """
    originalList = []
    paraphraseList = []
    # Context managers close both files even if reading raises.
    with open(originalTextFilename, 'r') as originalFile, \
            open(generatedTextFilename, 'r') as generatedFile:
        for originalLine, generatedLine in zip(originalFile, generatedFile):
            if '<ERROR>' not in generatedLine:
                paraphraseList.append(generatedLine.strip())
                originalList.append(originalLine.strip())

    print(len(originalList), len(paraphraseList))
    if not originalList:
        # Averages are undefined without data; avoid a ZeroDivisionError.
        print('No valid sentence pairs to evaluate.')
        return

    BLEUScores = computeBLEU(originalList, paraphraseList)
    ROUGEScores = computeROUGE(originalList, paraphraseList)
    SimilarityScores = computeSimilarity(originalList, paraphraseList)
    print('Average BLEU: ' + str(sum(BLEUScores) / len(BLEUScores)))
    print('Average ROUGE: ' + str(sum(ROUGEScores) / len(ROUGEScores)))
    print('Average Similarity: ' + str(sum(SimilarityScores) / len(SimilarityScores)))

    if reportFilename is not None:
        with open(reportFilename, 'w') as scoreReportFile:
            for bScore, rScore, sScore in zip(BLEUScores, ROUGEScores, SimilarityScores):
                scoreReportFile.write(str(bScore) + '\t' + str(rScore) + '\t' + str(sScore) + '\n')

    print('DONE')


def verifyKeyComponents(generatedFilename, itemFilename, reportFilename):
    """Measure how many key items appear in each generated line.

    The generated file and the item file are read in parallel, line by line.
    Each item line is a JSON object whose '<RST>' and '<EXN>' values are
    concatenated into one collection of items (presumably lists of strings —
    TODO confirm against the item file format). The coverage of a line is the
    fraction of items found, case-insensitively, as substrings of the
    generated line.

    One line is written to the report per input line:
      * the coverage value for scored lines,
      * the generated line itself when it contains '<ERROR>',
      * 'nan' when the item collection is empty (coverage is undefined).
    Only scored lines contribute to the printed average.

    Args:
        generatedFilename: path of the generated-text file, one text per line.
        itemFilename: path of the JSON-lines item file aligned with it.
        reportFilename: path of the per-line report to write.

    Returns:
        None. The average coverage is printed.
    """
    containedCount = 0
    totalCount = 0
    # One with-statement so all three handles are closed even on errors.
    with open(generatedFilename, 'r') as generatedFile, \
            open(itemFilename, 'r') as itemFile, \
            open(reportFilename, 'w') as reportFile:
        for generatedLine, itemLine in zip(generatedFile, itemFile):
            if '<ERROR>' in generatedLine:
                reportFile.write(generatedLine)
                continue

            itemData = json.loads(itemLine.strip())
            itemList = itemData['<RST>'] + itemData['<EXN>']
            if not itemList:
                # 0/0 coverage: keep the report line-aligned but leave the
                # line out of the average instead of dividing by zero.
                reportFile.write('nan\n')
                continue

            loweredLine = generatedLine.lower()  # lower-case once per line
            lineItemCount = sum(1 for item in itemList if item.lower() in loweredLine)
            lineCoverage = lineItemCount / len(itemList)
            containedCount += lineCoverage
            totalCount += 1
            reportFile.write(str(lineCoverage) + '\n')

    if totalCount == 0:
        print('Coverage: n/a (no scorable lines)')
        return

    print('Coverage: ' + str(containedCount/totalCount))


def generatePerplexity(generatedFilename, reportFilename):
    """Score every generated line with GPT-2 perplexity and print the average.

    'gpt2-medium' is loaded and each line is scored as exp(loss), where the
    loss is what the model returns when the token ids are also passed as
    labels.

    One line is written to the report per input line:
      * the perplexity for scored lines,
      * the input line itself when it contains '<ERROR>',
      * 'nan' for lines with fewer than two tokens (no next-token prediction
        exists, so there is no loss to exponentiate).
    Only scored lines contribute to the printed average.

    Args:
        generatedFilename: path of the generated-text file, one text per line.
        reportFilename: path of the per-line report to write.

    Returns:
        None. Progress (every 5000 lines), the average perplexity and the
        index of the last line read are printed.
    """
    # Fall back to CPU so the function also runs on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_id = 'gpt2-medium'
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
    model.eval()
    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)

    scoreSum = 0
    totalCount = 0
    index = None  # stays None when the input file is empty
    #totalData = ["ivent to the sound of a couple of fuses : 29 per cent of the fine is for the use of this."]
    with open(reportFilename, 'w') as reportFile, open(generatedFilename, 'r') as fi:
        for index, line in enumerate(fi):
            if index%5000 == 0:
                print(index)
            if '<ERROR>' in line:
                reportFile.write(line)
                continue

            token_ids = tokenizer.encode(line.strip())
            if len(token_ids) < 2:
                # Keep the report line-aligned, but keep the undefined value
                # out of the running sum so it cannot turn the average into nan.
                reportFile.write('nan\n')
                continue

            input_ids = torch.tensor(token_ids).unsqueeze(0).to(device)
            with torch.no_grad():
                outputs = model(input_ids, labels=input_ids)
            loss = outputs[0]  # first output is the loss when labels are given
            ppl = math.exp(float(loss))
            scoreSum += ppl
            totalCount += 1
            reportFile.write(str(ppl)+'\n')

    if totalCount == 0:
        print('No lines could be scored.')
    else:
        print(scoreSum/totalCount)
    if index is not None:
        print(index)
    print('DONE')
    return None





In [None]:
if __name__ == '__main__':

    # Earlier file-based runs, kept for reference only.
    # NOTE(review): these calls pass `repeatTimes` / `split` keyword arguments
    # that none of the functions defined in this notebook accept; update them
    # before re-enabling.
    #
    # completeEvaluate('drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/commTweets.content',
    #                  'drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/results/commTweets.full.copynet',
    #                  'drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/reports/commTweets.full.copynet.performance',
    #                  repeatTimes=3, split=True)
    #
    # verifyKeyComponents('drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/results/commTweets.full.copynet',
    #                     'drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/commTweets.item',
    #                     'drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/reports/commTweets.full.copynet.verify',
    #                     repeatTimes=3)
    #
    # generatePerplexity('drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/results/commTweets.full.copynet',
    #                    'drive/My Drive/Cui_workspace/Data/TweetParaphrase/commTweets/test_single/reports/commTweets.full.copynet.perplexity')

    # Smoke test: score two hand-written sentence pairs with the embedding
    # similarity metric and print the per-pair scores.
    list_a = [
        "Quang is doing the assignment for the NLP class",
        "Quang is writting the essay for the informatic class",
    ]
    list_b = [
        "Quang have done the assignment for the NLP class",
        "Quang complete the essay for the informatic class",
    ]
    print(computeSimilarity(list_a, list_b))

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

[0.9271193146705627, 0.9264665842056274]


In [5]:
import os
# NOTE(review): this directory lives under /content/drive, which is the mount
# point used by the `drive.mount('/content/drive')` cell that appears further
# down in this notebook. On a fresh kernel that mount cell has to run first.
os.chdir('/content/drive/MyDrive/VIN_NLP/ThanhQuang_NLP')

In [6]:
import pandas as pd

In [12]:
# NOTE(review): the variable is named "out_domain" but the file being loaded is
# 'eval_T5_in_domain.csv' — confirm which evaluation set is intended and make
# the two agree.
eval_t5_out_domain = pd.read_csv('eval_T5_in_domain.csv')

In [14]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
eval_t5_out_domain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   src              100 non-null    object
 1   paraphrase       100 non-null    object
 2   more_paraphrase  100 non-null    object
dtypes: object(3)
memory usage: 2.5+ KB


In [9]:
# Drop rows with missing values. The result is bound to the same name, so the
# unfiltered frame is no longer available after this cell has run.
eval_t5_out_domain = eval_t5_out_domain.dropna()

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): the earlier `os.chdir('/content/drive/...')` cell needs this
# mount to exist, so this cell must be executed before it on a fresh kernel.
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
# Summarise the frame again after dropna().
# NOTE(review): the saved output of this cell reports 498 rows while the
# earlier info() output reports 100 rows, so the two outputs apparently come
# from different runs or input files — re-run the notebook top to bottom.
eval_t5_out_domain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 498 entries, 0 to 499
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   src              498 non-null    object
 1   paraphrase       498 non-null    object
 2   more_paraphrase  498 non-null    object
dtypes: object(3)
memory usage: 15.6+ KB


In [19]:
# Pull the two text columns out as plain Python lists. In the metric calls
# below, `ground_truth` ('src') is passed as the original/reference side and
# `paraphrase` as the generated side.
ground_truth = eval_t5_out_domain['src'].tolist()
paraphrase = eval_t5_out_domain['paraphrase'].tolist()


In [26]:
# Per-pair embedding cosine similarity, then its mean over all pairs.
cos_simi = computeSimilarity(ground_truth,paraphrase)
print(sum(cos_simi)/len(cos_simi))



Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

0.8721898746490478


In [27]:
# Per-pair sentence-level BLEU, then its mean over all pairs. computeBLEU
# scores with unigram-only weights (1, 0, 0, 0).
bleu = computeBLEU(ground_truth,paraphrase)
print(sum(bleu)/len(bleu))

0.3649771382442363


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


In [28]:
# Per-pair ROUGE-L, then its mean over all pairs. computeROUGE keeps the first
# field of the 'rougeL' score (precision in rouge_score's Score tuple).
rouge = computeROUGE(ground_truth,paraphrase)
print(sum(rouge)/len(rouge))

0.5500886674656491
