In [1]:
import torch
import numpy as np
import datasets
import os
import umap
import evaluate
from pathlib import Path
from itertools import product
from IPython.core.debugger import set_trace
from datasets import Dataset, DatasetDict
from torch import nn
from sentence_transformers import SentenceTransformer
from nltk import sent_tokenize
from IPython.core.debugger import Pdb
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
from transformers import AutoModel, AutoTokenizer
from pprint import pprint

datasets.disable_caching()

# Set this to whatever you want
seed = 10

torch.manual_seed(seed)
np.random.seed(seed)

%load_ext autoreload
%autoreload 2
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
# Location of the parquet file holding the model generations to evaluate.
parquet_path = './data/sofsat_lora.parquet'

ds = Dataset.from_parquet(parquet_path)
print(ds)

Dataset({
    features: ['S1', 'S2', 'Sy', 'operation', 'output'],
    num_rows: 11000
})


# Clean Up Output

In [3]:
def map_fn(sample, marker='</op2>'):
    """Strip everything up to and including ``marker`` from ``sample['output']``.

    The untouched text is preserved under ``output_raw``. If ``output_raw`` is
    already present (the row has been cleaned before), it is used as the
    source text, so applying the function a second time reproduces the same
    row instead of failing because the marker is no longer in ``output``.

    Args:
        sample: mapping with an ``output`` string and, optionally, an
            ``output_raw`` string from a previous pass.
        marker: delimiter after which the cleaned output starts; the first
            occurrence is used.

    Returns:
        The same ``sample`` mapping with ``output`` replaced by the cleaned
        text and ``output_raw`` holding the original text.

    Raises:
        ValueError: if ``marker`` does not occur in the raw output.
    """
    # Always clean from the original text so re-running the cell is harmless.
    raw_output = sample['output_raw'] if 'output_raw' in sample else sample['output']
    # Derive the offset from the marker itself rather than a hard-coded length.
    start = raw_output.index(marker) + len(marker)
    sample['output_raw'] = raw_output
    sample['output'] = raw_output[start:]
    return sample

# Run map_fn over every row; the result gains an `output_raw` column.
ds = ds.map(function=map_fn)
print(ds)

Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11000/11000 [00:00<00:00, 21372.62 examples/s]

Dataset({
    features: ['S1', 'S2', 'Sy', 'operation', 'output', 'output_raw'],
    num_rows: 11000
})





# Compute ROUGE and BERTScore

In [4]:
# Collected metrics, keyed by metric name.
scores = {}

# ROUGE between the cleaned predictions and the reference sentences.
print('computing rouge')
scorer = evaluate.load('rouge')
scores['rouge'] = scorer.compute(
    predictions=ds['output'],
    references=ds['Sy']
)

# Keep the original preference for the second GPU, but fall back to the first
# GPU or the CPU so the cell still runs on hosts with fewer than two GPUs.
if torch.cuda.device_count() > 1:
    bertscore_device = 'cuda:1'
elif torch.cuda.is_available():
    bertscore_device = 'cuda:0'
else:
    bertscore_device = 'cpu'

print('computing bertscore')
scorer = evaluate.load("bertscore")
bertscore = scorer.compute(
    predictions=ds['output'],
    references=ds['Sy'],
    lang="en",
    batch_size=100,
    device=bertscore_device,
    verbose=True,
    rescale_with_baseline=True,
)
# Collapse the F1 values into a single mean for reporting.
scores['bertscore'] = np.mean(bertscore['f1'])

pprint(scores)

computing rouge
computing bertscore


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 220/220 [01:05<00:00,  3.38it/s]


computing greedy matching.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 110/110 [00:02<00:00, 54.34it/s]


done in 1401153.21 seconds, 0.01 sentences/sec
{'bertscore': 0.40140151977638544,
 'rouge': {'rouge1': 0.4769846361350538,
           'rouge2': 0.26436766516974775,
           'rougeL': 0.35949005296519115,
           'rougeLsum': 0.36950015524013735}}


# Show an Example

In [45]:
# Draw one row at random and show its inputs, operation, reference and prediction.
idx = np.random.randint(len(ds))
sample = ds[idx]

print(
    f'sample: {idx}',
    f's1: {sample["S1"]}',
    f's2: {sample["S2"]}',
    f'op: {sample["operation"]}',
    f'Reference: {sample["Sy"]}',
    f'Prediction: {sample["output"]}',
    sep='\n',
)

sample: 7561
s1: The Style A Hotel offers a $190 suite with a Jacuzzi and sauna, popular with young couples and others.
s2: Love hotels are not only for young couples; some, like one anonymous man, go when drunk and don't want to go home.
op: intersection
Reference: Though young couples make up the majority of customers, they are not the only ones.
Prediction: The Style A Hotel offers a suite with a Jacuzzi and sauna for couples looking for a little relaxation....


In [47]:
# Distinct values found in the `operation` column.
operation_kinds = set(ds['operation'])
print(operation_kinds)

{'intersection', 'union'}
