In [4]:
# io
import os
import re

# sentence tokenization
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')
import spacy

# huggingface
from transformers import pipeline


[nltk_data] Downloading package punkt to /home/hp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
INPUT_DIR = 'part3-text'
OUTPUT_DIR = 'part3-text'

In [6]:
def get_score(model):
    print(model)
    pipe = pipeline("summarization", model=model, device=0)
    # get number of samples
    sample_fn = os.listdir(INPUT_DIR)
    n_samples = len(sample_fn)//2

    # get filenames
    inp_fn = [os.path.join(INPUT_DIR, f'inp{i}.txt') for i in range(1,n_samples+1)]
    ref_fn = [os.path.join(INPUT_DIR, f'ref{i}.txt') for i in range(1,n_samples+1)]
    print('input text filenames:', inp_fn)
    print('reference text filenames:', ref_fn)

    # load in texts
    def clean_context(filename):
        with open(filename, 'r', encoding="utf8") as f:
            text = f.read()
        text = re.sub("\n", r' ', text)
        text = re.sub(r"\s{2,}", r' ', text)
        text = re.sub(r"“|”", r'"', text)
        text = re.sub(r"‘|’", r"'", text)
        text = re.sub(r"_", r'', text, re.ASCII)
        text = re.sub(r"\s{2,}", r' ', text)
        text = text.strip()
        return text
    if 'gpt' in model.lower():
        inp_text = [clean_context(fn)+"\nTL;DR:\n" for fn in inp_fn]
    else:
        inp_text = [clean_context(fn) for fn in inp_fn]
    ref_text = ["\n".join(sent_tokenize(clean_context(fn))) for fn in ref_fn]

    query_lens = [len(q) for q in inp_text]
    predictions = []
    for i, inp in enumerate(inp_text):
        print(f'Summarizing Input {i}')
        if 'gpt' in model.lower():
            predictions.append(pipe(inp, max_length=512, clean_up_tokenization_spaces=True))
        else:
            predictions.append(pipe(inp))
    print("Done")

    def clean_pred(preds, query_lens):
        out = []
        for p, n in zip(preds, query_lens):
            for x in p:
                if 'pegasus' in model.lower():
                    out.append(x['summary_text'].replace(" .<n>", ".\n"))
                elif 'gpt' in model.lower():
                    out.append('\n'.join(sent_tokenize(x['generated_text'][n:])))
                else:
                    out.append('\n'.join(sent_tokenize(x['summary_text'])))
            print(out[-1])
        return out
    predictions = clean_pred(predictions, query_lens)


    def eval_rouge(predictions, references):
        import evaluate
        rouge = evaluate.load('rouge')
        results = rouge.compute(
            predictions=predictions,
            references=references
        )
        return results
    res = eval_rouge(predictions, ref_text)
    del pipe
    return res
# del pipe

models = ['gpt2-xl', 't5-large', 'facebook/bart-large-cnn', 'google/pegasus-cnn_dailymail']
scores = {}
for m in models:
    scores[m] = get_score(m)

gpt2-xl


Downloading:   0%|          | 0.00/6.23G [00:00<?, ?B/s]

2022-12-03 20:31:14.714773: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-03 20:31:14.715352: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-03 20:31:14.715469: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-12-03 20:31:14.715515: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA

ResourceExhaustedError: {{function_node __wrapped__AssignVariableOp_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[50257,1600] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:AssignVariableOp]