In [15]:
import json
import os
import pathlib
import pandas as pd
from glob import glob
import rouge
import nltk
import matplotlib.pyplot as plt
from scipy import stats


def get_human_summary(summary_path):
    try:
        with open("../../booksum/scripts/" + summary_path, encoding='utf-8') as f:
            summary_json = json.load(f)
            return summary_json["summary"]
    except Exception as e:
        print("Failed to read summary file: {}".format(e))
        return None


def calculate_F1():
    summaries_count = 0
    data = []
    used_files = []
    unique_books = set()
    unique_used_books = set()

    human_summaries = dict()

    f = open(pathlib.Path("../../booksum/alignments/chapter-level-summary-alignments/chapter_summary_aligned_all_split.jsonl"),
             encoding='utf-8')

    for line in f:
        content = json.loads(line)
        if content['source'] == 'pinkmonkey':
            continue
        text = get_human_summary(content['summary_path'])
        if text is not None:
            human_summaries[content['summary_path']] = {
                "chapter_title": content['book_id'],
                "source": content['source'],
                "summary_text": text,
            }

    print("Evaluating {} summary documents...".format(len(human_summaries)))

    for summary_path, summary in human_summaries.items():

        # Get all related summary documents.
        unique_books.add(summary['chapter_title'])
        # Special case for Around the World in Eighty (80) Days
        if summary['chapter_title'] == "Around the World in Eighty Days":
            related_summaries = list(filter(
                lambda curr_summary: curr_summary['chapter_title'] == 'Around the World in 80 Days', human_summaries.values()))

        elif summary['chapter_title'] == "Around the World in 80 Days":
            related_summaries = list(filter(
                lambda curr_summary: curr_summary['chapter_title'] == 'Around the World in Eighty Days', human_summaries.values()))

        else:
            related_summaries = list(filter(lambda curr_summary: curr_summary['chapter_title'] == summary[
                                     'chapter_title'] and curr_summary['source'] != summary['source'], human_summaries.values()))
        # Remember which files have been used.
        used_files.extend(related_summaries)
        # print(summary['chapter_title'], summary['source'])
        # print(related_summaries)

        # if there are no related summary documents, then just print.
        if len(related_summaries) == 0:
            print("No related summary documents were found for {}.".format(
                summary['chapter_title']))
            continue

        # # Run the ROUGE command using the current summary as the reference and the related summaries as hypotheses.
        # # Print the scores and save them.
        related_summary_texts = [curr_summary['summary_text']
                                 for curr_summary in related_summaries]

        evaluator = rouge.Rouge(metrics=['rouge-n'],
                                max_n=1,
                                limit_length=False)
        scores = evaluator.get_scores(
            summary['summary_text'], related_summary_texts)

        # print(scores['rouge-1'])
        data.append([scores['rouge-1']['f'], summary['chapter_title'], summary['source']])
        unique_used_books.add(summary['chapter_title'])
        summaries_count += 1
    return data, summaries_count, unique_books, unique_used_books


In [16]:
nltk.download('punkt')
data, summaries_count, unique_chapters, unique_used_chapters = calculate_F1()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jordan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Lord Jim/section_16_part_0.txt'
Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Main Street/section_3_part_1.txt'
Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Main Street/section_3_part_2.txt'
Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Main Street/section_3_part_3.txt'
Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Main Street/section_3_part_4.txt'
Failed to read summary file: [Errno 2] No such file or directory: '../../booksum_og/scripts/finished_summaries/novelguide/Main Street/section_3_part_5.txt'
Failed to read summary file: [Errno 2] No such file or directory: 

In [17]:

print("Unique chapters covered: {}".format(len(unique_chapters)))
print("Unique chapters used: {}".format(len(unique_used_chapters)))
ROUGE_list = [data_item[0] for data_item in data]
ROUGE_mean = sum(ROUGE_list) / len(ROUGE_list)
print("Mean ROUGE-1 F1: {}".format(ROUGE_mean))
print()

# # Comment these out to avoid saving the csv files.
df = pd.DataFrame(data, columns=["ROUGE-1 F1", "chapter-title", "source"])
# Save file.
df.to_csv("../csv_results/booksum_summaries/chapter-level-sum-comparison-results.csv")


Unique chapters covered: 7278
Unique chapters used: 2326
Mean ROUGE-1 F1: 0.4087415008740161



In [90]:
import json
import os
import pathlib
import pandas as pd
from glob import glob
import matplotlib.pyplot as plt
from scipy import stats


def calculate_book_F1():
    summaries_count2 = 0
    book_data = []
    used_files = []
    unique_books = set()
    unique_used_books = set()

    book_summaries = dict()

    f = open(pathlib.Path("../../booksum/alignments/book-level-summary-alignments/book_summaries_aligned_all.jsonl"),
             encoding='utf-8')

    for line in f:
        content = json.loads(line)
        if content['source'] == 'pinkmonkey':
            continue
        book_summaries[content['summary_path']] = {
            "title": content['title'],
            "source": content['source']
        }

    chapter_data = pd.read_csv(
        "../csv_results/booksum_summaries/chapter-level-sum-comparison-results.csv")

    print("Evaluating {} summary documents...".format(len(chapter_data)))

    for summary_path, summary in book_summaries.items():
        # Get all related summary documents.
        unique_books.add(summary['title'])
        print(summary['title'])
        print(chapter_data['chapter-title'].str.split("."))
        filter_ = (chapter_data['chapter-title'].str.split(".") == summary['title'] ) & (chapter_data['source'] != summary['source'])
        related_summaries = chapter_data[filter_]
        # Remember which files have been used.
        used_files.extend(related_summaries)
        # print(summary['chapter_title'], summary['source'])
        print(related_summaries)

        # if there are no related summary documents, then just print.
        if len(related_summaries) == 0:
            print("No related summary documents were found for {}.".format(
                summary['title']))
            continue

        # # Run the ROUGE command using the current summary as the reference and the related summaries as hypotheses.
        # # Print the scores and save them.
        book_mean = sum(related_summaries['ROUGE-1 F1']) / len(related_summaries)

        print(book_mean)

        # print(scores['rouge-1'])
        book_data.append([book_mean, summary_path])
        unique_used_books.add(summary['title'])
        summaries_count2 += 1
    return book_data, summaries_count2, unique_books, unique_used_books


In [91]:
book_data, book_summaries_count, unique_books2, unique_used_books2 = calculate_book_F1()


Evaluating 6367 summary documents...
The Brothers Karamazov
0                [The Prince, chapter x]
1               [The Prince, chapter xi]
2               [The Prince, chapter xv]
3              [The Prince, chapter xvi]
4             [The Prince, chapter xvii]
                      ...               
6362    [As You Like It, act 4, scene 3]
6363    [As You Like It, act 5, scene 1]
6364    [As You Like It, act 5, scene 2]
6365    [As You Like It, act 5, scene 3]
6366    [As You Like It, act 5, scene 4]
Name: chapter-title, Length: 6367, dtype: object
Empty DataFrame
Columns: [Unnamed: 0, ROUGE-1 F1, chapter-title, source]
Index: []
No related summary documents were found for The Brothers Karamazov.
The Brothers Karamazov
0                [The Prince, chapter x]
1               [The Prince, chapter xi]
2               [The Prince, chapter xv]
3              [The Prince, chapter xvi]
4             [The Prince, chapter xvii]
                      ...               
6362    [As You Like

In [74]:

print("Unique books covered: {}".format(len(unique_books2)))
print("Unique books used: {}".format(len(unique_used_books2)))
print(book_data)
ROUGE_list = [data_item[0] for data_item in book_data]

ROUGE_mean = sum(ROUGE_list) / len(ROUGE_list)

Unique books covered: 189
Unique books used: 0
[]


ZeroDivisionError: division by zero