In [4]:
from datasets import load_dataset

dataset = load_dataset("Pclanglais/Brahe-Novels")

# print(dataset.keys())

# only train dataset is available
train_dataset = dataset['train']
# validation_dataset = dataset['validation']
# test_dataset = dataset['test']

dict_keys(['train'])


In [5]:
print(f"Number of training samples: {len(train_dataset)}")
# print(f"Number of validation samples: {len(validation_dataset)}")
# print(f"Number of test samples: {len(test_dataset)}")

Number of training samples: 8226


In [15]:
!pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 1.6 MB/s eta 0:00:01
Using legacy 'setup.py install' for langdetect, since package 'wheel' is not installed.
Installing collected packages: langdetect
    Running setup.py install for langdetect ... [?25ldone
[?25hSuccessfully installed langdetect-1.0.9
You should consider upgrading via the '/Users/lynn/Desktop/thesis/graph/bin/python3 -m pip install --upgrade pip' command.[0m


In [27]:
import random
import json
from collections import Counter
from transformers import BartTokenizer
from datasets import load_dataset
from langdetect import detect


def save_to_disk(data, filename):
    """Save the data to a JSON file."""
    with open(filename, 'w') as f:
        json.dump(data, f)

def tokenize_text(tokenizer, text):
    """Tokenize the text using the BART tokenizer."""
    return tokenizer.encode(text, add_special_tokens=False)

def count_tokens(tokenizer, text):
    """Count tokens in the text using the BART tokenizer."""
    return len(tokenize_text(tokenizer, text))

def filter_long_texts(dataset, tokenizer, max_tokens):
    """Filter out examples where the text length exceeds max_tokens."""
    filtered_examples = []
    for example in dataset:
        if count_tokens(tokenizer, example['full_text']) <= max_tokens:
            filtered_examples.append(example)
    return filtered_examples

def filter_non_english(dataset):
    """Filter out non-English examples based on language detection."""
    english_examples = []
    for example in dataset:
        try:
            # Detect the language of the article
            if detect(example['full_text']) == 'en':
                english_examples.append(example)
        except:
            # Skip examples where language detection fails
            continue
    return english_examples

tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# # filter out texts longer than 1050 tokens
# filtered_dataset = filter_long_texts(train_dataset, tokenizer, 1024)
# print(f"After filtering out the long texts: {len(filtered_dataset)}")

filtered_dataset = filter_non_english(train_dataset)
print(f"After filtering out the non English texts: {len(filtered_dataset)}")


# save the selected examples to disk
save_to_disk(filtered_dataset, 'brahe.json')


After filtering out the non English texts: 4302


In [28]:
# function to discard unwanted data from analysis
def transform_data(input_file, output_file):
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    transformed_data = []
    for entry in data:
        summary = entry.get('analysis', '').split('\n')[0].replace('Summary: ', '')
        
        new_entry = {
            'id': entry.get('instruction_id', ''),
            'document': entry.get('full_text', ''),
            'summary': summary,
        }
        
        transformed_data.append(new_entry)
    
    with open(output_file, 'w') as f:
        json.dump(transformed_data, f, indent=4)

input_file = 'brahe.json'
output_file = 'transformed_brahe.json'  
transform_data(input_file, output_file)


In [6]:
import json
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict

# nltk.download('punkt')

def compute_fragments(source_text, summary_text):
    # tokenize the source document and summary
    source_tokens = word_tokenize(source_text.lower())
    summary_tokens = word_tokenize(summary_text.lower())
    
    # create token-to-index mappings for quick lookup
    source_token_indices = {token: idx for idx, token in enumerate(source_tokens)}
    
    # initialize fragment length lists
    fragment_lengths = []
    current_fragment_length = 0
    
    for token in summary_tokens:
        if token in source_token_indices:
            if current_fragment_length == 0:
                current_fragment_length = 1
            else:
                current_fragment_length += 1
        else:
            if current_fragment_length > 0:
                fragment_lengths.append(current_fragment_length)
                current_fragment_length = 0
    
    if current_fragment_length > 0:
        fragment_lengths.append(current_fragment_length)
    
    return fragment_lengths

def compute_coverage_and_density(source_text, summary_text):
    fragment_lengths = compute_fragments(source_text, summary_text)
    
    # total number of words in the summary
    summary_word_count = len(word_tokenize(summary_text.lower()))
    
    # coverage: Percentage of words in the summary that are part of an extractive fragment
    coverage = sum(fragment_lengths) / summary_word_count if summary_word_count > 0 else 0
    
    # density: Average length of the extractive fragment squared
    density = (sum(length ** 2 for length in fragment_lengths) / summary_word_count
               if summary_word_count > 0 else 0)
    
    return coverage, density

def calculate_average_coverage_and_density(json_file_path):
    coverage_list = []
    density_list = []
    
    # load the dataset from the JSON file
    with open(json_file_path, 'r') as file:
        dataset = json.load(file)
    
    # process each instance in the dataset
    for instance in dataset:
        source_text = instance.get('text', '')
        summary_text = instance.get('summary', '')

        # source_text = instance.get('article', '')
        # summary_text = instance.get('highlights', '')
        
        coverage, density = compute_coverage_and_density(source_text, summary_text)
        
        coverage_list.append(coverage)
        density_list.append(density)
    
    # calculate average coverage and density
    avg_coverage = sum(coverage_list) / len(coverage_list) if coverage_list else 0
    avg_density = sum(density_list) / len(density_list) if density_list else 0
    
    return avg_coverage, avg_density

average_coverage, average_density = calculate_average_coverage_and_density('/Users/lynn/Desktop/thesis/booksum_paragraph-level-summary-alignments/article_collections/train_article.json')
print(f"Average Coverage: {average_coverage:.2f}")
print(f"Average Density: {average_density:.2f}")


Average Coverage: 0.54
Average Density: 1.95


In [3]:
import json
from statistics import mean

def calculate_average_tokens(json_file_path):
    """
    Calculate the average number of tokens in the document and summary fields of a JSON dataset.
    
    Args:
    - json_file_path (str): Path to the JSON file containing the dataset.
    
    Returns:
    - tuple: (average_tokens_document, average_tokens_summary)
    """
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    doc_lengths = []
    sum_lengths = []
    
    for entry in data:
        # Split the document and summary into tokens
        doc_tokens = entry.get('document', '').split()
        sum_tokens = entry.get('summary', '').split()
        
        doc_lengths.append(len(doc_tokens))
        sum_lengths.append(len(sum_tokens))

    avg_tokens_document = mean(doc_lengths) if doc_lengths else 0
    avg_tokens_summary = mean(sum_lengths) if sum_lengths else 0
    
    return avg_tokens_document, avg_tokens_summary

avg_doc_tokens, avg_sum_tokens = calculate_average_tokens('transformed_brahe.json')
print(f'Average number of tokens in document: {avg_doc_tokens:.2f}')
print(f'Average number of tokens in summary: {avg_sum_tokens:.2f}')


Average number of tokens in document: 163.11
Average number of tokens in summary: 33.49


In [11]:
from sklearn.model_selection import train_test_split

# load dataset from JSON file
with open('selected_examples.json', 'r') as file:
    data = json.load(file)

total_examples = len(data)
train_size = 8000 / total_examples
test_size = 1000 / total_examples
valid_size = 1000 / total_examples

# split the data into training, validation, and test sets
train_data, remaining_data = train_test_split(data, test_size=(test_size + valid_size), random_state=42)
test_data, valid_data = train_test_split(remaining_data, test_size=(valid_size / (test_size + valid_size)), random_state=42)


In [14]:
len(valid_data)

 

1000

In [15]:
# save training data to JSON file
with open('train_article.json', 'w') as train_file:
    json.dump(train_data, train_file)

# save testing data to JSON file
with open('test_article.json', 'w') as test_file:
    json.dump(test_data, test_file)

# save validation data to JSON file
with open('valid_article.json', 'w') as valid_file:
    json.dump(valid_data, valid_file)