# Text Summarisation

Here we use Ray to process the text summaries in parallel. Summarisation models are computationally intensive, so spreading the work across several worker processes reduces the overall run time.

In [1]:
import ray
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

# Number of leading rows of the dataset to summarise in this run
rows_to_process = 10_000

# Start a local Ray runtime with the dashboard enabled;
# ignore_reinit_error lets this cell be re-run without raising
ray.init(include_dashboard=True, ignore_reinit_error=True)

@ray.remote
def summarize_text(text, tokenizer_ref, model_ref, max_length=450, min_length=100, do_sample=False):
    """Summarise a single piece of text as a Ray remote task.

    Args:
        text: The text to summarise. Non-string values (for example a missing
            CSV cell that pandas loads as NaN) are returned unchanged.
        tokenizer_ref: Tokenizer handed to ``pipeline(..., tokenizer=...)``.
            Ray resolves ObjectRef arguments before the task body runs, so
            despite the name this is expected to be the tokenizer itself.
        model_ref: Model handed to ``pipeline(..., model=...)``; resolved by
            Ray in the same way as ``tokenizer_ref``.
        max_length: Forwarded to the summarizer call.
        min_length: Forwarded to the summarizer call.
        do_sample: Forwarded to the summarizer call.

    Returns:
        ``text`` itself when it is not a string or contains at most 300
        whitespace-separated words; otherwise the ``'summary_text'`` entry of
        the first pipeline result.
    """
    # Reviews at or below this many words are returned without summarising
    short_text_word_limit = 300

    # Missing values cannot be split or summarised; pass them through as-is
    if not isinstance(text, str):
        return text

    # Decide whether to summarise *before* building the pipeline, so that
    # short texts do not pay the pipeline construction cost
    word_count = len(text.split())
    if word_count <= short_text_word_limit:
        return text

    # Create a summarization pipeline
    summarizer = pipeline("summarization", model=model_ref, tokenizer=tokenizer_ref)

    # Summarize the text; truncation=True lets over-long inputs be truncated
    # rather than rejected
    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=do_sample,
        truncation=True,
    )

    # Return the summarized text
    return result[0]['summary_text']



2024-09-29 08:29:12,390	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
[36m(summarize_text pid=30856)[0m Your max_length is set to 512, but your input_length is only 482. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=241)
[36m(summarize_text pid=18460)[0m Your max_length is set to 512, but your input_length is only 504. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=252)
[36m(summarize_text pid=2556)[0m Your max_length is set to 512, but your input_length is only 509. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=254)

Once we have initialised Ray and defined the job it should run with `@ray.remote`, we can load the model and generate the summaries.

In [2]:
# Local directory holding the BART (large, CNN) checkpoint used for summarization
model_name = "../models/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the tokenizer and model once in Ray's object store so every task
# receives a reference instead of a fresh copy of the arguments
tokenizer_ref = ray.put(tokenizer)
model_ref = ray.put(model)

# Read the raw reviews (adjust the path to where your dataset lives)
df_full = pd.read_csv('../data/raw/Books_rating.csv')

# Keep only the first `rows_to_process` rows for this example
df = df_full.iloc[:rows_to_process]

# Launch one remote summarization task per review text
review_texts = df['review/text'].tolist()
futures = [
    summarize_text.remote(review, tokenizer_ref, model_ref)
    for review in review_texts
]

# Block until every task has finished; results come back in submission order
summaries = ray.get(futures)



We then save the summaries to a CSV file alongside the `Id` column, which provides a key for joining the summaries back to the original data.

In [3]:
# Build a two-column frame: the 'Id' of each processed row plus its summary.
# Selecting the column keeps the original row index and leaves `df` unmodified.
df_summary = df[['Id']].assign(summary=summaries)

# NOTE(review): the file name ends in `_csv` rather than `.csv` — confirm
# whether this is intended before other notebooks depend on the path
output_path = f'../data/processed/Books_bart-large-summary_{rows_to_process}_rows_max_length_csv'

# Write without the index so the file holds only the Id and summary columns
df_summary.to_csv(output_path, index=False)

# Release the Ray workers now that all results have been written
ray.shutdown()
