In [1]:
import pandas as pd
from transformers import pipeline
from tqdm.notebook import tqdm
import numpy as np
# Register tqdm with pandas so that `progress_apply` (used further down when
# summarizing the BODY column) is available and renders a notebook progress bar.
tqdm.pandas()

In [2]:
def summarize_with_error_handling(summarization_pipeline, input_text, max_char:int,
                                  max_length:int=200, min_length:int=100):
    """Summarize ``input_text`` on a best-effort basis, never raising.

    Args:
        summarization_pipeline: Callable invoked as
            ``summarization_pipeline(text, max_length=..., min_length=..., do_sample=False)``
            that returns a sequence whose first item is a mapping with a
            ``'summary_text'`` key.
        input_text: Text to summarize. ``None`` and float values (e.g. the NaN
            pandas uses for empty CSV cells) are treated as "no text".
        max_char: Only the first ``max_char`` characters of ``input_text`` are
            passed to the pipeline.
        max_length: Forwarded to the pipeline as ``max_length``. Defaults to 200.
        min_length: Forwarded to the pipeline as ``min_length``. Defaults to 100.

    Returns:
        The ``'summary_text'`` of the first pipeline result, or ``""`` when
        there is no text to summarize or the pipeline call fails.
    """
    # A missing body cannot be sliced; return early instead of triggering (and
    # printing) a TypeError for every empty row.
    if input_text is None or isinstance(input_text, float):
        return ""
    try:
        truncated_text = input_text[:max_char]
        # Nothing meaningful to summarize: skip the (expensive) model call.
        if isinstance(truncated_text, str) and not truncated_text.strip():
            return ""
        results = summarization_pipeline(
            truncated_text, max_length=max_length, min_length=min_length, do_sample=False
        )
        return results[0]['summary_text']
    except Exception as e:
        # Deliberate best-effort: one bad row must not abort a whole batch run.
        print("The following error occured, returned empty string")
        print(e)
        return ""

In [3]:
# Load the silver-layer snapshot, then print its columns followed by the row
# counts per topic and per article hierarchy level.
data_silver = pd.read_csv(
    '../temp_training/medallion/silver_2023-10-23_08-33-20.csv'
)
print(data_silver.columns)
for group_column in ['TOPIC', 'ARTICLE_HIERARCHY']:
    print(data_silver.groupby([group_column]).size())

Index(['INSERT_DATETIME', 'URI', 'TOPIC', 'TOPIC_URI', 'EVENTURI', 'SOURCE',
       'URL', 'TITLE', 'BODY', 'RELEVANCE_CLASS', 'METADATA',
       'ARTICLE_HIERARCHY'],
      dtype='object')
TOPIC
air               69
forced_labor       1
warehouse_fire     1
dtype: int64
ARTICLE_HIERARCHY
main_article       23
related_article    48
dtype: int64


In [5]:
# main_article_silver = data_silver[data_silver['ARTICLE_HIERARCHY']=='main_article']
# main_article_silver.groupby(['TOPIC']).size()

In [4]:
# Build the summarization pipeline from the locally stored bart-large-cnn checkpoint.
loaded_summarizer = pipeline(
    task="summarization",
    model='../models/pretrained/bart-large-cnn/',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
# Work on an independent (deep) copy so the loaded silver frame stays untouched.
sample_df = data_silver.copy()

In [7]:
# Summarize each article body (only its first 2000 characters are used),
# showing a progress bar while the model runs.
body_summaries = sample_df['BODY'].progress_apply(
    lambda article_body: summarize_with_error_handling(loaded_summarizer, article_body, 2000)
)
sample_df['BODY_SUMMARY'] = body_summaries

# Write the frame, including the new summary column, to the gold-layer CSV.
sample_df.to_csv('gold_NEWS_API_ML_LABELLED_summary.csv')

  0%|          | 0/71 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 194. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 178. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=89)
Your max_length is set to 200, but your input_length is only 161. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=80)
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
