In [26]:
import pandas as pd
from transformers import pipeline
from tqdm.notebook import tqdm
import numpy as np
tqdm.pandas()

In [27]:
def summarize_with_error_handling(summarization_pipeline, input_text, max_char:int,
                                  max_length:int=200, min_length:int=100):
    """Summarize ``input_text`` and return "" instead of raising on failure.

    Parameters
    ----------
    summarization_pipeline : callable
        Called as ``summarization_pipeline(text, max_length=..., min_length=...,
        do_sample=False)``; must return a sequence whose first element is a
        mapping with a ``'summary_text'`` key.
    input_text : str
        Text to summarize. Only the first ``max_char`` characters are passed on.
        Non-string values (e.g. ``None`` or a NaN from an empty DataFrame cell)
        and blank strings yield "" without calling the pipeline.
    max_char : int
        Maximum number of characters of ``input_text`` sent to the pipeline.
    max_length, min_length : int
        Length bounds forwarded to the pipeline (defaults: 200 and 100).

    Returns
    -------
    str
        The generated summary, or "" if the input was unusable or the
        pipeline raised.
    """
    # Missing bodies come through as NaN/None; slicing them would only fail with
    # a TypeError inside the try-block, so short-circuit them explicitly.
    if not isinstance(input_text, str) or not input_text.strip():
        return ""
    try:
        outputs = summarization_pipeline(
            input_text[:max_char],
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
        )
        return outputs[0]['summary_text']
    except Exception as e:
        # Best effort by design: one bad article must not abort a long batch run.
        print("The following error occured, returned empty string")
        print(e)
        return ""

In [28]:
# Load the silver-layer CSV, then print its columns and the row count per
# TOPIC and per RELEVANCE_CLASS.
data_silver = pd.read_csv('../temp_training/medallion/silver_2023-10-06_02-39-12.csv')
print(data_silver.columns)
for _group_key in ('TOPIC', 'RELEVANCE_CLASS'):
    print(data_silver.groupby([_group_key]).size())
# print(data_silver.groupby(['ARTICLE_HIERARCHY']).size())

Index(['INSERT_DATETIME', 'URI', 'TOPIC', 'TOPIC_URI', 'TITLE', 'BODY', 'URL',
       'RELEVANCE_CLASS'],
      dtype='object')
TOPIC
air                     169
marine                  301
material                121
protest_riot            460
road                     22
strike                  179
train                   201
warehouse_fire          213
weather_cyclone          13
weather_generalnews     215
weather_naturalevent    199
dtype: int64
RELEVANCE_CLASS
0    1404
1     689
dtype: int64


In [29]:
# Check which articles were already summarized in yesterday's run
# and exclude them from today's run.
import os
import pandas as pd

# Collect the per-chunk CSVs written by the earlier run into one frame, so that
# their URIs can be excluded from the next summarization pass.
root_dir = './chunk_summary_output'
df_list = []
# os.listdir returns entries in arbitrary order; sort for a reproducible row order.
for file_name in sorted(os.listdir(root_dir)):
    # Skip anything that is not a CSV (e.g. an .ipynb_checkpoints folder), and skip
    # the aggregate 'total_result.csv' that this notebook writes next to its chunk
    # files: loading it together with the chunks would duplicate every row.
    if not file_name.endswith('.csv') or file_name == 'total_result.csv':
        continue
    file_path = os.path.join(root_dir, file_name)
    df_chunk = pd.read_csv(file_path)
    df_list.append(df_chunk)
# pd.concat raises ValueError on an empty list; fall back to an empty frame that
# still exposes the 'URI' column used for filtering.
df_combined = pd.concat(df_list) if df_list else pd.DataFrame(columns=['URI'])

In [30]:
# Earlier variants, kept for reference:
#   sample_df = data_silver.sample(n=5, random_state=1)
#   sample_df = data_silver.copy(deep=True)
# Keep only the rows whose URI is absent from the already-summarized frame.
sample_df = data_silver.loc[~data_silver['URI'].isin(df_combined['URI'])]

# Report how much work is left.
print(f"total data size: {data_silver.shape}")
print(f"summarized data size: {df_combined.shape}")
print(f"remaining data: {sample_df.shape}")


total data size: (2093, 8)
summarized data size: (1350, 9)
remaining data: (743, 8)


In [31]:
# Build the summarization pipeline from the locally stored model directory.
loaded_summarizer = pipeline(
    task="summarization",
    model='../models/pretrained/bart-large-cnn/',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
import warnings 
from datetime import datetime

# Ignore Python warnings during the long-running loop below.
# NOTE: this filter is process-wide and stays active for every later cell.
warnings.filterwarnings('ignore')

# Work in fixed-size chunks and persist each finished chunk, so an interrupted
# run loses at most one chunk of summaries.
chunk_size = 50
output_dir = './chunk_summary_output_02'
# Create the target directory up front; otherwise the first to_csv call fails
# only after a whole chunk has already been summarized.
os.makedirs(output_dir, exist_ok=True)

# Positional row slices of the remaining articles.
chunks = [sample_df.iloc[i:i + chunk_size] for i in range(0, len(sample_df), chunk_size)]
# Summarized chunks, in processing order.
results = []

for i, chunk in enumerate(tqdm(chunks)):
    # assign() returns a new frame, so the summary column is never written into
    # a slice of sample_df (which is what triggers SettingWithCopyWarning).
    chunk = chunk.assign(
        BODY_SUMMARY=chunk['BODY'].progress_apply(
            lambda body_text: summarize_with_error_handling(loaded_summarizer, body_text, 2000)
        )
    )
    chunk.to_csv(os.path.join(output_dir, f'chunk_{i+1}.csv'), index=False)
    results.append(chunk)

# pd.concat raises ValueError on an empty list (nothing left to summarize);
# in that case keep an empty frame with the expected BODY_SUMMARY column.
if results:
    result_df = pd.concat(results)
else:
    result_df = sample_df.assign(BODY_SUMMARY='')
result_df.to_csv(os.path.join(output_dir, 'total_result.csv'), index=False)

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 189. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Your max_length is set to 200, but your input_length is only 158. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 156. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=78)
Your max_length is set to 200, but your input_length is only 138. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=69)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 189. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=94)
Your max_length is set to 200, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 191. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=95)
Your max_length is set to 200, but your input_length is only 195. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 183. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=91)
Your max_length is set to 200, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 175. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=87)
Your max_length is set to 200, but your input_length is only 148. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=74)
Your max_length is set to 200, but your input_length is only 111. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=55)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 180. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 200, but your input_length is only 160. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=80)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 115. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=57)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 171. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 142. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=71)
Your max_length is set to 200, but your input_length is only 122. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=61)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but your input_length is only 166. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=83)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 76. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=38)
Your max_length is set to 200, but your input_length is only 149. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=74)
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)
Your max_length is set to 200, but your input_length is only 187. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 187. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)
Your max_length is set to 200, but your input_length is only 159. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 184. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=92)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 159. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=79)
Your max_length is set to 200, but your input_length is only 194. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)
Your max_length is set to 200, but your input_length is only 147. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=73)
Your max_length is set to 200, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 124. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)


  0%|          | 0/50 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 127. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=63)
Your max_length is set to 200, but your input_length is only 170. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=85)
Your max_length is set to 200, but your input_length is only 151. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=75)
Your max_length is set to 200, but your input_length is only 186. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)


  0%|          | 0/43 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 165. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=82)
Your max_length is set to 200, but your input_length is only 116. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
Your max_length is set to 200, but your input_length is only 84. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=42)
Your max_length is set to 200, but your input_length is only 172. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=86)
Y

In [34]:
# Preview the first five summarized rows.
result_df.head(n=5)

Unnamed: 0,INSERT_DATETIME,URI,TOPIC,TOPIC_URI,TITLE,BODY,URL,RELEVANCE_CLASS,BODY_SUMMARY
1350,2023-09-12 04:02:07.878,7433169857,protest_riot,985cec5c-091d-44bc-a739-8d49b836f7ce,Tunisians defy protest ban and rally against p...,Hundreds of Tunisians rallied in defiance agai...,https://www.euronews.com/2023/03/06/tunisians-...,0,Protesters demand the release of more than 20 ...
1351,2023-09-12 04:02:07.878,7433164479,material,e5e76cd2-23a4-43a3-b1db-6c98860feaba,WRAPUP 1-South African food producers expect m...,"AVI, which also sells clothes, managed to clin...",https://www.devdiscourse.com/article/headlines...,0,RCL Foods and AVI said demand was expected to ...
1352,2023-09-12 04:02:07.878,7433148494,material,e5e76cd2-23a4-43a3-b1db-6c98860feaba,Polestar Automotive is viewed cautiously by De...,Deutsche Bank kept a cautious view on Polestar...,https://seekingalpha.com/news/3944373-polestar...,0,Deutsche Bank kept a cautious view on Polestar...
1353,2023-09-12 04:02:07.878,7433117815,train,3b27edc7-6ce6-4216-bc47-5693f57b052e,"Greek railway strikes continue, EU pledges hel...",Wagner head warns of risk to Bakhmut positions...,https://www.dailysabah.com/world/europe/greek-...,1,Greek rail workers go on strike for a sixth da...
1354,2023-09-12 04:02:07.878,7433127646,train,3b27edc7-6ce6-4216-bc47-5693f57b052e,Sorrow gives way to anger in Greece's worst ev...,"Larissa, Greece - Relatives of those killed in...",https://www.aljazeera.com/news/2023/3/6/sorrow...,1,Greece's worst-ever train accident has left 57...


In [7]:
# Legacy single-pass summarization (appears superseded by the chunked loop above); kept commented out for reference.
# sample_df['BODY_SUMMARY'] = sample_df['BODY'].progress_apply(
#     lambda body_text: summarize_with_error_handling(loaded_summarizer, body_text, 2000))

# sample_df.to_csv('gold_NEWS_API_ML_LABELLED_summary_02.csv')

  0%|          | 0/29 [00:00<?, ?it/s]

Your max_length is set to 200, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 200, but your input_length is only 180. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=90)
Your max_length is set to 200, but your input_length is only 135. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=67)
Your max_length is set to 200, but your input_length is only 187. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=93)


### Combining all summarized chunks

In [35]:
# Reload the chunk files of the earlier run and pair them with the summaries
# produced in this session.
root_dir = './chunk_summary_output'
df_list = []
# os.listdir returns entries in arbitrary order; sort for a reproducible row order.
for file_name in sorted(os.listdir(root_dir)):
    # Only per-chunk CSVs belong here: skip non-CSV entries (e.g. an
    # .ipynb_checkpoints folder) and the aggregate 'total_result.csv', which
    # would duplicate every row if it were loaded together with the chunks.
    if not file_name.endswith('.csv') or file_name == 'total_result.csv':
        continue
    file_path = os.path.join(root_dir, file_name)
    df_chunk = pd.read_csv(file_path)
    df_list.append(df_chunk)
# pd.concat raises ValueError on an empty list; fall back to an empty frame.
df_combined = pd.concat(df_list) if df_list else pd.DataFrame()
# Independent copy of this session's results, so later edits do not touch result_df.
df_combined_02 = result_df.copy(deep=True)

In [36]:
# Stack the earlier run's summaries on top of this session's summaries.
# ignore_index=True replaces the inherited row labels (restarted in every chunk
# file read from CSV, and carried over from the silver frame in result_df) with
# one clean, unique RangeIndex.
df_news_api_training = pd.concat([df_combined, df_combined_02], ignore_index=True)

In [37]:
# Display the combined frame (df_combined followed by df_combined_02).
df_news_api_training

Unnamed: 0,INSERT_DATETIME,URI,TOPIC,TOPIC_URI,TITLE,BODY,URL,RELEVANCE_CLASS,BODY_SUMMARY
0,2023-09-12 04:09:07.586,7514939005,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,Concrete Block Plant Controversy Builds,Proponents and opponents of a controversial zo...,https://stjohntradewinds.com/concrete-block-pl...,0,An online petition against the concrete block ...
1,2023-09-12 04:09:07.586,7514909363,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,Investigations underway after 2 pest control w...,"POMPANO BEACH, FLA. (WSVN) - Police are invest...",https://wsvn.com/news/local/broward/investigat...,1,Three employees of a pesticide company went to...
2,2023-09-12 04:09:07.586,7514805717,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,Employee treated for smoke inhalation after fi...,An employee of a document destruction warehous...,https://www.chicagotribune.com/suburbs/aurora-...,1,A fire broke out at a document destruction war...
3,2023-09-12 04:09:07.586,7514763894,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,Zero toxic compounds identified after Wissinom...,PHILADELPHIA (CBS) -- No toxic compounds were ...,https://www.cbsnews.com/philadelphia/news/phil...,1,The city collected an air sample in the vicini...
4,2023-09-12 04:09:07.586,7514549341,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,1 dead in suburban Chicago petroleum plant exp...,Fox News Flash top headlines are here. Check o...,https://www.foxnews.com/us/1-dead-suburban-chi...,1,The explosion occurred at Seneca Petroleum in ...
...,...,...,...,...,...,...,...,...,...
2088,2023-09-12 04:02:07.878,7398330361,protest_riot,985cec5c-091d-44bc-a739-8d49b836f7ce,Delhi street vendors protest against civic bod...,NEW DELHI: Hundreds of street vendors from acr...,https://timesofindia.indiatimes.com/city/delhi...,0,Hundreds of street vendors from across the cap...
2089,2023-09-12 04:02:07.878,7398254799,protest_riot,985cec5c-091d-44bc-a739-8d49b836f7ce,"Trains packed, thousands of vehicles join conv...",A packed train from Tel Aviv to Jerusalem as p...,https://www.timesofisrael.com/liveblog_entry/t...,0,Tens of thousands of people are participating ...
2090,2023-09-12 04:02:07.878,7398231770,strike,3c6e51c0-87a5-4343-9fd9-76b1edd597cf,Easter holiday warning as more airport staff s...,FAMILIES heading abroad during the Easter holi...,https://www.thesun.co.uk/travel/21365752/easte...,0,"More than 3,000 workers are calling to strike ..."
2091,2023-09-12 04:02:07.878,7397746307,air,f5e25ca7-4d7a-475e-89ab-7d45d5fa1365,"Auckland Airport warns worst is yet to come, 4...",Travellers are being warned about the importan...,https://www.nzherald.co.nz/business/cyclone-ga...,1,80 per cent of international flights through A...


In [40]:
# The "feature" extracted from the silver dataset is BODY_SUMMARY, which is used
# for zero-shot / few-shot learning. RELEVANCE_CLASS is placed last as a convention.
schema_gold = ['INSERT_DATETIME', 'URI', 'TOPIC', 'TOPIC_URI', 'EVENTURI', 'SOURCE', 'URL', 'TITLE', 'BODY', 'BODY_SUMMARY', 'METADATA', 'ARTICLE_HIERARCHY', 'RELEVANCE_CLASS']

# Work on a copy so the combined training frame itself stays untouched.
df_ingestion = df_news_api_training.copy(deep=True)
# Snapshot of the columns present before any schema column is added.
data_columns = df_ingestion.columns

# Any schema column missing from the source frame is created as an empty string.
for column in schema_gold:
    if column not in data_columns:
        df_ingestion[column] = ''

# Rearrange to the schema's column order and KEEP the result. Previously the
# reordered frame was only displayed, so the frame saved afterwards was still
# in the original column order.
df_ingestion = df_ingestion[schema_gold]
df_ingestion


Unnamed: 0,INSERT_DATETIME,URI,TOPIC,TOPIC_URI,EVENTURI,SOURCE,URL,TITLE,BODY,BODY_SUMMARY,METADATA,ARTICLE_HIERARCHY,RELEVANCE_CLASS
0,2023-09-12 04:09:07.586,7514939005,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,,,https://stjohntradewinds.com/concrete-block-pl...,Concrete Block Plant Controversy Builds,Proponents and opponents of a controversial zo...,An online petition against the concrete block ...,,,0
1,2023-09-12 04:09:07.586,7514909363,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,,,https://wsvn.com/news/local/broward/investigat...,Investigations underway after 2 pest control w...,"POMPANO BEACH, FLA. (WSVN) - Police are invest...",Three employees of a pesticide company went to...,,,1
2,2023-09-12 04:09:07.586,7514805717,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,,,https://www.chicagotribune.com/suburbs/aurora-...,Employee treated for smoke inhalation after fi...,An employee of a document destruction warehous...,A fire broke out at a document destruction war...,,,1
3,2023-09-12 04:09:07.586,7514763894,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,,,https://www.cbsnews.com/philadelphia/news/phil...,Zero toxic compounds identified after Wissinom...,PHILADELPHIA (CBS) -- No toxic compounds were ...,The city collected an air sample in the vicini...,,,1
4,2023-09-12 04:09:07.586,7514549341,warehouse_fire,eb7688e0-0c9f-4e9a-a2b8-2ba000a8f11b,,,https://www.foxnews.com/us/1-dead-suburban-chi...,1 dead in suburban Chicago petroleum plant exp...,Fox News Flash top headlines are here. Check o...,The explosion occurred at Seneca Petroleum in ...,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2088,2023-09-12 04:02:07.878,7398330361,protest_riot,985cec5c-091d-44bc-a739-8d49b836f7ce,,,https://timesofindia.indiatimes.com/city/delhi...,Delhi street vendors protest against civic bod...,NEW DELHI: Hundreds of street vendors from acr...,Hundreds of street vendors from across the cap...,,,0
2089,2023-09-12 04:02:07.878,7398254799,protest_riot,985cec5c-091d-44bc-a739-8d49b836f7ce,,,https://www.timesofisrael.com/liveblog_entry/t...,"Trains packed, thousands of vehicles join conv...",A packed train from Tel Aviv to Jerusalem as p...,Tens of thousands of people are participating ...,,,0
2090,2023-09-12 04:02:07.878,7398231770,strike,3c6e51c0-87a5-4343-9fd9-76b1edd597cf,,,https://www.thesun.co.uk/travel/21365752/easte...,Easter holiday warning as more airport staff s...,FAMILIES heading abroad during the Easter holi...,"More than 3,000 workers are calling to strike ...",,,0
2091,2023-09-12 04:02:07.878,7397746307,air,f5e25ca7-4d7a-475e-89ab-7d45d5fa1365,,,https://www.nzherald.co.nz/business/cyclone-ga...,"Auckland Airport warns worst is yet to come, 4...",Travellers are being warned about the importan...,80 per cent of international flights through A...,,,1


In [41]:
# Persist the gold-layer training set.
gold_path = '../temp_training/medallion/gold/NEWS_API_TRAINING_GOLD.csv'
# Make sure the target directory exists before writing.
os.makedirs(os.path.dirname(gold_path), exist_ok=True)
# index=False matches the other to_csv calls in this notebook: the row labels
# are leftovers from concatenation and would otherwise be written as an
# unnamed extra column.
df_ingestion.to_csv(gold_path, index=False)