In [25]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import os
import warnings

In [26]:
# Load the pre-summarization results table from the CSV next to this notebook.
df = pd.read_csv(filepath_or_buffer="./result_df_before_summary.csv")

In [27]:
# Derive a calendar-date column from the epoch-second `unix_timestamp` column.
df['date'] = df['unix_timestamp'].pipe(pd.to_datetime, unit='s').dt.date

In [28]:
# Narrow the frame to the columns the summarization experiments use.
dff = df[
    [
        "request_id",
        "query",
        "question",
        "date",
        "texts",
    ]
]

In [29]:
# Keep only the rows whose request_id starts with "CrisisFACTS-001-".
# - na=False: rows with a missing request_id are excluded instead of putting
#   NaN into the boolean mask (which makes the indexing raise).
# - .copy(): later cells add columns to df1; an explicit copy makes it an
#   independent frame so that assignment does not trigger
#   SettingWithCopyWarning.
df1 = dff[dff['request_id'].str.startswith("CrisisFACTS-001-", na=False)].copy()

In [30]:
# Take the first row's text by position. df1 is a filtered frame, so a
# label-based `[0]` lookup only works if index label 0 survived the filter;
# `.iloc[0]` always returns the first remaining row.
txt = df1["texts"].iloc[0]
# Strip hashtags, http(s) URLs and @-mentions before summarizing.
cleaned_txt = re.sub(r'#\S+|https?://\S+|@\S+', '', txt)
cleaned_txt

'So long everyone   @ Tijuana International Airport  Enmascarado.  . . . . .       I\'m at San Diego International Airport -  in San Diego, CA  I\'m at San Diego International Airport -  in San Diego, CA  La Jolla, CA Via La Jolla Light Newspaper "As planes flying to and from San Diego Airport were rerouted to accommodate high winds on Dec. 7, the San Diego Regional Airport Authority met and voted to i'

In [31]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import nltk
# Fetch the NLTK 'punkt_tab' data (presumably needed by the English
# tokenizer below -- verify); this is a no-op when already up to date.
nltk.download('punkt_tab')

# Extractive baseline: LSA summarization of the cleaned sample text.
# The previous inline comment said "5 sentences", but the call has always
# requested a single sentence; the count is now named so code and comment agree.
sentence_count = 1
parser = PlaintextParser.from_string(cleaned_txt, Tokenizer("english"))
summarizer = LsaSummarizer()
summary = summarizer(parser.document, sentence_count)

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [32]:
# Show the LSA result: a tuple of sumy Sentence objects (one entry here).
summary

(<Sentence: I'm at San Diego International Airport -  in San Diego, CA  I'm at San Diego International Airport -  in San Diego, CA  La Jolla, CA Via La Jolla Light Newspaper "As planes flying to and from San Diego Airport were rerouted to accommodate high winds on Dec. 7, the San Diego Regional Airport Authority met and voted to i>,)

In [51]:
from transformers import pipeline

# Length bounds shared by both abstractive summarizers.
length_bounds = {"max_length": 52, "min_length": 30}

# Load the two XSum checkpoints being compared: BART-large and Pegasus.
bart_summarizer = pipeline(task="summarization", model="facebook/bart-large-xsum")
pega_summarizer = pipeline(task="summarization", model="google/pegasus-xsum")

# Summarize the same cleaned sample text with each model.
summary1 = bart_summarizer(cleaned_txt, **length_bounds)
summary2 = pega_summarizer(cleaned_txt, **length_bounds)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
# Print the BART summary first, then the Pegasus one, for side-by-side reading.
for result in (summary1, summary2):
    print(result[0]['summary_text'])

An airport in Tijuana, Mexico, has reopened after being closed for more than 24 hours due to high winds on Christmas Eve and Christmas Day.
High winds have forced the closure of San Diego's international airport for the second time in a week, with flights diverted to Tijuana, Mexico.


In [54]:
from tqdm import tqdm

def clean_and_summarize(text, summarizer, max_length=52, min_length=30):
    """Strip social-media noise from ``text`` and summarize what remains.

    Hashtags, http(s) URLs and @-mentions are removed with a regex, then the
    cleaned text is passed to ``summarizer``.

    Args:
        text: Raw text to summarize. Non-string values (e.g. the NaN pandas
            uses for missing cells) are treated as "nothing to summarize".
        summarizer: Callable invoked as
            ``summarizer(text, max_length=..., min_length=..., truncation=True)``
            and returning a list whose first item is a dict with a
            ``'summary_text'`` key (the shape Hugging Face summarization
            pipelines return).
        max_length: Upper length bound forwarded to ``summarizer``.
        min_length: Lower length bound forwarded to ``summarizer``.

    Returns:
        The summary string, or ``""`` when ``text`` is not a string or
        contains nothing but whitespace after cleaning.
    """
    # Missing cells arrive as float NaN / None; re.sub would raise TypeError.
    if not isinstance(text, str):
        return ""

    cleaned_txt = re.sub(r'#\S+|https?://\S+|@\S+', '', text)

    # With nothing left to summarize, skip the model call entirely rather
    # than let it generate text from an empty input.
    if not cleaned_txt.strip():
        return ""

    # truncation=True makes the pipeline clip inputs that exceed the model's
    # maximum input length instead of failing on them; inputs that already
    # fit are unaffected.
    summary = summarizer(
        cleaned_txt,
        max_length=max_length,
        min_length=min_length,
        truncation=True,
    )
    return summary[0]['summary_text']


# Snapshot resident memory (MB) and a monotonic clock before the run.
# NOTE(review): the RSS delta only captures net growth of this process between
# the two snapshots; it is not a peak-memory measurement.
process = psutil.Process(os.getpid())
initial_memory = process.memory_info().rss / (1024 * 1024)
start_time = time.perf_counter()

# tqdm.pandas() registers `progress_apply`; plain `.apply` never shows the
# bar, so use `progress_apply` to actually get progress for this long run.
tqdm.pandas(desc="Processing texts")
# `assign` builds a new frame instead of writing a column into what may be a
# slice of `dff`, which avoids SettingWithCopyWarning.
df1 = df1.assign(
    bart_summary=df1['texts'].progress_apply(
        lambda x: clean_and_summarize(x, bart_summarizer)
    )
)

end_time = time.perf_counter()
final_memory = process.memory_info().rss / (1024 * 1024)

elapsed_time = end_time - start_time
memory_used = final_memory - initial_memory

# Split the elapsed whole seconds into minutes and leftover seconds.
minutes, seconds = divmod(int(elapsed_time), 60)

print(f"Time spent: {minutes} minutes and {seconds} seconds")
print(f"Memory used: {memory_used:.2f} MB")

Your max_length is set to 52, but your input_length is only 47. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 52, but your input_length is only 20. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=10)
Your max_length is set to 52, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max_length is set to 52, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)
Your max

Time spent: 42 minutes and 40 seconds
Memory used: 0.25 MB


In [55]:
# Inspect the filtered frame, which now carries the `bart_summary` column.
df1

Unnamed: 0,request_id,query,question,date,texts,bart_summary
0,CrisisFACTS-001-r10,airport closed,Have airports closed,2017-12-14,So long everyone @ Tijuana International Air...,"An airport in Tijuana, Mexico, has reopened af..."
1,CrisisFACTS-001-r10,rail closed,Have railways closed,2017-12-14,The American Red Cross of San Diego/Imperial C...,The Camp Fire in San Diego has been fully cont...
2,CrisisFACTS-001-r10,water supply,Have water supplied been contaminated,2017-12-14,A UH-1Y Venom refills its bucket firefighting ...,Firefighters in California have been using wat...
3,CrisisFACTS-001-r10,firefighters on-duty,How many firefighters are active,2017-12-14,All 17 of the firefighters on the strike team ...,One of the firefighters killed in the Camp fir...
4,CrisisFACTS-001-r10,evacuated,How many people are affected,2017-12-14,"We evacuated last Thursday, around 1pm before ...",My wife and I were forced to evacuate our home...
...,...,...,...,...,...,...
429,CrisisFACTS-001-r9,wind speed,Where are wind speeds expected to be high,2017-12-13,Northeast winds are expected to reach speeds o...,The National Weather Service has issued a yell...
430,CrisisFACTS-001-r9,helicopters,Are helicopters available,2017-12-13,The #SanDiego City Council has unanimously vot...,Firefighters in Los Angeles will soon be able ...
431,CrisisFACTS-001-r9,homes destroyed damaged,Where have homes been damaged or destroyed,2017-12-13,"So far, 701 homes, two apartment complexes and...",The number of homes destroyed by wildfires in ...
432,CrisisFACTS-001-r9,acres per hour,How quickly is the fire spreading,2017-12-13,Cal Fire reports that CALFIRE & the US Forest ...,The number of wildfires burning in the US stat...
