In [12]:
import pandas as pd
import numpy as np
import re
import time
import psutil
import os

In [2]:
df = pd.read_csv("./result_df_before_summary.csv")

In [3]:
df['date'] = pd.to_datetime(df['unix_timestamp'], unit='s').dt.date

In [4]:
dff = df[["request_id","query","question","date","texts"]]

In [5]:
df1 = dff[dff['request_id'].str.startswith("CrisisFACTS-001-")]

In [6]:
txt = df1["texts"][0]
cleaned_txt = re.sub(r'#\S+|https?://\S+|@\S+', '', txt)
cleaned_txt

'So long everyone   @ Tijuana International Airport  Enmascarado.  . . . . .       I\'m at San Diego International Airport -  in San Diego, CA  I\'m at San Diego International Airport -  in San Diego, CA  La Jolla, CA Via La Jolla Light Newspaper "As planes flying to and from San Diego Airport were rerouted to accommodate high winds on Dec. 7, the San Diego Regional Airport Authority met and voted to i'

In [7]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
import nltk
nltk.download('punkt_tab')

parser = PlaintextParser.from_string(cleaned_txt, Tokenizer("english"))
summarizer = LsaSummarizer()
summary = summarizer(parser.document, 1)  # 5 sentences

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
summary

(<Sentence: I'm at San Diego International Airport -  in San Diego, CA  I'm at San Diego International Airport -  in San Diego, CA  La Jolla, CA Via La Jolla Light Newspaper "As planes flying to and from San Diego Airport were rerouted to accommodate high winds on Dec. 7, the San Diego Regional Airport Authority met and voted to i>,)

In [9]:
from transformers import pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
summary = summarizer(cleaned_txt, max_length=130, min_length=30, do_sample=False)

Your max_length is set to 130, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)


In [10]:
summary[0]['summary_text']

'The San Diego Regional Airport Authority met and voted to stop flying to and from Tijuana. The decision was made after planes were rerouted to accommodate high winds.'

In [16]:
def clean_and_summarize(text):
    cleaned_txt = re.sub(r'#\S+|https?://\S+|@\S+', '', text)
    summary = summarizer(cleaned_txt, max_length=len(cleaned_txt), min_length=30, do_sample=False)
    return summary[0]['summary_text']


process = psutil.Process(os.getpid())
initial_memory = process.memory_info().rss / (1024 * 1024)
start_time = time.time()

df1['summary'] = df1['texts'].apply(clean_and_summarize)

end_time = time.time()
final_memory = process.memory_info().rss / (1024 * 1024)

elapsed_time = end_time - start_time
memory_used = final_memory - initial_memory 

minutes = int(elapsed_time // 60) 
seconds = int(elapsed_time % 60)

print(f"Time spent: {minutes} minutes and {seconds} seconds")
print(f"Memory used: {memory_used:.2f} MB")

Your max_length is set to 402, but your input_length is only 105. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=52)
Your max_length is set to 524, but your input_length is only 125. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=62)
Your max_length is set to 526, but your input_length is only 129. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=64)
Your max_length is set to 481, but your input_length is only 101. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)


Time spent: 14 minutes and 19 seconds
Memory used: 86.94 MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['summary'] = df1['texts'].apply(clean_and_summarize)


In [18]:
df1

Unnamed: 0,request_id,query,question,date,texts,summary
0,CrisisFACTS-001-r10,airport closed,Have airports closed,2017-12-14,So long everyone @ Tijuana International Air...,The San Diego Regional Airport Authority met a...
1,CrisisFACTS-001-r10,rail closed,Have railways closed,2017-12-14,The American Red Cross of San Diego/Imperial C...,The Palomar College shelter was officially clo...
2,CrisisFACTS-001-r10,water supply,Have water supplied been contaminated,2017-12-14,A UH-1Y Venom refills its bucket firefighting ...,A UH-1Y Venom refills its bucket firefighting ...
3,CrisisFACTS-001-r10,firefighters on-duty,How many firefighters are active,2017-12-14,All 17 of the firefighters on the strike team ...,All 17 of the firefighters on the strike team ...
4,CrisisFACTS-001-r10,evacuated,How many people are affected,2017-12-14,"We evacuated last Thursday, around 1pm before ...",Gas leak prompts evacuations of several homes ...
...,...,...,...,...,...,...
429,CrisisFACTS-001-r9,wind speed,Where are wind speeds expected to be high,2017-12-13,Northeast winds are expected to reach speeds o...,Northeast winds are expected to reach speeds o...
430,CrisisFACTS-001-r9,helicopters,Are helicopters available,2017-12-13,The #SanDiego City Council has unanimously vot...,The City Council has unanimously voted to spen...
431,CrisisFACTS-001-r9,homes destroyed damaged,Where have homes been damaged or destroyed,2017-12-13,"So far, 701 homes, two apartment complexes and...",San Diego County is under a red flag warning. ...
432,CrisisFACTS-001-r9,acres per hour,How quickly is the fire spreading,2017-12-13,Cal Fire reports that CALFIRE & the US Forest ...,A series of wildfires the last two weeks scorc...
