In [25]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [26]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load the model and its tokenizer from one checkpoint name so the two can
# never point at different checkpoints by accident.
PEGASUS_CHECKPOINT = 'google/pegasus-cnn_dailymail'
model = PegasusForConditionalGeneration.from_pretrained(PEGASUS_CHECKPOINT)
tokenizer = PegasusTokenizer.from_pretrained(PEGASUS_CHECKPOINT)

text = '''The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.                                       
'''
# Smoke test: summarize the sample passage with default generation settings.
encoded = tokenizer(text, return_tensors='pt')
input_ids = encoded['input_ids']
output = model.generate(input_ids)
# batch_decode returns one string per generated sequence; only one was requested.
summary = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

print(summary)



The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris.<n>Its base is square, measuring 125 metres (410 ft) on each side.<n>During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world.


In [27]:
# Read the labelled CSV (cp1252-encoded) and discard any row that has a
# missing value, then report the resulting (rows, columns).
data = pd.read_csv("data/labeled01.csv", encoding='cp1252').dropna()
print(data.shape)

(955, 2)


In [28]:
# Preview the first rows to eyeball the `resolution` and `summary` columns.
data.head()

Unnamed: 0,resolution,summary
0,resolution of the miami city commission accept...,The Miami City Commission accepted bids from r...
1,resolution of the miami city commission findin...,The Miami City Commission found that the COVID...
2,resolution of the miami city commission author...,Miami City Commission authorizes city manager ...
3,"resolution of the miami city commission, with ...",Miami City Commission accepts perpetual sidewa...
4,"resolution of the miami city commission, with ...",Miami City Commission accepts two right-of-way...


In [29]:
# stop_words = set(stopwords.words('english')) 
def text_cleaner(text,num):
    """Normalise one raw resolution string before it is summarized.

    Steps, in order: lower-case the input; reduce markup to plain text via
    BeautifulSoup's ``.text``; delete underscores, double quotes and hyphens;
    turn slashes into spaces; drop runs of non-ASCII characters; remove the
    possessive ``'s``; collapse two or more consecutive ``m`` into ``mm``;
    finally discard single-character tokens and join the rest with single
    spaces.

    Args:
        text: The raw string to clean.
        num: Unused by the current implementation; kept so existing call
            sites that pass it keep working.

    Returns:
        The cleaned text as one space-separated string (possibly empty).
    """
    cleaned = BeautifulSoup(text.lower(), "lxml").text  # markup -> plain text

    # Literal characters: a slash separates words, the others are simply dropped.
    for needle, replacement in (('_', ''), ('"', ''), ('/', ' '), ('-', '')):
        cleaned = cleaned.replace(needle, replacement)

    # Pattern-based clean-up; the order matters and mirrors the list below.
    for pattern, replacement in (
        (r'[^\x00-\x7F]+', ''),  # runs of non-ASCII characters
        (r"'s\b", ""),           # possessive 's
        ('[m]{2,}', 'mm'),       # "mmmm" -> "mm"
    ):
        cleaned = re.sub(pattern, replacement, cleaned)

    # split() with no argument also collapses repeated whitespace, so the
    # joined result never has leading or trailing blanks.
    return " ".join(word for word in cleaned.split() if len(word) > 1)

In [30]:
# Summarize every resolution and collect (cleaned resolution, summary) pairs.
cnt = 0  # number of resolutions processed so far
resolutions = data['resolution']  # own name, so the loop never reassigns what it iterates
records = []  # rows are gathered here and turned into a DataFrame once, after the loop
for cnt, resolution in enumerate(resolutions, start=1):
    # Clean once and reuse the result for both the model input and the saved row.
    cleaned = text_cleaner(resolution, 0)
    try:
        # NOTE(review): "summarize: " is a T5-style task prefix; confirm it actually
        # helps this Pegasus checkpoint before relying on it.
        input_ids = tokenizer.encode("summarize: " + cleaned, return_tensors='pt', max_length=800, truncation=True)
        output = model.generate(input_ids, max_new_tokens=50, min_length=10)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
    except Exception as exc:
        # Best effort: a failing row keeps a None summary so the run continues, but the
        # failure is reported instead of hidden. Catching Exception (not a bare except)
        # lets Ctrl-C / a kernel interrupt stop this long-running loop.
        print(f"row {cnt}: summarization failed: {exc!r}")
        summary = None
    records.append({'resolution': cleaned, 'summary': summary})
    print(summary)

# Build the frame in one step; concatenating inside the loop copies every
# previous row on each iteration.
results_df = pd.DataFrame(records, columns=['resolution', 'summary'])

City commission meeting agenda January 13, 2022 city of Miami page printed on 2022 ca.2 11189 department of real estate and asset management resolution.<n>Bids received on February 18, 2021 pursuant to invitation for bid (ifb) no. 125
The commission finds that the novel coronavirus 2019 pandemic has caused force majeure del ay of the performance of riverside wharf.<n>The commission authorizes an extension of the four (4) year deadline to obtain necessary building permits contained in section 29 of the charter
resolution authorizing the city manager to negotiate and execute memorandum of agreement.<n>Florida department of transportation (fdot) to install wayfinding iosks on fdot right ofway within the city of Miami.
Miami city commission accepts perpetual sidewalk easement.<n>Portions of sidewalks constructed on northwest 1st avenue, northwest 3rd street, northwest street, and northwest 6th street, Miami, lorida.
Miami city commission accepted two right ofway deeds of dedication.<n>The 

In [31]:
# Size of the input frame, for comparison with the results frame.
print(data.shape)

(955, 2)


In [32]:
# One result row is produced per input resolution, so the row count should match data.shape.
print(results_df.shape)

(955, 2)


In [33]:
# Discard rows containing a missing value (a failed generation leaves `summary` as None).
results_df2 = results_df.dropna(axis=0, how='any')
print(results_df2.shape)

(955, 2)


In [34]:
# Write the resolution/summary pairs to CSV, leaving out the DataFrame index.
results_df2.to_csv('data/summaries_pgs.csv', index=False)