In [38]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [39]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model = T5ForConditionalGeneration.from_pretrained('t5-base')
tokenizer = T5Tokenizer.from_pretrained('t5-base')

text = '''The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.                                       
'''
input_ids = tokenizer.encode("summarize: " + text, return_tensors='pt')
output = model.generate(input_ids)
summary = tokenizer.decode(output[0], skip_special_tokens=True)

print(summary)

the tower is 324 metres (1,063 ft) tall and measures 125


In [40]:
data = pd.read_csv("data/labeled01.csv", encoding='cp1252')
data = data.dropna()
print(data.shape)

(955, 2)


In [41]:
data.head()

Unnamed: 0,resolution,summary
0,resolution of the miami city commission accept...,The Miami City Commission accepted bids from r...
1,resolution of the miami city commission findin...,The Miami City Commission found that the COVID...
2,resolution of the miami city commission author...,Miami City Commission authorizes city manager ...
3,"resolution of the miami city commission, with ...",Miami City Commission accepts perpetual sidewa...
4,"resolution of the miami city commission, with ...",Miami City Commission accepts two right-of-way...


In [42]:
# stop_words = set(stopwords.words('english')) 
def text_cleaner(text,num):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text #remove links
    # newString = re.sub(r'\([^)]*\)', '', newString) # remove text in the paranthesis
    newString = re.sub('_','', newString) # removing underscores
    newString = re.sub('"','', newString) # removing double quotes
    newString = re.sub('/',' ', newString) # removing double quotes
    newString = re.sub('-','', newString) # removing double quotes
    newString = re.sub(r'[^\x00-\x7F]+', '', newString) # removes non-ASCII characters
    # newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString) #remove possesive s
    # newString = re.sub("[^a-zA-Z]", " ", newString) 
    newString = re.sub('[m]{2,}', 'mm', newString)
    # if(num==0):
    #     tokens = [w for w in newString.split() if not w in stop_words]
    # else:
    #     tokens=newString.split()
    tokens=newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:                                                 #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()

In [43]:
cnt=0
text = data['resolution']
results_df = pd.DataFrame(columns=['resolution', 'summary'])
for resolution in text:
    cnt+=1
    # print(cnt)
    # resolution = text_cleaner(resolution,0)
    try:  
        text = text_cleaner(resolution,0)
        input_ids = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=800, truncation=True)
        output = model.generate(input_ids, max_new_tokens=50, min_length=10)
        summary = tokenizer.decode(output[0], skip_special_tokens=True)
    except:
        summary = None
    new_row = pd.DataFrame({'resolution': text_cleaner(resolution,0), 'summary': summary}, index=[0])
    results_df = pd.concat([results_df, new_row], ignore_index=True)
    print(summary)

miami city commission accepts bids for bio hazardous waste, spills, decontamination services. city commission approves bids for two (2) years with option to renew. city council approves january 13 city commission meeting
miami city commission finds coronavirus 2019 pandemic caused force majeure del ay. resolution authorizing extension of deadline to obtain building permits for project. new: city manager to take any and all actions necessary to
miami city commission authorizes city manager to negotiate and execute memorandum of agreement. city manager to negotiate with florida department of transportation for wayfinding iosks. resolution deferred until jan
miami city commission accepts perpetual sidewalk easement. resolution allows public pedestrian access to sidewalks. resolution also includes a statewide ordinance.
miami city commission accepts two (2) right ofway deeds of dedication. city commission approves recordation of deeds in public records of miami dade county. miami dade coun

In [44]:
print(data.shape)

(955, 2)


In [45]:
print(results_df.shape)

(955, 2)


In [46]:
results_df2 = results_df.dropna()
print(results_df2.shape)

(955, 2)


In [47]:
results_df2.to_csv('data/summaries_t5.csv', index=False)