In [None]:
import torch
import transformers
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [None]:
# Passage to summarise, written one sentence per literal so it can be read
# and diffed; adjacent literals concatenate into a single string.
query_text = (
    'The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. '
    'Its base is square, measuring 125 metres (410 ft) on each side. '
    'During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. '
    'It was the first structure to reach a height of 300 metres. '
    'Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). '
    'Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.'
)

In [None]:
# Load the pretrained seq2seq model used to generate the summary.
summarizer_checkpoint = "facebook/bart-large-cnn"
model = AutoModelForSeq2SeqLM.from_pretrained(summarizer_checkpoint)

In [None]:
# Load the tokenizer from the same checkpoint as the model so that vocabulary
# and special tokens are guaranteed to match the weights that consume them.
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

In [None]:
# Tokenize the passage held in `query_text` and return PyTorch tensors
# (return_tensors='pt') ready to be passed to model.generate.
inputs = tokenizer(query_text, return_tensors='pt')

In [None]:
# Run generation on the tokenized passage with the model's default generation
# settings; the result is decoded back to text in the next cell.
prediction = model.generate(**inputs)

In [None]:
# Decode the first generated sequence back to text. skip_special_tokens=True
# keeps tokenizer control tokens out of the displayed summary.
tokenizer.batch_decode(prediction, skip_special_tokens=True)[0]

In [49]:
import gcld3

In [44]:
# `glob` is otherwise only imported in a later cell; importing it here lets
# this cell run on a fresh kernel when the notebook is executed top to bottom.
from glob import glob

# Collect the paths matching the edge-case sample file (an empty list when the
# file is absent).
ex = glob('../test/resources/Edge Cases/unreliable_english_1.txt')

In [47]:
# Display the current value of `ex` (the list of matched paths from the glob
# cell above).
ex

['../test/resources/Edge Cases/unreliable_english_1.txt']

In [60]:
# Read the `english_german.txt` edge-case file; the handle is only needed for
# the read itself, so the `with` block ends right after it.
with open('../test/resources/Edge Cases/english_german.txt') as sample_file:
    text = sample_file.read()

# Build the gcld3 identifier and run it on the text that was just read.
lang_model = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
result = lang_model.FindLanguage(text)

# Report the detected language code, the reliability flag and the probability.
print(result.language, result.is_reliable, result.probability)

en True 0.9661674499511719


In [58]:
# Show the fields of the most recent `result` as a (language, is_reliable,
# probability) tuple.
result.language, result.is_reliable, result.probability

('en', True, 0.9661674499511719)

### Testing regex

In [45]:
import re

In [46]:
# Sample string mixing angle-bracketed spans (empty, numeric, with spaces,
# back-to-back) with ordinary text; it is the input to the re.sub call in the
# next cell. NOTE(review): this rebinds `ex`, which an earlier cell used for a
# list of paths.
ex = '<0.2> 44443.<2221>< 2020.>03.04.01.<>000 3.2.1 111.gai aaa.gai <temperatures>. <><>AAAA.<>'

In [47]:
# Remove every `<...>` span (including empty `<>`) from the sample string.
# The previous pattern wrapped this in optional groups `(\W^<)?` / `(\W^>)?`;
# `^` only matches at the very start of the string (no MULTILINE flag), so it
# can never follow a consumed character and those groups never matched.
# `<` and `>` also need no escaping. The pattern below is the equivalent
# minimal form and yields the same result on every input.
re.sub(r'<[^>]*>', '', ex)

' 44443.03.04.01.000 3.2.1 111.gai aaa.gai . AAAA.'

### Human summary metrics for reference

In [1]:
from glob import glob
import rouge

In [2]:
# Gather the article paths and the summary paths; both patterns look exactly
# one subdirectory below their root folder for `.txt` files.
news, summaries = (
    glob('../test/resources/News Articles/*/*.txt'),
    glob('../test/resources/Summaries/*/*.txt'),
)

In [3]:
# Sort both lists with the same key so that zip() later pairs each article
# with its summary. The primary key is still the file name; the parent
# directory is added as a tie-breaker because the globs span several
# subdirectories, and files sharing a name would otherwise keep whatever
# order glob happened to return them in (which may differ between the two
# trees and silently mis-pair files).
def _name_then_parent(path):
    """Return (file name, parent directory name) for a '/'-separated path."""
    parts = path.split('/')
    return (parts[-1], parts[-2])

news.sort(key=_name_then_parent)
summaries.sort(key=_name_then_parent)

In [None]:
# Sanity check of the scorer on a toy pair. After `import rouge` the name
# `rouge` is the package; `get_scores` is a method of its `Rouge` class, so an
# instance is created first. avg=True returns one averaged score dict.
rouge.Rouge().get_scores('im blue and im hungry', 'im blue and hungry', avg=True)

In [5]:
# Score article/summary pairs from the upper part of the sorted lists
# (everything after index len//2) and collect one ROUGE result per pair.
# `get_scores` is a method of `rouge.Rouge`, not of the `rouge` package, so a
# single scorer instance is created up front and reused.
scorer = rouge.Rouge()
scores_list = []

for news_path, summary_path in zip(news[1+len(news)//2:], summaries[1+len(summaries)//2:]):
    with open(news_path, 'r') as truth_file, open(summary_path, 'r') as summ_file:
        # NOTE(review): the article text is passed first and the summary
        # second; confirm this matches the intended hypothesis/reference
        # roles, since swapping them swaps precision and recall.
        scores_list.append(scorer.get_scores(truth_file.read(), summ_file.read(), avg=True))
    # NOTE(review): this unconditional break stops after the first pair, so
    # scores_list holds at most one entry. It looks like a debugging leftover;
    # confirm before relying on the averages computed below.
    break



[]

In [24]:
# One empty accumulator per (ROUGE variant, statistic); each name is bound to
# its own fresh list.
r1_recall_list, r1_precision_list, r1_f1_list = [], [], []
r2_recall_list, r2_precision_list, r2_f1_list = [], [], []
rl_recall_list, rl_precision_list, rl_f1_list = [], [], []

In [25]:
# Map each ROUGE key to the three lists that receive its recall ('r'),
# precision ('p') and F1 ('f') values.
metric_targets = (
    ('rouge-1', r1_recall_list, r1_precision_list, r1_f1_list),
    ('rouge-2', r2_recall_list, r2_precision_list, r2_f1_list),
    ('rouge-l', rl_recall_list, rl_precision_list, rl_f1_list),
)

# Fan every entry of scores_list out into the flat per-metric lists.
for entry in scores_list:
    for metric, recalls, precisions, f1s in metric_targets:
        metric_scores = entry[metric]
        recalls.append(metric_scores['r'])
        precisions.append(metric_scores['p'])
        f1s.append(metric_scores['f'])

In [34]:
# Mean ROUGE-1 recall over the collected scores (raises ZeroDivisionError if
# the list is empty).
sum(r1_recall_list)/len(r1_recall_list)

0.6644245578519652

In [35]:
# Mean ROUGE-1 precision over the collected scores (raises ZeroDivisionError
# if the list is empty).
sum(r1_precision_list)/len(r1_precision_list)

0.5002205025359536

In [36]:
# Mean ROUGE-1 F1 over the collected scores (raises ZeroDivisionError if the
# list is empty).
sum(r1_f1_list)/len(r1_f1_list)

0.6644245578519652

In [37]:
# Mean ROUGE-2 recall; divide by the length of the list being summed rather
# than by the ROUGE-1 list's length.
sum(r2_recall_list)/len(r2_recall_list)

0.9619354436942994

In [38]:
# Mean ROUGE-2 precision; divide by the length of the list being summed rather
# than by the ROUGE-1 list's length.
sum(r2_precision_list)/len(r2_precision_list)

0.4327511288126716

In [39]:
# Mean ROUGE-2 F1; divide by the length of the list being summed rather than
# by the ROUGE-1 list's length.
sum(r2_f1_list)/len(r2_f1_list)

0.5951495525207326

In [40]:
# Mean ROUGE-L recall; divide by the length of the list being summed rather
# than by the ROUGE-1 F1 list's length.
sum(rl_recall_list)/len(rl_recall_list)

0.9975078340817972

In [41]:
# Mean ROUGE-L precision; divide by the length of the list being summed rather
# than by the ROUGE-1 F1 list's length.
sum(rl_precision_list)/len(rl_precision_list)

0.500216767361195

In [42]:
# Mean ROUGE-L F1; divide by the length of the list being summed rather than
# by the ROUGE-1 F1 list's length.
sum(rl_f1_list)/len(rl_f1_list)

0.6644196121597565