### Get sentence similarity
Reference: https://towardsdatascience.com/introduction-to-text-summarization-with-rouge-scores-84140c64b471



In [6]:
import torchtext
import rouge
from vocab import Vocab
import os

In [2]:
# Toy example: score one sentence pair with ROUGE and inspect the result.
ref = 'this person studied and loves the field of data science very much'
can = 'this person studied the field of data science'

rouge_score = rouge.Rouge()
# NOTE(review): `ref` is passed first and `can` second. The recorded output
# (rouge-1 r = 1.0, p ~= 0.67) is what results when the longer `ref` is scored
# as the hypothesis against `can`, so the arguments look swapped relative to
# their names -- confirm the (hypothesis, reference) order in the rouge docs.
# Only 'r' and 'p' would trade places; 'f' should be unaffected.
score = rouge_score.get_scores(ref, can)

# The result is indexed as score[0][<metric>][<'r' | 'p' | 'f'>].
print(score)

# F-score of the bigram-overlap metric only.
print(score[0]['rouge-2']['f'])


[{'rouge-1': {'r': 1.0, 'p': 0.6666666666666666, 'f': 0.7999999952000001}, 'rouge-2': {'r': 0.8571428571428571, 'p': 0.5454545454545454, 'f': 0.6666666619135801}, 'rouge-l': {'r': 1.0, 'p': 0.6666666666666666, 'f': 0.7999999952000001}}]
0.6666666619135801


In [3]:
def _get_oracle_ids(sentences, highlights, summary_len = -1, include_rouge_l = False, rouge_threshold = 0.6):
    """Select the indices of the sentences that best match the highlights.

    Every sentence is scored against every highlight with ROUGE. A pair's
    score is the sum of the ROUGE-1 and ROUGE-2 F-scores, plus the ROUGE-L
    F-score when ``include_rouge_l`` is true. Each sentence keeps its best
    score over all highlights, and the selection is made over sentences.

    Args:
        sentences: Sequence of sentence strings to choose from.
        highlights: Sequence of highlight (reference summary) strings.
        summary_len: Number of sentences to select (the highest scoring
            ones). The default ``-1`` switches to threshold mode instead.
        include_rouge_l: Whether the ROUGE-L F-score is added to the total.
        rouge_threshold: Minimum total score for a sentence to be selected
            in threshold mode (``summary_len == -1``).

    Returns:
        A sorted list of unique sentence indices.
    """
    rouge_cal = rouge.Rouge()

    def _get_rouge_total_score(score):
        metrics = score[0]
        total = metrics['rouge-1']['f'] + metrics['rouge-2']['f']
        if include_rouge_l:
            total += metrics['rouge-l']['f']
        return total

    # Track the best score per sentence rather than ranking raw
    # (sentence, highlight) pairs: a sentence that matches several highlights
    # would otherwise take several slots, producing duplicate ids and fewer
    # distinct sentences than `summary_len` asks for.
    best_score = {}
    for i, sentence in enumerate(sentences):
        for highlight in highlights:
            total = _get_rouge_total_score(rouge_cal.get_scores(sentence, highlight))
            if i not in best_score or total > best_score[i]:
                best_score[i] = total

    if summary_len == -1:
        chosen = [i for i, total in best_score.items() if total >= rouge_threshold]
    else:
        # sorted() is stable, so equally scored sentences keep document order.
        ranked = sorted(best_score, key=best_score.get, reverse=True)
        chosen = ranked[:summary_len]

    return sorted(chosen)

def _load_data(articles):
    """Tokenise all articles with the spaCy tokenizer and build a vocabulary.

    Each article is joined with single spaces before tokenisation, so it is
    expected to be an iterable of strings (presumably its sentences --
    confirm against the caller).

    Returns:
        A ``(tokenizer, vocab)`` tuple.
    """
    spacy_tokenizer = torchtext.data.get_tokenizer('spacy')

    all_tokens = []
    for article in articles:
        joined_text = ' '.join(article)
        all_tokens.extend(spacy_tokenizer(joined_text))

    # NOTE: the word-frequency setting (3) can affect downstream performance;
    # pay attention to this parameter.
    vocabulary = Vocab(all_tokens, 3, reserved_tokens=['<cls>', '<sep>', '<pad>'])
    return spacy_tokenizer, vocabulary

def _save_vocab(vocab, vocab_path = None):
    """Write the vocabulary to disk, creating the target directory if needed.

    Args:
        vocab: Object exposing ``write_to(path)``.
        vocab_path: Destination file. Defaults to
            ``./output/processed/vocab.txt`` when ``None``.
    """
    if vocab_path is None:
        vocab_path = os.path.join('./output', 'processed', 'vocab.txt')

    # Derive the directory from the final path so that an explicitly supplied
    # `vocab_path` works too (previously the directory name was only bound in
    # the default branch, so a custom path raised UnboundLocalError).
    parent_dir = os.path.dirname(vocab_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    vocab.write_to(vocab_path)


def _batchify(data):
    """Unfinished stub: reads ``data['article']`` and implicitly returns None."""
    # TODO: batching logic is not implemented yet.
    articles = data['article']



    # NOTE(review): bare expression statement -- it has no effect inside a
    # function; presumably a `return` (or further processing) was intended.
    articles
    





In [4]:
# Sample news article and its highlight text, used below to try out the
# oracle sentence selection.
article = '''By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in Italy last month. Symptoms of hepatitis A include fever, tiredness, loss of appetite, nausea and abdominal discomfort. Fargo Catholic Diocese in North Dakota (pictured) is where the bishop is located .'''
highlight = '''Bishop John Folda, of North Dakota, is taking time off after being diagnosed.He contracted the infection through contaminated food in Italy. Church members in Fargo, Grand Forks and Jamestown could have been exposed .'''

tokenizer = torchtext.data.get_tokenizer('spacy')

# Naive sentence segmentation on '.', followed by lower-casing and trimming.
sent = [piece.lower().strip() for piece in article.split('.')]
labels = [piece.lower().strip() for piece in highlight.split('.')]

# Discard fragments too short to be useful: fewer than 10 tokens for article
# sentences, fewer than 3 tokens for highlights.
sent = [sentence for sentence in sent if len(tokenizer(sentence)) >= 10]
labels = [label for label in labels if len(tokenizer(label)) >= 3]

print(sent)
print(labels)

# Request one oracle sentence per highlight when the article has more
# sentences than highlights; otherwise pass -1 (threshold-based selection).
if len(labels) < len(sent):
    sum_len = len(labels)
else:
    sum_len = -1
oracle_ids = _get_oracle_ids(sent, labels, summary_len = sum_len)
print(oracle_ids)

# Show the selected sentences, then the highlights, for a visual comparison.
for idx in oracle_ids:
    print(sent[idx])

for label in labels:
    print(label)




['the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october', 'the state health department has issued an advisory of exposure for anyone who attended five churches and took communion', 'bishop john folda (pictured) of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a', "state immunization program manager molly howell says the risk is low, but officials feel it's important to alert people to the possible exposure", 'the diocese announced on monday that bishop john folda is taking time off after being diagnosed with hepatitis a', 'the diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in italy last month', 'symptoms of hepatitis a include fever, tiredness, loss of appetite, n