# ROUGE
- Recall-Oriented Understudy for Gisting Evaluation
- 생성된 요약 문장의 품질을 참조 문장과 비교하여 측정하는 메트릭

In [122]:
from newspaper import Article
from gensim.summarization import summarize
import pandas as pd
from rouge import Rouge
from pprint import pprint
import string
import re
import nltk
import pandas as pd

## 함수 정의

- clean text
    - 대문자 -> 소문자
    - punctuation 제거
    - 불용어 제거

In [139]:
def clean_text(raw_text):
    """Normalize English text before ROUGE scoring.

    Lowercases the text, deletes every character listed in
    ``string.punctuation`` and drops NLTK's English stopwords.

    Only ASCII punctuation is covered; characters such as '—' or '’' are
    left in place.

    Args:
        raw_text: Text to normalize.

    Returns:
        The remaining words joined by single spaces, or an empty string
        when nothing remains.
    """
    # str.translate deletes each punctuation character literally. The former
    # '[' + string.punctuation + ']' regex read the '\]' inside
    # string.punctuation as an escaped ']', so backslashes were never removed.
    text = raw_text.lower().translate(
        str.maketrans('', '', string.punctuation))
    text_list = text.split()
    if not text_list:
        # Nothing to filter, so skip loading the stopword corpus.
        return ''
    # Load the stopword list once per call (not once per word) and keep it in
    # a set so each membership test is a hash lookup.
    stop_words = set(nltk.corpus.stopwords.words('english'))
    return ' '.join(w for w in text_list if w not in stop_words)

def get_rouge_result(system, reference, clean=False):
    """Score a system summary against a reference summary with ROUGE.

    Args:
        system: Candidate (generated) summary text.
        reference: Reference summary text.
        clean: When true, both texts go through ``clean_text`` before
            scoring.

    Returns:
        A ``pandas.DataFrame`` built from the first entry returned by
        ``Rouge.get_scores``.

    Prints 'Cleaned' or 'No Clean' to show which path was taken.
    """
    if not clean:
        print('No Clean')
    else:
        system = clean_text(system)
        reference = clean_text(reference)
        print('Cleaned')

    # get_scores returns a list; only its first entry is used here.
    first_scores = Rouge().get_scores(system, reference)[0]
    return pd.DataFrame(first_scores)

## 쉬운 예시

1. ROUGE-1 : **unigram**
2. ROUGE-2 : **bigram**
3. ROUGE-L : **LCS** (Longest Common Subsequence) 기법

In [142]:
# Toy example: every reference word also appears in the system text,
# but in a different order.
system = "hello a cat dog fox the jumps"
reference = "the fox jumps"

# Score the raw strings first, then again after clean_text preprocessing.
display(get_rouge_result(system, reference))
display(get_rouge_result(system, reference, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.6,0.0,0.4
p,0.428571,0.0,0.285714
r,1.0,0.0,0.666667


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.571429,0.4,0.571429
p,0.4,0.25,0.4
r,1.0,1.0,1.0


## CNN news 1

In [143]:
# The headline is passed as the reference and the extracted sentence as the
# system summary.
original_title_1 = "Barcelona remains committed to Super League, saying it would be 'historical error' to pull out"
extracted_sent_1 = "Ten of the 12 founding members might have officially pulled out of the European Super League, but Barcelona has reiterated its commitment to the idea."

# Score without and then with clean_text preprocessing.
display(get_rouge_result(extracted_sent_1, original_title_1))
display(get_rouge_result(extracted_sent_1, original_title_1, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.25,0.052632,0.166667
p,0.2,0.041667,0.136364
r,0.333333,0.071429,0.214286


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.25,0.090909,0.166667
p,0.214286,0.076923,0.142857
r,0.3,0.111111,0.2


## CNN news 2

In [144]:
# The headline is passed as the reference and the extracted sentence as the
# system summary.
original_title_2 = "SpaceX rocket carrying four astronauts launches from Florida"
extracted_sent_2 = "A SpaceX Crew Dragon spacecraft — carrying four astronauts from three countries — took off from NASA's Kennedy Space Center in Florida Friday morning, beginning their six-month stay in space."

# Score without and then with clean_text preprocessing.
display(get_rouge_result(extracted_sent_2, original_title_2))
display(get_rouge_result(extracted_sent_2, original_title_2, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.315789,0.111111,0.342857
p,0.2,0.068966,0.222222
r,0.75,0.285714,0.75


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.333333,0.142857,0.357143
p,0.217391,0.090909,0.238095
r,0.714286,0.333333,0.714286


## CNN news 3

In [124]:
# The headline is passed as the reference and the extracted sentence as the
# system summary; the two strings share no words.
original_title_3 = "Vaccine hesitancy among Republicans emerges as Biden’s next big challenge"
extracted_sent_3 = "Johnson told Mckenna."

# Score without and then with clean_text preprocessing.
display(get_rouge_result(extracted_sent_3, original_title_3))
display(get_rouge_result(extracted_sent_3, original_title_3, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.0,0.0,0.0
p,0.0,0.0,0.0
r,0.0,0.0,0.0


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.0,0.0,0.0
p,0.0,0.0,0.0
r,0.0,0.0,0.0


--------------------------

# Gensim

## 함수 정의

In [125]:
def gensim_summarizer(url, language, word_count=25):
    """Download an article and summarize its body with gensim.

    Args:
        url: Address of the article to fetch.
        language: Language code handed to ``Article``.
        word_count: Forwarded to gensim's ``summarize`` as ``word_count``.

    Returns:
        A tuple ``(original_length, summarized_length, summarized_text)``
        where both lengths are character counts (``len`` of the strings).
    """
    article = Article(url, language=language)
    article.download()
    article.parse()

    body = article.text
    summary = summarize(body, word_count=word_count)
    return len(body), len(summary), summary

## 영어 - 스포츠

In [134]:
# Summarize the CNN article with word_count=50. The result is the
# (original_length, summarized_length, summarized_text) tuple.
url_en = 'https://edition.cnn.com/2021/04/23/football/barcelona-remain-committed-to-european-super-league-spt-intl/index.html'
summarized_en = gensim_summarizer(url_en, 'en', word_count=50)
# A bare expression as the last line makes the notebook display the tuple.
summarized_en

(1284,
 453,
 '"In this context, the FC Barcelona Board of Directors accepted, as a matter of immediate urgency, the offer to form part, as the founding member, of the Super League, a competition designed to improve the quality and attractiveness of the product offered to the football fans and, at the same time, and as one of FC Barcelona\'s most inalienable principles, seek new formulas for solidarity with the football family as a whole," Thursday\'s statement read')

### gensim

In [141]:
# Headline used as the reference for the gensim summary.
original_title_en = "Barcelona remains committed to Super League, saying it would be 'historical error' to pull out"

# summarized_en[2] is the summary text (third element of the returned tuple).
display(get_rouge_result(summarized_en[2], original_title_en))
display(get_rouge_result(summarized_en[2], original_title_en, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.111111,0.022727,0.119403
p,0.066667,0.013514,0.075472
r,0.333333,0.071429,0.285714


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.117647,0.040816,0.122449
p,0.073171,0.025,0.076923
r,0.3,0.111111,0.3


### 문서요약기

In [145]:
# Re-score the sentence/headline pair defined in the "CNN news 1" cell so the
# numbers can be read next to the gensim result for the same article.
display(get_rouge_result(extracted_sent_1, original_title_1))
display(get_rouge_result(extracted_sent_1, original_title_1, clean=True))

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.25,0.052632,0.166667
p,0.2,0.041667,0.136364
r,0.333333,0.071429,0.214286


Cleaned


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.25,0.090909,0.166667
p,0.214286,0.076923,0.142857
r,0.3,0.111111,0.2


## 한국어 - 스포츠

In [136]:
# Summarize a Korean sports article using the default word_count (25).
url_kor = 'https://sports.news.naver.com/news.nhn?oid=109&aid=0004394893'
summarized_ko = gensim_summarizer(url_kor, 'ko')
# A bare expression as the last line makes the notebook display the tuple.
summarized_ko

(1145,
 84,
 '1회 선두타자 카반 비지오가 볼넷으로 걸어나간 뒤 4회까지 노히터를 당했다.그러나 5회초 선두타자 마커스 세미엔이 우전 안타로 출루해 기회를 만들었다.')

In [137]:
# Headline used as the reference for the Korean gensim summary.
original_title_ko = "류현진, 3⅔이닝 무실점→엉덩이 통증 강판…불펜 무실점 1-0 신승 [TOR 리뷰]"
# Scored with clean left at its default (False); clean_text filters English
# stopwords only.
# NOTE(review): every score in the recorded output is 0.0 -- presumably
# whitespace-level token matching is too strict for Korean; confirm.
get_rouge_result(summarized_ko[2], original_title_ko)

No Clean


Unnamed: 0,rouge-1,rouge-2,rouge-l
f,0.0,0.0,0.0
p,0.0,0.0,0.0
r,0.0,0.0,0.0


---------------------------