In [1]:
import re
import numpy as np
import nltk
import collections
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.feature_extraction.text import CountVectorizer  

# BASIC IMPLEMENTATION

TF-IDF

In [2]:
def get_tfidf(sentences,stop_words):
    corpus = []
    for sent in sentences:
        sent_token = nltk.word_tokenize(sent)
        sent_list = [word for word in sent_token if word not in stop_words]
        sent_str = ' '.join(sent_list)
        corpus.append(sent_str)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    return tfidf.toarray()

### Algorithms to Evaluate the Importance of Sentences

- The frequency of words

In [3]:
def weight_frequency(tfidf):
    weight = {}
    for i in range(len(tfidf)):
        weight[i] = np.sum(tfidf[i])

    # Normalization
    max_weight = max(weight.values())
    min_weight = min(weight.values())
    for key in weight.keys():
        x = weight[key]
        weight[key] = (x-min_weight)/(max_weight-min_weight)

    return weight

- Position in document

In [4]:
def weight_position(sentences):
    weight = {}
    n = len(sentences)
    for i in range(n):
        weight[i] = (n - i)/n
    return weight

- The similarity of sentences

In [5]:
def get_similarity(s1, s2):
    return np.sum(s1*s2)/(1e-6+(np.sqrt(np.sum(s1*s1))*np.sqrt(np.sum(s2*s2))))

def weight_similarity(tfidf):
    weight = collections.defaultdict(lambda :0)
    for i in range(len(tfidf)):
        score_i = 0
        for j in range(len(tfidf)):
            score_i += get_similarity(tfidf[i], tfidf[j])
        weight[i] = score_i

    # Normalization
    max_score = max(weight.values())
    min_score = min(weight.values())
    for k in weight.keys():
        x = weight[k]
        weight[k] = (x-min_score)/(max_score-min_score)

    return weight

Compute the composite score, then sort it in descending order (default: setting all of the weights as even).

In [6]:
def final_score(frequency, position, similarity, feature_weight = [1,1,1]):
    weight = collections.defaultdict(lambda :0)
    for k in frequency.keys():
        weight[k] = feature_weight[0]*frequency[k]+feature_weight[1]*position[k]+feature_weight[2]*similarity[k]

    return sorted(weight.items(), key = lambda x:x[1], reverse = True)

In [7]:
def get_summarization(text, stop_word, num_sentence, comp_ratio = 0.3):
    sentences = re.sub(r'\n',' ',text) 
    sentences = nltk.sent_tokenize(text)
    
    tfidf = get_tfidf(sentences, stop_word)
    frequency = weight_frequency(tfidf)
    position = weight_position(sentences)
    similarity = weight_similarity(tfidf)
    
    score = final_score(frequency, position, similarity, feature_weight = [1,1,1])
    
    if num_sentence is None:
        num_sentence = int(len(sentences)*comp_ratio)
    sent_id = sorted([sent[0] for sent in score[:num_sentence]])
    
    summary = []
    for id in sent_id:
        summary.append(sentences[id])

    return ' '.join(summary)

### EVALUATION

In [8]:
import os

article_dir = "./my_evalue/articles"
result_dir = "./my_evalue/basic"

stop_word = nltk.corpus.stopwords.words('english')

for id in range(11490):
    id = str(id)
    article_name = str(id).zfill(6) + "_article.txt"
    result_name = str(id).zfill(6) + "_result.txt"
    article_path = os.path.join(article_dir, article_name)
    result_path = os.path.join(result_dir, result_name)
    
    with open(article_path, "r") as f:
        article_doc = f.read().splitlines()
        current_article = " ".join(article_doc)
    
    result = get_summarization(current_article, stop_word, num_sentence = 3)
    
    with open(result_path,"w") as f:
        f.write(result)
        if int(id)%500==0:
            print("Write example ",result_name, ".")

Write example  000000_result.txt .
Write example  000500_result.txt .
Write example  001000_result.txt .
Write example  001500_result.txt .
Write example  002000_result.txt .
Write example  002500_result.txt .
Write example  003000_result.txt .
Write example  003500_result.txt .
Write example  004000_result.txt .
Write example  004500_result.txt .
Write example  005000_result.txt .
Write example  005500_result.txt .
Write example  006000_result.txt .
Write example  006500_result.txt .
Write example  007000_result.txt .
Write example  007500_result.txt .
Write example  008000_result.txt .
Write example  008500_result.txt .
Write example  009000_result.txt .
Write example  009500_result.txt .
Write example  010000_result.txt .
Write example  010500_result.txt .
Write example  011000_result.txt .


## Result by pyrouge:

---------------------------------------------  
1 ROUGE-1 Average_R: 0.31473 (95%-conf.int. 0.31261 - 0.31674)  
1 ROUGE-1 Average_P: 0.53473 (95%-conf.int. 0.53215 - 0.53747)  
1 ROUGE-1 Average_F: 0.38398 (95%-conf.int. 0.38207 - 0.38594)  

---------------------------------------------  
1 ROUGE-2 Average_R: 0.13426 (95%-conf.int. 0.13267 - 0.13581)  
1 ROUGE-2 Average_P: 0.22805 (95%-conf.int. 0.22538 - 0.23062)  
1 ROUGE-2 Average_F: 0.16363 (95%-conf.int. 0.16173 - 0.16540)  

---------------------------------------------  
1 ROUGE-3 Average_R: 0.07596 (95%-conf.int. 0.07452 - 0.07735)  
1 ROUGE-3 Average_P: 0.12769 (95%-conf.int. 0.12541 - 0.13003)  
1 ROUGE-3 Average_F: 0.09206 (95%-conf.int. 0.09045 - 0.09366)  

---------------------------------------------  
1 ROUGE-4 Average_R: 0.04972 (95%-conf.int. 0.04841 - 0.05096)  
1 ROUGE-4 Average_P: 0.08276 (95%-conf.int. 0.08070 - 0.08482)  
1 ROUGE-4 Average_F: 0.05991 (95%-conf.int. 0.05840 - 0.06139)  

---------------------------------------------  
1 ROUGE-L Average_R: 0.24827 (95%-conf.int. 0.24659 - 0.25000)  
1 ROUGE-L Average_P: 0.42276 (95%-conf.int. 0.42031 - 0.42537)  
1 ROUGE-L Average_F: 0.30315 (95%-conf.int. 0.30142 - 0.30493)  

---------------------------------------------  
1 ROUGE-W-1.2 Average_R: 0.07426 (95%-conf.int. 0.07370 - 0.07482)  
1 ROUGE-W-1.2 Average_P: 0.30838 (95%-conf.int. 0.30645 - 0.31043)  
1 ROUGE-W-1.2 Average_F: 0.11689 (95%-conf.int. 0.11610 - 0.11771)  

---------------------------------------------  
1 ROUGE-S* Average_R: 0.09381 (95%-conf.int. 0.09251 - 0.09503)  
1 ROUGE-S* Average_P: 0.25627 (95%-conf.int. 0.25362 - 0.25901)  
1 ROUGE-S* Average_F: 0.12408 (95%-conf.int. 0.12276 - 0.12535)  

---------------------------------------------  
1 ROUGE-SU* Average_R: 0.09889 (95%-conf.int. 0.09757 - 0.10012)  
1 ROUGE-SU* Average_P: 0.26783 (95%-conf.int. 0.26517 - 0.27058)  
1 ROUGE-SU* Average_F: 0.13084 (95%-conf.int. 0.12948 - 0.13213)