自然语言处理中有两种文本摘要生成方法：抽取式和抽象式。尽管抽象式摘要的表现更好，但开发相关算法需要复杂的深度学习技巧和语言模型；为了获得合理的产出，抽象式摘要方法必须能够解决诸多自然语言处理问题，如自然语言生成、语义表征和推理排序。本文实现的是相对简单的抽取式摘要，基本步骤如下：

* 第一步：将这段话转换成句子
* 第二步：文本处理，移除停用词、数字、标点符号以及句子中的其他特殊字符。对句子成分进行过滤有助于移除冗余和不重要的信息。
* 第三步：分词
* 第四步：评估单词的加权出现频率（用每个单词的出现频率除以这段话中出现最多次的单词的频率）
* 第五步：用相应的加权频率替代原句中的各个单词，然后计算总和

In [None]:
# 第一步：准备数据
import bs4 as BeautifulSoup
import urllib.request

# fetched_data = urllib3.request.urlopen('https://en.wikipedia.org/wiki/20th_century')
# Close the HTTP response as soon as its body has been read instead of
# leaving it open for the rest of the session.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/20th_century') as fetched_data:
    article_read = fetched_data.read()
article_parse = BeautifulSoup.BeautifulSoup(article_read, 'lxml')

# Collect the text of every <p> element into a single string.
paragraphs = article_parse.find_all('p')
article_content = ''.join(p.text for p in paragraphs)

In [None]:
# 第二步：处理数据
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # 将单词还原成词根形式的算法
from nltk.tokenize import  word_tokenize,sent_tokenize

In [None]:
def _create_dictionary_table(text_string) -> dict:
    """Build a frequency table of the stemmed words in ``text_string``.

    The text is tokenized with ``word_tokenize`` and every token is reduced
    to its stem with ``PorterStemmer``. English stop words and tokens that
    contain no letter or digit (pure punctuation) are left out.

    Args:
        text_string: The text to analyse.

    Returns:
        A dict mapping each remaining stem to its number of occurrences.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    frequency_table = dict()
    for token in word_tokenize(text_string):
        # Test the surface form first: stemming can distort a stop word
        # (e.g. "was" -> "wa") so that it no longer matches the list.
        if token.lower() in stop_words:
            continue
        # Punctuation tokens carry no meaning and would otherwise be counted
        # like ordinary words.
        if not any(ch.isalnum() for ch in token):
            continue
        stemmed = stemmer.stem(token)
        # A stem can itself be a stop word (e.g. "having" -> "have").
        if stemmed in stop_words:
            continue
        frequency_table[stemmed] = frequency_table.get(stemmed, 0) + 1
    return frequency_table



# def _create_dictionary_table(text_string) -> dict:

#     # removing stop words
#     stop_words = set(stopwords.words("english"))

#     words = word_tokenize(text_string)

#     # reducing words to their root form
#     stem = PorterStemmer()

#     # creating dictionary for the word frequency table
#     frequency_table = dict()
#     for wd in words:
#         wd = stem.stem(wd)
#         if wd in stop_words:
#             continue
#         if wd in frequency_table:
#             frequency_table[wd] += 1
#         else:
#             frequency_table[wd] = 1

#     return frequency_table

In [None]:
# Build the stemmed word-frequency table for the fetched article text.
frequency_table = _create_dictionary_table(article_content)

In [None]:
# Step 3: split the article text into a list of sentences.
from nltk.tokenize import word_tokenize,sent_tokenize
sentences = sent_tokenize(article_content)

In [None]:
# 第四步：确定句子的加权频率
def _calculate_sentence_scores(sentences,frequency_tabel) -> dict:
    sentence_weight = dict()
    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_tabel:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]] += frequency_tabel[word_weight]
                else:
                    sentence_weight[sentence[:7]] = frequency_tabel[word_weight]
        # 为了避免长句的分数必然高于短句，我们用每个句子的分数除以该句中的单词数
        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]] / sentence_wordcount_without_stop_words
    return sentence_weight

In [None]:
# Score every sentence against the word-frequency table.
sentence_weight = _calculate_sentence_scores(sentences,frequency_table)

In [None]:
# Bare expression: shows the word-frequency table as the cell output.
frequency_table

In [None]:
# Bare expression: shows the per-sentence scores as the cell output.
sentence_weight

In [None]:
# 第五步：计算句子阈值
def _calculate_average_score(sentence_weight) -> int:
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]
    average_score = (sum_values) / len(sentence_weight)
    return average_score

In [None]:
# 第六步：生成摘要
def _get_article_summary(sentences,sentence_weight,threshold):
    sentence_counter = 0
    article_summary = ''
    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1
    return article_summary

### 完整代码

In [None]:
# importing libraries
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
import bs4 as BeautifulSoup
import urllib.request

# fetching the content from the URL
# the response is closed as soon as its body has been read
with urllib.request.urlopen(
        'https://en.wikipedia.org/wiki/20th_century') as fetched_data:
    article_read = fetched_data.read()

# parsing the URL content and storing in a variable
article_parsed = BeautifulSoup.BeautifulSoup(article_read, 'html.parser')

# collecting every <p> tag
paragraphs = article_parsed.find_all('p')

# concatenating the text of all paragraphs into one string
article_content = ''.join(p.text for p in paragraphs)


def _create_dictionary_table(text_string) -> dict:
    """Build a frequency table of the stemmed words in ``text_string``.

    The text is tokenized with ``word_tokenize`` and every token is reduced
    to its stem with ``PorterStemmer``. English stop words and tokens that
    contain no letter or digit (pure punctuation) are left out.

    Args:
        text_string: The text to analyse.

    Returns:
        A dict mapping each remaining stem to its number of occurrences.
    """
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    frequency_table = dict()
    for token in word_tokenize(text_string):
        # checking the surface form first: stemming can distort a stop word
        # (e.g. "was" -> "wa") so that it no longer matches the list
        if token.lower() in stop_words:
            continue

        # punctuation tokens carry no meaning and must not be counted
        if not any(ch.isalnum() for ch in token):
            continue

        stemmed = stemmer.stem(token)

        # a stem can itself be a stop word (e.g. "having" -> "have")
        if stemmed in stop_words:
            continue

        frequency_table[stemmed] = frequency_table.get(stemmed, 0) + 1

    return frequency_table


def _calculate_sentence_scores(sentences, frequency_table) -> dict:

    # algorithm for scoring a sentence by its words
    sentence_weight = dict()

    for sentence in sentences:
        sentence_wordcount = (len(word_tokenize(sentence)))
        sentence_wordcount_without_stop_words = 0
        for word_weight in frequency_table:
            if word_weight in sentence.lower():
                sentence_wordcount_without_stop_words += 1
                if sentence[:7] in sentence_weight:
                    sentence_weight[sentence[:7]
                                    ] += frequency_table[word_weight]
                else:
                    sentence_weight[sentence[:7]
                                    ] = frequency_table[word_weight]

        sentence_weight[sentence[:7]] = sentence_weight[sentence[:7]
                                                        ] / sentence_wordcount_without_stop_words

    return sentence_weight


def _calculate_average_score(sentence_weight) -> int:

    # calculating the average score for the sentences
    sum_values = 0
    for entry in sentence_weight:
        sum_values += sentence_weight[entry]

    # getting sentence average value from source text
    average_score = (sum_values / len(sentence_weight))

    return average_score


def _get_article_summary(sentences, sentence_weight, threshold):
    sentence_counter = 0
    article_summary = ''

    for sentence in sentences:
        if sentence[:7] in sentence_weight and sentence_weight[sentence[:7]] >= (threshold):
            article_summary += " " + sentence
            sentence_counter += 1

    return article_summary


def _run_article_summary(article, threshold_factor=1.5):
    """Produce an extractive summary of ``article``.

    Runs the whole pipeline: word-frequency table, sentence splitting,
    sentence scoring, average-score threshold, and sentence selection.

    Args:
        article: The full article text.
        threshold_factor: Multiplier applied to the average sentence score to
            obtain the selection threshold. Larger values keep fewer
            sentences. Defaults to 1.5.

    Returns:
        The selected sentences joined into one string, as returned by
        ``_get_article_summary``.
    """
    # creating a dictionary for the word frequency table
    frequency_table = _create_dictionary_table(article)

    # tokenizing the sentences
    sentences = sent_tokenize(article)

    # algorithm for scoring a sentence by its words
    sentence_scores = _calculate_sentence_scores(sentences, frequency_table)

    # getting the threshold
    threshold = _calculate_average_score(sentence_scores)

    # producing the summary
    return _get_article_summary(
        sentences, sentence_scores, threshold_factor * threshold)

In [None]:
# Summarize the fetched article text and print the resulting summary.
summary_results = _run_article_summary(article_content)
print(summary_results)