In [2]:
import json

with open('dataset_43428_1.txt', encoding="utf-8") as json_file:
    df = json.load(json_file)

In [3]:
from itertools import combinations
from nltk.tokenize import sent_tokenize, RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
import networkx as nx

def similarity(s1, s2):
    if not len(s1) or not len(s2):
        return 0.0
    return len(s1.intersection(s2))/(1.0 * (len(s1) + len(s2)))

def textrank(text):
    sentences = sent_tokenize(text)
    tokenizer = RegexpTokenizer(r'\w+')
    lmtzr = RussianStemmer()
    words = [set(lmtzr.stem(word) for word in tokenizer.tokenize(sentence.lower()))
             for sentence in sentences]

    pairs = combinations(range(len(sentences)), 2)
    scores = [(i, j, similarity(words[i], words[j])) for i, j in pairs]
    scores = filter(lambda x: x[2], scores)

    g = nx.Graph()
    g.add_weighted_edges_from(scores)
    pr = nx.pagerank(g)

    return sorted(((i, pr[i], s) for i, s in enumerate(sentences) if i in pr),
                  key=lambda x: pr[x[0]], reverse=True)

def extract(text, n=3):
    tr = textrank(text)
    top_n = sorted(tr[:n])
    return ' '.join(x[2] for x in top_n)

In [4]:
summarized_text = [extract(text) for text in df]


In [5]:
with open('outdataset_43428_1.txt', 'w', encoding='utf-8') as f:
    json.dump(summarized_text, f, ensure_ascii=False, indent=4)

In [7]:
from rouge import Rouge 

rouge = Rouge()

for i in range(len(summarized_text)):
    if len(summarized_text[i]) <= 0 or len(df[i]) <= 0:
        continue
    scores = rouge.get_scores(df[i][:300], summarized_text[i])
    print("Scores ", i, ": ", scores, "\n ____________________________")

Scores  0 :  [{'rouge-1': {'f': 0.4385964864496768, 'p': 0.5555555555555556, 'r': 0.36231884057971014}, 'rouge-2': {'f': 0.3749999952295919, 'p': 0.4772727272727273, 'r': 0.3088235294117647}, 'rouge-l': {'f': 0.4554455396647388, 'p': 0.5348837209302325, 'r': 0.39655172413793105}}] 
 ____________________________
Scores  1 :  [{'rouge-1': {'f': 0.17821781680619558, 'p': 0.19148936170212766, 'r': 0.16666666666666666}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.11627906477014624, 'p': 0.11904761904761904, 'r': 0.11363636363636363}}] 
 ____________________________
Scores  2 :  [{'rouge-1': {'f': 0.4144144098563428, 'p': 0.5897435897435898, 'r': 0.3194444444444444}, 'rouge-2': {'f': 0.34862384866930396, 'p': 0.5, 'r': 0.2676056338028169}, 'rouge-l': {'f': 0.455445539860798, 'p': 0.6052631578947368, 'r': 0.36507936507936506}}] 
 ____________________________
Scores  3 :  [{'rouge-1': {'f': 0.4819277058615183, 'p': 0.5128205128205128, 'r': 0.45454545454545453}, 'rouge-2': {'f

Scores  48 :  [{'rouge-1': {'f': 0.5636363591719008, 'p': 0.8378378378378378, 'r': 0.4246575342465753}, 'rouge-2': {'f': 0.5185185140740741, 'p': 0.7777777777777778, 'r': 0.3888888888888889}, 'rouge-l': {'f': 0.5934065887839632, 'p': 0.8181818181818182, 'r': 0.46551724137931033}}] 
 ____________________________
Scores  49 :  [{'rouge-1': {'f': 0.5109489006489425, 'p': 0.7608695652173914, 'r': 0.38461538461538464}, 'rouge-2': {'f': 0.44444444000000005, 'p': 0.6666666666666666, 'r': 0.3333333333333333}, 'rouge-l': {'f': 0.5048543643284005, 'p': 0.7027027027027027, 'r': 0.3939393939393939}}] 
 ____________________________
Scores  50 :  [{'rouge-1': {'f': 0.5523809473922903, 'p': 0.5272727272727272, 'r': 0.58}, 'rouge-2': {'f': 0.4660194124875106, 'p': 0.4444444444444444, 'r': 0.4897959183673469}, 'rouge-l': {'f': 0.5494505444511534, 'p': 0.5434782608695652, 'r': 0.5555555555555556}}] 
 ____________________________
Scores  51 :  [{'rouge-1': {'f': 0.23529411285232188, 'p': 0.29508196721311

Scores  81 :  [{'rouge-1': {'f': 0.22535210807280312, 'p': 0.3137254901960784, 'r': 0.17582417582417584}, 'rouge-2': {'f': 0.057142852551020785, 'p': 0.08, 'r': 0.044444444444444446}, 'rouge-l': {'f': 0.19469026069386808, 'p': 0.24444444444444444, 'r': 0.16176470588235295}}] 
 ____________________________
Scores  82 :  [{'rouge-1': {'f': 0.1984732776551484, 'p': 0.25, 'r': 0.16455696202531644}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.11881187629840231, 'p': 0.13953488372093023, 'r': 0.10344827586206896}}] 
 ____________________________
Scores  83 :  [{'rouge-1': {'f': 0.0733944907196367, 'p': 0.0975609756097561, 'r': 0.058823529411764705}, 'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0}, 'rouge-l': {'f': 0.06521738649574706, 'p': 0.08108108108108109, 'r': 0.05454545454545454}}] 
 ____________________________
Scores  85 :  [{'rouge-1': {'f': 0.15999999503200013, 'p': 0.17391304347826086, 'r': 0.14814814814814814}, 'rouge-2': {'f': 0.06122448482923822, 'p': 0.066666666666

Scores  132 :  [{'rouge-1': {'f': 0.12389380055446807, 'p': 0.1590909090909091, 'r': 0.10144927536231885}, 'rouge-2': {'f': 0.036036031289668674, 'p': 0.046511627906976744, 'r': 0.029411764705882353}, 'rouge-l': {'f': 0.09302325098702027, 'p': 0.11428571428571428, 'r': 0.0784313725490196}}] 
 ____________________________
Scores  133 :  [{'rouge-1': {'f': 0.46478872791906384, 'p': 0.6875, 'r': 0.35106382978723405}, 'rouge-2': {'f': 0.37142856696836735, 'p': 0.5531914893617021, 'r': 0.27956989247311825}, 'rouge-l': {'f': 0.48275861602407855, 'p': 0.6511627906976745, 'r': 0.3835616438356164}}] 
 ____________________________
Scores  134 :  [{'rouge-1': {'f': 0.2206896511771701, 'p': 0.4, 'r': 0.1523809523809524}, 'rouge-2': {'f': 0.12587412190718386, 'p': 0.23076923076923078, 'r': 0.08653846153846154}, 'rouge-l': {'f': 0.23529411307958487, 'p': 0.3333333333333333, 'r': 0.18181818181818182}}] 
 ____________________________
Scores  135 :  [{'rouge-1': {'f': 0.1889763732506666, 'p': 0.25, 'r'

Scores  176 :  [{'rouge-1': {'f': 0.5765765719665612, 'p': 0.8, 'r': 0.4507042253521127}, 'rouge-2': {'f': 0.5137614632943356, 'p': 0.717948717948718, 'r': 0.4}, 'rouge-l': {'f': 0.5999999952987655, 'p': 0.7941176470588235, 'r': 0.48214285714285715}}] 
 ____________________________
Scores  177 :  [{'rouge-1': {'f': 0.799999995162, 'p': 0.975609756097561, 'r': 0.6779661016949152}, 'rouge-2': {'f': 0.7959183625156185, 'p': 0.975, 'r': 0.6724137931034483}, 'rouge-l': {'f': 0.8157894688088644, 'p': 0.96875, 'r': 0.7045454545454546}}] 
 ____________________________
Scores  178 :  [{'rouge-1': {'f': 0.2999999950020001, 'p': 0.30612244897959184, 'r': 0.29411764705882354}, 'rouge-2': {'f': 0.10204081132861334, 'p': 0.10416666666666667, 'r': 0.1}, 'rouge-l': {'f': 0.3043478210893195, 'p': 0.3111111111111111, 'r': 0.2978723404255319}}] 
 ____________________________
Scores  179 :  [{'rouge-1': {'f': 0.10389609998903708, 'p': 0.1951219512195122, 'r': 0.07079646017699115}, 'rouge-2': {'f': 0.0, 'p