In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import matplotlib.pyplot as plt
from numpy import dot
from numpy.linalg import norm
import itertools
from nltk.tokenize import sent_tokenize
import os
import nltk

nltk.download('punkt')

In [2]:
class SentenceTokenizer(object):
    """Thin wrapper that delegates sentence splitting to NLTK's ``sent_tokenize``."""

    def text2sentences(self, text):
        """Split ``text`` into sentences.

        Args:
            text: The text to split; passed unchanged to ``sent_tokenize``.

        Returns:
            Whatever ``sent_tokenize(text)`` returns (the sentences of ``text``).
        """
        return sent_tokenize(text)

In [3]:
class GraphMatrix(object):
    """Turns a collection of sentences into a dense TF-IDF matrix."""

    def __init__(self):
        # Both vectorizers are created eagerly; build_mat only uses ``tfidf``.
        self.cnt_vec = CountVectorizer()
        self.tfidf = TfidfVectorizer()

    def build_mat(self, sentence):
        """Fit the TF-IDF vectorizer on ``sentence`` and return the dense result.

        Args:
            sentence: The sentences to vectorize; handed to
                ``TfidfVectorizer.fit_transform``.

        Returns:
            The fitted TF-IDF matrix converted to a dense array via ``toarray()``.
        """
        weights = self.tfidf.fit_transform(sentence)
        return weights.toarray()

In [4]:
class TextRank(object):
    """Extractive summarizer: PageRank over a sentence graph weighted by cosine similarity."""

    def summarize(self, article, num_sentences=15, draw=True):
        """Return the highest-ranked sentences of ``article`` in their original order.

        Sentences become graph nodes; every pair with a non-zero cosine similarity
        between their rows of the ``GraphMatrix`` matrix is joined by an edge weighted
        with that similarity. PageRank scores the nodes and the best-scoring sentences
        are kept.

        Args:
            article: Text to summarize.
            num_sentences: Maximum number of sentences to return (default 15).
                The previous hard-coded slice ``[:-16]`` kept 16 sentences although
                15 were documented; this parameter fixes that off-by-one.
            draw: When true (the default, matching the earlier behaviour), draw the
                sentence graph with matplotlib. Pass ``False`` for batch runs.

        Returns:
            A list of sentence strings, or the integer ``0`` when the article has
            three or fewer sentences (kept as-is for backward compatibility; callers
            must not pass that value to ``str.join``).

        Raises:
            ValueError: If ``num_sentences`` is negative.
        """
        if num_sentences < 0:
            raise ValueError("num_sentences must be non-negative")

        n_text = SentenceTokenizer().text2sentences(article)

        # Too few sentences to rank; the 0 sentinel is the historical contract.
        if len(n_text) <= 3:
            return 0

        mat = GraphMatrix().build_mat(n_text)

        # Row norms do not change per pair, so compute them once up front.
        norms = [norm(row) for row in mat]

        graph = nx.Graph()
        graph.add_nodes_from(range(len(n_text)))

        for n1, n2 in itertools.combinations(range(len(n_text)), 2):
            div = norms[n1] * norms[n2]
            if div == 0:
                # An all-zero row has no defined cosine similarity: no edge.
                continue
            sim = dot(mat[n1], mat[n2]) / div
            if sim != 0:
                graph.add_edge(n1, n2, weight=sim)

        if draw:
            nx.draw_networkx(graph)
            plt.show()

        pagerank = nx.pagerank(graph, weight='weight')

        # Ascending by score; the stable sort keeps the original tie-breaking.
        ranked = sorted(pagerank, key=pagerank.get)

        # ``ranked[-0:]`` would select everything, so handle zero explicitly.
        # A set makes the membership test below O(1) per sentence.
        keep = set(ranked[-num_sentences:]) if num_sentences > 0 else set()

        # Emit the selected sentences in document order, not rank order.
        return [n_text[idx] for idx in range(len(n_text)) if idx in keep]

In [None]:
# -*- encoding: utf-8 -*-

tr = TextRank()

# The source file names are irregular, so list the folder instead of building names by hand.
path = './data'  # folder that contains the .story files
file_list = os.listdir(path)
# Keep only the .story files. os.listdir() returns entries in arbitrary order, so sort
# them to make the test/eval/train assignment reproducible from run to run.
filelist = sorted(fi for fi in file_list if fi.endswith(".story"))

# Stories with more sentences than this are replaced by their extractive summary.
MAX_SENTENCES = 15


def _split_name(position, total):
    """Return 'test', 'eval' or 'train' for the file at ``position`` out of ``total`` files.

    The first ``total // 10`` files go to test, the following ones up to
    ``2 * total // 10`` go to eval, and everything else goes to train.
    """
    if position < total // 10:
        return 'test'
    if position < 2 * total // 10:
        return 'eval'
    return 'train'


def _story_line(story):
    """Return the one-line story text (without the trailing newline) for ``story``."""
    sentences = sent_tokenize(story)  # split the body into a list of sentences
    oritext = " ".join(sentences)
    if len(sentences) <= MAX_SENTENCES:
        return oritext
    ext = tr.summarize(oritext)
    # summarize() returns 0 rather than a list for very short inputs; fall back to
    # the full text instead of crashing inside str.join().
    return " ".join(ext) if ext else oritext


def _summary_line(highlights):
    """Return the highlights joined into one line (without the trailing newline)."""
    return ". ".join(h.replace("\n\n", "") for h in highlights) + "."


# NOTE: outputs are opened in append mode, so re-running this cell adds the same
# lines again; delete the *_story.txt / *_summ.txt files before a fresh run.
for position, filen in enumerate(filelist):
    with open(os.path.join(path, filen), 'r', encoding='UTF-8') as file:
        te = file.read()

    parts = te.split('@highlight')
    text = parts[0]    # the full article body
    hight = parts[1:]  # the highlight sentences, as a list

    # Collapse paragraph breaks so each story fits on a single output line.
    story = " ".join(text.split("\n\n"))

    split = _split_name(position, len(filelist))

    # Story file: extractive summary (or the full text for short stories), one per line.
    with open(os.path.join(path, f'{split}_story.txt'), 'a', encoding='UTF-8') as filew:
        filew.write(_story_line(story) + "\n")

    # Summary file: the highlights on the matching line.
    with open(os.path.join(path, f'{split}_summ.txt'), 'a', encoding='UTF-8') as filew:
        filew.write(_summary_line(hight) + "\n")
