In [12]:
#https://bab2min.tistory.com/552
import networkx
import re
 
class RawSentence:
    def __init__(self, textIter):
        if type(textIter) == str: self.textIter = textIter.split('\n')
        else: self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield s
 
class RawSentenceReader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: continue
                yield s
 
class RawTagger:
    def __init__(self, textIter, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Komoran
            self.tagger = Komoran()
        if type(textIter) == str: self.textIter = textIter.split('\n')
        else: self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
 
class RawTaggerReader:
    def __init__(self, filepath, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Komoran
            self.tagger = Komoran()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)
 
class TextRank:
    def __init__(self, **kargs):
        self.graph = None
        self.window = kargs.get('window', 5)
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}
        self.dictNear = {}
        self.nTotal = 0
 
 
    def load(self, sentenceIter, wordFilter = None):
        def insertPair(a, b):
            if a > b: a, b = b, a
            elif a == b: return
            self.dictBiCount[a, b] = self.dictBiCount.get((a, b), 0) + 1
 
        def insertNearPair(a, b):
            self.dictNear[a, b] = self.dictNear.get((a, b), 0) + 1
 
        for sent in sentenceIter:
            for i, word in enumerate(sent):
                if wordFilter and not wordFilter(word): continue
                self.dictCount[word] = self.dictCount.get(word, 0) + 1
                self.nTotal += 1
                if i - 1 >= 0 and (not wordFilter or wordFilter(sent[i-1])): insertNearPair(sent[i-1], word)
                if i + 1 < len(sent) and (not wordFilter or wordFilter(sent[i+1])): insertNearPair(word, sent[i+1])
                for j in range(i+1, min(i+self.window+1, len(sent))):
                    if wordFilter and not wordFilter(sent[j]): continue
                    if sent[j] != word: insertPair(word, sent[j])
 
    def loadSents(self, sentenceIter, tokenizer = None):
        import math
        def similarity(a, b):
            n = len(a.intersection(b))
            return n / float(len(a) + len(b) - n) / (math.log(len(a)+1) * math.log(len(b)+1))
 
        if not tokenizer: rgxSplitter = re.compile('[\\s.,:;-?!()"\']+')
        sentSet = []
        for sent in filter(None, sentenceIter):
            if type(sent) == str:
                if tokenizer: s = set(filter(None, tokenizer(sent)))
                else: s = set(filter(None, rgxSplitter.split(sent)))
            else: s = set(sent)
            if len(s) < 2: continue
            self.dictCount[len(self.dictCount)] = sent
            sentSet.append(s)
 
        for i in range(len(self.dictCount)):
            for j in range(i+1, len(self.dictCount)):
                s = similarity(sentSet[i], sentSet[j])
                if s < self.threshold: continue
                self.dictBiCount[i, j] = s
 
    def getPMI(self, a, b):
        import math
        co = self.dictNear.get((a, b), 0)
        if not co: return None
        return math.log(float(co) * self.nTotal / self.dictCount[a] / self.dictCount[b])
 
    def getI(self, a):
        import math
        if a not in self.dictCount: return None
        return math.log(self.nTotal / self.dictCount[a])
 
    def build(self):
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight=n*self.coef + (1-self.coef))
 
    def rank(self):
        return networkx.pagerank(self.graph, weight='weight')
 
    def extract(self, ratio = 0.1):
        ranks = self.rank()
        cand = sorted(ranks, key=ranks.get, reverse=True)[:int(len(ranks) * ratio)]
        pairness = {}
        startOf = {}
        tuples = {}
        for k in cand:
            tuples[(k,)] = self.getI(k) * ranks[k]
            for l in cand:
                if k == l: continue
                pmi = self.getPMI(k, l)
                if pmi: pairness[k, l] = pmi
 
        for (k, l) in sorted(pairness, key=pairness.get, reverse=True):
            #print(k[0], l[0], pairness[k, l])
            if k not in startOf: startOf[k] = (k, l)
 
        for (k, l), v in pairness.items():
            pmis = v
            rs = ranks[k] * ranks[l]
            path = (k, l)
            tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
            last = l
            while last in startOf and len(path) < 7:
                if last in path: break
                pmis += pairness[startOf[last]]
                last = startOf[last][1]
                rs *= ranks[last]
                path += (last,)
                tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
 
        used = set()
        both = {}
        for k in sorted(tuples, key=tuples.get, reverse=True):
            if used.intersection(set(k)): continue
            both[k] = tuples[k]
            for w in k: used.add(w)
 
        #for k in cand:
        #    if k not in used or True: both[k] = ranks[k] * self.getI(k)
 
        return both
 
    def summarize(self, ratio = 0.333):
        r = self.rank()
        ks = sorted(r, key=r.get, reverse=True)[:int(len(r)*ratio)]
        return ' '.join(map(lambda k:self.dictCount[k], sorted(ks)))

In [11]:
tr = TextRank(window=5, coef=1)
print('Load...')
stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV'), ('없', 'VV') ])
tr.load(RawTaggerReader('닥터최태수.txt'), lambda w: w not in stopword and (w[1] in ('NNG', 'NNP', 'VV', 'VA')))
print('Build...')
tr.build()
kw = tr.extract(0.1)
title="닥터최태수"
title_nonspace=title.replace(' ','')
count=0
for k in sorted(kw, key=kw.get, reverse=True):
    temp=("%s%g" % (k, kw[k])).split('0')[0]
    if 'VV'in temp:
        continue
    if 'VA'in temp:
        continue
        
    hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    result = hangul.sub('', temp).replace('  ',' ') # 한글과 띄어쓰기를 제외한 모든 부분을 제거
    result_nonspace=result.replace(' ','')
    if len(result)<=2:
        continue
    if result in title:
        continue
    if result_nonspace in title:
        continue
    if result in title_nonspace:
        continue
    if result_nonspace in title_nonspace:
        continue
    if '리뷰' in result:
        continue
    if '소설' in result:
        continue
    if '소개' in result:
        continue
    print (result)
    count+=1
    if(count==5):
        break
    #hangul = re.compile('[^ ㄱ-ㅣ가-힣]+')
    #result = hangul.sub('', temp) # 한글과 띄어쓰기를 제외한 모든 부분을 제거
    #print (result)

    #
    #
    #print("%s%g" % (k, kw[k]))
    #print("%s\t%g" % (k, kw[k]))
    #% (k, kw[k])

Load...
Build...
응급 의료 
동성 의료원 
의사 
한국 병원 
다음 날 


In [14]:
tr = TextRank(window=5, coef=1)
print('Load...')
stopword = set([('있', 'VV'), ('하', 'VV'), ('되', 'VV'), ('없', 'VV') ])
tr.load(RawTaggerReader('닥터최태수.txt'), lambda w: w not in stopword and (w[1] in ('NNG', 'NNP')))
print('Build...')
tr.build()
kw = tr.extract(0.1)
for k in sorted(kw, key=kw.get, reverse=True):
    print("%s\t%g" % (k, kw[k]))

Load...
Build...
(('닥터', 'NNP'), ('최태수', 'NNP'))	0.140619
(('태수', 'NNP'),)	0.0827759
(('응급', 'NNP'), ('의료', 'NNP'))	0.0812925
(('의학', 'NNP'), ('소설', 'NNG'))	0.0750026
(('동성', 'NNG'), ('의료원', 'NNG'))	0.058908
(('한국', 'NNP'), ('의사', 'NNG'))	0.0462055
(('다음', 'NNG'), ('날', 'NNG'))	0.044866
(('석호', 'NNP'), ('작가', 'NNG'))	0.0410713
(('소개', 'NNG'), ('글', 'NNG'))	0.0399076
(('인턴', 'NNP'), ('레지던트', 'NNP'))	0.0395852
(('이', 'NNP'), ('세계', 'NNG'))	0.03857
(('간호사', 'NNP'), ('카슈미르', 'NNP'))	0.0374284
(('곡리', 'NNP'), ('사람', 'NNG'))	0.0360313
(('해외', 'NNG'),)	0.0347588
(('지식', 'NNG'),)	0.0342711
(('하차', 'NNG'), ('작품', 'NNG'))	0.0340872
(('병원', 'NNG'),)	0.0314249
(('제임스', 'NNP'),)	0.0312016
(('때', 'NNG'), ('카프레', 'NNP'))	0.0311674
(('인물', 'NNG'),)	0.0287055
(('환자', 'NNG'),)	0.0269754
(('반복', 'NNG'),)	0.0262148
(('책', 'NNG'),)	0.025769
(('주인공', 'NNG'),)	0.0252469
(('후', 'NNG'),)	0.0249263
(('게', 'NNG'),)	0.0232284
(('이야기', 'NNG'),)	0.0230664
(('권위', 'NNG'),)	0.0230172
(('웹', 'NNG'),)	0.0223112
(('상황',