In [2]:
import json, os, re, networkx
from konlpy.utils import pprint
from ckonlpy.tag import Twitter
tw = Twitter()

In [3]:
class RawSentence: #문자열에 대한 처리
    def __init__(self, textIter):
        if type(textIter) == str: 
            self.textIter = textIter.split('\n')
        else: 
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield s


In [4]:
class RawSentenceReader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield s


In [5]:
class RawTagger:
    def __init__(self, textIter, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Twitter
            self.tagger = Twitter()
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else: 
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield self.tagger.pos(s)


In [6]:
class RawTaggerReader:
    def __init__(self, filepath, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Twitter
            self.tagger = Twitter()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)


In [7]:
class TextRank:
    def __init__(self, **kargs):
        self.graph = None
        self.window = kargs.get('window', 5)
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}
        self.dictNear = {}
        self.nTotal = 0
 
 
    def load(self, sentenceIter, wordFilter = None):
        def insertPair(a, b):
            if a > b: a, b = b, a
            elif a == b: return
            self.dictBiCount[a, b] = self.dictBiCount.get((a, b), 0) + 1
 
        def insertNearPair(a, b):
            self.dictNear[a, b] = self.dictNear.get((a, b), 0) + 1
 
        for sent in sentenceIter:
            for i, word in enumerate(sent):
                if wordFilter and not wordFilter(word): continue
                self.dictCount[word] = self.dictCount.get(word, 0) + 1
                self.nTotal += 1
                if i - 1 >= 0 and (not wordFilter or wordFilter(sent[i-1])): insertNearPair(sent[i-1], word)
                if i + 1 < len(sent) and (not wordFilter or wordFilter(sent[i+1])): insertNearPair(word, sent[i+1])
                for j in range(i+1, min(i+self.window+1, len(sent))):
                    if wordFilter and not wordFilter(sent[j]): continue
                    if sent[j] != word: insertPair(word, sent[j])
 
    def loadSents(self, sentenceIter, tokenizer = None):
        import math
        def similarity(a, b):
            n = len(a.intersection(b))
            return n / float(len(a) + len(b) - n) / (math.log(len(a)+1) * math.log(len(b)+1))
 
        if not tokenizer: rgxSplitter = re.compile('[\\s.,:;-?!()"\']+')
        sentSet = []
        for sent in filter(None, sentenceIter):
            if type(sent) == str:
                if tokenizer: s = set(filter(None, tokenizer(sent)))
                else: s = set(filter(None, rgxSplitter.split(sent)))
            else: s = set(sent)
            if len(s) < 2: continue
            self.dictCount[len(self.dictCount)] = sent
            sentSet.append(s)
 
        for i in range(len(self.dictCount)):
            for j in range(i+1, len(self.dictCount)):
                s = similarity(sentSet[i], sentSet[j])
                if s < self.threshold: continue
                self.dictBiCount[i, j] = s
 
    def getPMI(self, a, b):
        import math
        co = self.dictNear.get((a, b), 0)
        if not co: return None
        return math.log(float(co) * self.nTotal / self.dictCount[a] / self.dictCount[b])
 
    def getI(self, a):
        import math
        if a not in self.dictCount: return None
        return math.log(self.nTotal / self.dictCount[a])
 
    def build(self):
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight = n*self.coef + (1-self.coef))
 
    def rank(self):
        return networkx.pagerank(self.graph, weight='weight')
 
    def extract(self, ratio = 0.1):
        ranks = self.rank()
        cand = sorted(ranks, key=ranks.get, reverse=True)[:int(len(ranks) * ratio)]
        pairness = {}
        startOf = {}
        tuples = {}
        for k in cand:
            tuples[(k,)] = self.getI(k) * ranks[k]
            for l in cand:
                if k == l: continue
                pmi = self.getPMI(k, l)
                if pmi: pairness[k, l] = pmi
 
        for (k, l) in sorted(pairness, key=pairness.get, reverse=True):
            print(k[0], l[0], pairness[k, l])
            if k not in startOf: startOf[k] = (k, l)
 
        for (k, l), v in pairness.items():
            pmis = v
            rs = ranks[k] * ranks[l]
            path = (k, l)
            tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
            last = l
            while last in startOf and len(path) < 7:
                if last in path: break
                pmis += pairness[startOf[last]]
                last = startOf[last][1]
                rs *= ranks[last]
                path += (last,)
                tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
 
        used = set()
        both = {}
        for k in sorted(tuples, key=tuples.get, reverse=True):
            if used.intersection(set(k)): continue
            both[k] = tuples[k]
            for w in k: used.add(w)
 
        #for k in cand:
        #    if k not in used or True: both[k] = ranks[k] * self.getI(k)
 
        return both
 
    def summarize(self, ratio = 0.333):
        r = self.rank()
        ks = sorted(r, key=r.get, reverse=True)[:int(len(r)*ratio)]
        return ' '.join(map(lambda k:self.dictCount[k], sorted(ks)))


In [52]:
import json, os, re, sqlite3, io
import pandas as pd
from pandas import Series, DataFrame
from dateutil.parser import parse

In [31]:
con = sqlite3.connect("./data/data.db")
cursor = con.cursor()
con.text_factory = bytes
df = pd.read_sql("SELECT * FROM content", con, index_col=None)
df = df.iloc[1:-1]
corpus = df.to_dict('records')

In [33]:
def preprocessing(content):
    #content = re.sub('[a-zA-Z]', '', content)
    content = re.sub('/(<([^>])>)/ig', '', content)
    content = re.sub('\\t', '', content)
    content = re.sub('[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\")]', '', content)
    content = ' '.join(content.split())
    
    return content

In [34]:
def origin_tokenizing(content): #입력 문장 리스트를 토크나이징한다. Kkma는 pos로 분리하고, OS, NNG, NP만 빼내는 것이 좋다.
    featurelist = []
    tw = Twitter()
    for term in tw.pos(content):
        if term[1] == 'Noun':
            featurelist.append(term[0])
        elif term[1] == 'Alpha':
            featurelist.append(term[0])
        elif term[1] == 'Verb':
            featurelist.append(term[0])
    return featurelist

In [38]:
utf_corpus = []
for i, data in enumerate(corpus):
    try:
        dd = dict()
        dd['title'] = data['title'].decode('utf-8')
        dd['content'] = data['content'].decode('utf-8')
        dd['link'] = data['link'].decode('utf-8')
        dd['publish'] = data['publish'].decode('utf-8')
        dd['provider'] = data['provider'].decode('utf-8')
        dd['tokens'] = origin_tokenizing(preprocessing(dd['content']))
    except Exception as err:
        print(i, err)
    finally:
        utf_corpus.append(dd)

45 'NoneType' object has no attribute 'decode'
141 'NoneType' object has no attribute 'decode'
146 'NoneType' object has no attribute 'decode'
211 'NoneType' object has no attribute 'decode'


In [39]:
corpus[45]

{'content': b'',
 'link': b'https://www.cnet.co.kr/view/?no=20150513161050',
 'provider': b'cnet',
 'publish': None,
 'title': b''}

In [44]:
#del utf_corpus[45] #리스트 인덱스 삭제하는법
#del utf_corpus[141]
#del utf_corpus[146]
#del utf_corpus[211]

In [69]:
class Docu:
    def __init__(self, content, link, provider, publish, title, tokens):
        self.content = content
        self.link = link
        self.provider = provider
        self.publish = parse(publish)
        self.title = title
        self.tokens = tokens
    def __repr__(self):
        return repr((self.content, self.link, self.provider, self.publish, self.title, self.tokens))

In [83]:
docments = list()
for i, data in enumerate(utf_corpus):
    docments.append(Docu(data['content'], data['link'], data['provider'], data['publish'], data['title'], data['tokens']))

In [88]:
result = sorted(docments, key=lambda docu: docu.publish)

In [96]:
#정렬 확인
#for data in result:
#    print(data.publish.date())

2015-05-04
2015-05-04
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-07
2015-05-07
2015-05-07
2015-05-08
2015-05-11
2015-05-12
2015-05-12
2015-05-12
2015-05-12
2015-05-12
2015-05-13
2015-05-14
2015-05-14
2015-05-14
2015-05-14
2015-05-15
2015-05-15
2015-05-15
2015-05-15
2015-05-18
2015-05-18
2015-05-18
2015-05-18
2015-05-19
2015-05-19
2015-05-19
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-21
2015-05-21
2015-05-21
2015-05-22
2015-05-22
2015-05-22
2015-05-26
2015-05-26
2015-05-28
2015-05-28
2015-05-28
2015-05-28
2015-05-29
2015-05-29
2015-05-29
2015-05-29
2015-05-29
2015-06-01
2015-06-01
2015-06-01
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-04
2015-06-04
2015-06-04
2015-06-04
2015-06-05
2015-06-05
2015-06-05
2015-06-05
2015-06-08
2015-06-08
2015-06-08
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-10

In [137]:
tr = TextRank(window=2, coef=1, threshold = 5)
print('Load...')
stopword = set([('있', 'Verb'), ('하', 'Verb'), ('되', 'Verb'), ('없', 'Verb')])
tr.load(RawTagger(result[0].content+result[1].content+result[2].content), lambda w: w not in stopword and (w[1] in ('Noun', 'Adjective', 'Alpha', 'Number')))
print('Build...')
tr.build()


Load...
Build...


In [151]:
kw  = tr.extract()
for k in sorted(kw, key=kw.get, reverse=True):
    print("%s\t%g" % (k, kw[k]))

사파이어 글래스 5.969985515430857
무선 충전 5.44673737166631
글래스 채택 5.341376856008484
핀 포트 5.159055299214529
5 개 5.118233304694274
더 사파이어 4.871373226762748
통해 무선 4.871373226762748
9 월 4.781761068073061
C 의 4.648229675448539
충전 기술 4.599439511279106
애플 워치 4.508467733073379
올해 4 4.465908118654584
의 배터리 4.465908118654584
제품 개발 4.465908118654584
9 이 4.425086124134329
4 월 4.311757438827326
6 S 4.298854033991417
이 제품 4.242764567340374
다만 새로운 4.088613887513116
수 없다 4.060443010546419
다만 올해 3.906292330719161
워치 스트랩 3.859772315084268
아이폰 6 3.8337968286810074
S 의 3.7727609380946383
다만 기술 3.7521416508919025
9 에서 3.7319389435743835
6 핀 3.692718230421102
의 출시 3.692718230421102
S 에서 3.5496173867804286
올해 9 3.5496173867804286
6 C 3.469574679106892
올해 출시 3.2872531223129378
다만 애플 3.2557047645780117
새로운 아이폰 3.231163655661894
월 출시 3.1331024424856793
이제 애플 2.8990298206392793
S 출시 2.5941059417529924
프로세서 아이폰 2.5380164751019487
6 월 2.439955261925734
아이폰 제품 2.3556949183079943
일 아이폰 2.3556949183079943
배터리 아이폰 2.3556949183

In [150]:
kw

{(('1', 'Number'),): 0.0274257236988455,
 (('5', 'Number'), ('개', 'Noun')): 0.0398761783281097,
 (('9', 'Number'), ('월', 'Noun')): 0.0781966104170813,
 (('C', 'Alpha'),): 0.025703497962421995,
 (('S', 'Alpha'), ('에서', 'Noun')): 0.058009753202706085,
 (('가게', 'Noun'),): 0.02216083285148943,
 (('가능성', 'Noun'),): 0.02419652590727236,
 (('개발', 'Noun'),): 0.01911858359821786,
 (('것', 'Noun'),): 0.0484105775456557,
 (('고소하', 'Adjective'),): 0.023021688921832036,
 (('그', 'Noun'),): 0.022599996581969675,
 (('기능', 'Noun'),): 0.018034205021545256,
 (('다만', 'Noun'), ('새로운', 'Adjective')): 0.0278851634192484,
 (('다포케쉑', 'Noun'),): 0.026172999445755928,
 (('더', 'Noun'),): 0.026360710288780204,
 (('때문', 'Noun'),): 0.018659753416453734,
 (('루머', 'Noun'),): 0.024651200485615293,
 (('배터리', 'Noun'),): 0.02100180042463482,
 (('사파이어', 'Noun'), ('글래스', 'Noun')): 0.05246149026829661,
 (('수', 'Noun'), ('없다', 'Adjective')): 0.04848791073835928,
 (('스트랩', 'Noun'),): 0.033344885908699225,
 (('시간', 'Noun'),): 0.