In [2]:
import json, os, re, networkx
from konlpy.utils import pprint
from ckonlpy.tag import Twitter
tw = Twitter()

In [3]:
class RawSentence: #문자열에 대한 처리
    def __init__(self, textIter):
        if type(textIter) == str: 
            self.textIter = textIter.split('\n')
        else: 
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield s


In [4]:
class RawSentenceReader:
    def __init__(self, filepath):
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a, b: a + b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield s


In [5]:
class RawTagger:
    def __init__(self, textIter, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Twitter
            self.tagger = Twitter()
        if type(textIter) == str:
            self.textIter = textIter.split('\n')
        else: 
            self.textIter = textIter
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in self.textIter:
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: 
                    continue
                yield self.tagger.pos(s)


In [6]:
class RawTaggerReader:
    def __init__(self, filepath, tagger = None):
        if tagger:
            self.tagger = tagger
        else :
            from konlpy.tag import Twitter
            self.tagger = Twitter()
        self.filepath = filepath
        self.rgxSplitter = re.compile('([.!?:](?:["\']|(?![0-9])))')
 
    def __iter__(self):
        for line in open(self.filepath, encoding='utf-8'):
            ch = self.rgxSplitter.split(line)
            for s in map(lambda a,b:a+b, ch[::2], ch[1::2]):
                if not s: continue
                yield self.tagger.pos(s)


In [306]:
class TextRank:
    def __init__(self, **kargs):
        self.graph = None
        self.window = kargs.get('window', 5)
        self.coef = kargs.get('coef', 1.0)
        self.threshold = kargs.get('threshold', 0.005)
        self.dictCount = {}
        self.dictBiCount = {}
        self.dictNear = {}
        self.nTotal = 0
 
 
    def load(self, sentenceIter, wordFilter = None):
        def insertPair(a, b):
            if a > b: a, b = b, a
            elif a == b: return
            self.dictBiCount[a, b] = self.dictBiCount.get((a, b), 0) + 1
 
        def insertNearPair(a, b):
            self.dictNear[a, b] = self.dictNear.get((a, b), 0) + 1
 
        for sent in sentenceIter:
            for i, word in enumerate(sent):
                if wordFilter and not wordFilter(word): continue
                self.dictCount[word] = self.dictCount.get(word, 0) + 1
                self.nTotal += 1
                if i - 1 >= 0 and (not wordFilter or wordFilter(sent[i-1])): insertNearPair(sent[i-1], word)
                if i + 1 < len(sent) and (not wordFilter or wordFilter(sent[i+1])): insertNearPair(word, sent[i+1])
                for j in range(i+1, min(i+self.window+1, len(sent))):
                    if wordFilter and not wordFilter(sent[j]): continue
                    if sent[j] != word: insertPair(word, sent[j])
 
    def loadSents(self, sentenceIter, tokenizer = None):
        import math
        def similarity(a, b):
            n = len(a.intersection(b))
            return n / float(len(a) + len(b) - n) / (math.log(len(a)+1) * math.log(len(b)+1))
 
        if not tokenizer: rgxSplitter = re.compile('[\\s.,:;-?!()"\']+')
        sentSet = []
        for sent in filter(None, sentenceIter):
            if type(sent) == str:
                if tokenizer: s = set(filter(None, tokenizer(sent)))
                else: s = set(filter(None, rgxSplitter.split(sent)))
            else: s = set(sent)
            if len(s) < 2: continue
            self.dictCount[len(self.dictCount)] = sent
            sentSet.append(s)
 
        for i in range(len(self.dictCount)):
            for j in range(i+1, len(self.dictCount)):
                s = similarity(sentSet[i], sentSet[j])
                if s < self.threshold: continue
                self.dictBiCount[i, j] = s
 
    def getPMI(self, a, b):
        import math
        co = self.dictNear.get((a, b), 0)
        if not co: return None
        return math.log(float(co) * self.nTotal / self.dictCount[a] / self.dictCount[b])
 
    def getI(self, a):
        import math
        if a not in self.dictCount: return None
        return math.log(self.nTotal / self.dictCount[a])
 
    def build(self):
        self.graph = networkx.Graph()
        self.graph.add_nodes_from(self.dictCount.keys())
        for (a, b), n in self.dictBiCount.items():
            self.graph.add_edge(a, b, weight = n*self.coef + (1-self.coef))
 
    def rank(self):
        return networkx.pagerank(self.graph, weight='weight')
 
    def extract(self, ratio = 0.1):
        ranks = self.rank()
        cand = sorted(ranks, key=ranks.get, reverse=True)[:int(len(ranks) * ratio)]
        pairness = {}
        startOf = {}
        tuples = {}
        complex_pmi = []
        for k in cand:
            tuples[(k,)] = self.getI(k) * ranks[k]
            for l in cand:
                if k == l: continue
                pmi = self.getPMI(k, l)
                if pmi: pairness[k, l] = pmi
 
        for (k, l) in sorted(pairness, key=pairness.get, reverse=True):
            temp = (k[0]+l[0], pairness[k, l])
            complex_pmi.append(temp)
            if k not in startOf: startOf[k] = (k, l)
        return complex_pmi

        '''for (k, l), v in pairness.items():
            pmis = v
            rs = ranks[k] * ranks[l]
            path = (k, l)
            tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
            last = l
            while last in startOf and len(path) < 7:
                if last in path: break
                pmis += pairness[startOf[last]]
                last = startOf[last][1]
                rs *= ranks[last]
                path += (last,)
                tuples[path] = pmis / (len(path) - 1) * rs ** (1 / len(path)) * len(path)
 
        used = set()
        both = {}
        for k in sorted(tuples, key=tuples.get, reverse=True):
            if used.intersection(set(k)): continue
            both[k] = tuples[k]
            for w in k: used.add(w)
 
        #for k in cand:
        #    if k not in used or True: both[k] = ranks[k] * self.getI(k)
 
        return both'''
 
    def summarize(self, ratio = 0.333):
        r = self.rank()
        ks = sorted(r, key=r.get, reverse=True)[:int(len(r)*ratio)]
        return ' '.join(map(lambda k:self.dictCount[k], sorted(ks)))


In [52]:
import json, os, re, sqlite3, io
import pandas as pd
from pandas import Series, DataFrame
from dateutil.parser import parse

In [31]:
con = sqlite3.connect("./data/data.db")
cursor = con.cursor()
con.text_factory = bytes
df = pd.read_sql("SELECT * FROM content", con, index_col=None)
df = df.iloc[1:-1]
corpus = df.to_dict('records')

In [33]:
def preprocessing(content):
    #content = re.sub('[a-zA-Z]', '', content)
    content = re.sub('/(<([^>])>)/ig', '', content)
    content = re.sub('\\t', '', content)
    content = re.sub('[\{\}\[\]\/?.,;:|\)*~`!^\-_+<>@\#$%&\\\=\(\'\")]', '', content)
    content = ' '.join(content.split())
    
    return content

In [34]:
def origin_tokenizing(content): #입력 문장 리스트를 토크나이징한다. Kkma는 pos로 분리하고, OS, NNG, NP만 빼내는 것이 좋다.
    featurelist = []
    tw = Twitter()
    for term in tw.pos(content):
        if term[1] == 'Noun':
            featurelist.append(term[0])
        elif term[1] == 'Alpha':
            featurelist.append(term[0])
        elif term[1] == 'Verb':
            featurelist.append(term[0])
    return featurelist

In [38]:
utf_corpus = []
for i, data in enumerate(corpus):
    try:
        dd = dict()
        dd['title'] = data['title'].decode('utf-8')
        dd['content'] = data['content'].decode('utf-8')
        dd['link'] = data['link'].decode('utf-8')
        dd['publish'] = data['publish'].decode('utf-8')
        dd['provider'] = data['provider'].decode('utf-8')
        dd['tokens'] = origin_tokenizing(preprocessing(dd['content']))
    except Exception as err:
        print(i, err)
    finally:
        utf_corpus.append(dd)

45 'NoneType' object has no attribute 'decode'
141 'NoneType' object has no attribute 'decode'
146 'NoneType' object has no attribute 'decode'
211 'NoneType' object has no attribute 'decode'


In [39]:
corpus[45]

{'content': b'',
 'link': b'https://www.cnet.co.kr/view/?no=20150513161050',
 'provider': b'cnet',
 'publish': None,
 'title': b''}

In [44]:
#del utf_corpus[45] #리스트 인덱스 삭제하는법
#del utf_corpus[141]
#del utf_corpus[146]
#del utf_corpus[211]

In [69]:
class Docu:
    def __init__(self, content, link, provider, publish, title, tokens):
        self.content = content
        self.link = link
        self.provider = provider
        self.publish = parse(publish)
        self.title = title
        self.tokens = tokens
    def __repr__(self):
        return repr((self.content, self.link, self.provider, self.publish, self.title, self.tokens))

In [83]:
docments = list()
for i, data in enumerate(utf_corpus):
    docments.append(Docu(data['content'], data['link'], data['provider'], data['publish'], data['title'], data['tokens']))

In [88]:
result = sorted(docments, key=lambda docu: docu.publish)

In [96]:
#정렬 확인
#for data in result:
#    print(data.publish.date())

2015-05-04
2015-05-04
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-06
2015-05-07
2015-05-07
2015-05-07
2015-05-08
2015-05-11
2015-05-12
2015-05-12
2015-05-12
2015-05-12
2015-05-12
2015-05-13
2015-05-14
2015-05-14
2015-05-14
2015-05-14
2015-05-15
2015-05-15
2015-05-15
2015-05-15
2015-05-18
2015-05-18
2015-05-18
2015-05-18
2015-05-19
2015-05-19
2015-05-19
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-20
2015-05-21
2015-05-21
2015-05-21
2015-05-22
2015-05-22
2015-05-22
2015-05-26
2015-05-26
2015-05-28
2015-05-28
2015-05-28
2015-05-28
2015-05-29
2015-05-29
2015-05-29
2015-05-29
2015-05-29
2015-06-01
2015-06-01
2015-06-01
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-02
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-03
2015-06-04
2015-06-04
2015-06-04
2015-06-04
2015-06-05
2015-06-05
2015-06-05
2015-06-05
2015-06-08
2015-06-08
2015-06-08
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-09
2015-06-10

In [249]:
predata = result[0]
datecorpus = {}
datecorpus[str(predata.publish.date())] = {'string' : predata.content + "\n"}

for i in range(1, len(result)):
    if result[i].publish.date() == predata.publish.date():
        try:
            datecorpus[str(result[i].publish.date())]['string'] += str(result[i].content + "\n")
            #print(result[i].publish.date())
        except KeyError as e:
            datecorpus[str(result[i].publish.date())] = {'string' : str(result[i].content + "\n")}
            #print(e)
    predata = result[i]
    

In [310]:
stopword = set([('있', 'Verb'), ('하', 'Verb'), ('되', 'Verb'), ('없', 'Verb')])
tr = TextRank(window=1, coef=1, threshold = 5)
for date in list(datecorpus.keys()):
    %time tr.load(RawTagger(datecorpus[date]['string']), lambda w: w not in stopword and (w[1] in ('Noun', 'Adjective', 'Alpha', 'Number')))
    tr.build()
    datecorpus[date]['pmi'] = tr.extract(0.4)

Wall time: 58 ms
Wall time: 72.4 ms
Wall time: 31.2 ms
Wall time: 74.9 ms
Wall time: 48.1 ms
Wall time: 45.6 ms
Wall time: 70.4 ms
Wall time: 25.3 ms
Wall time: 77.4 ms
Wall time: 19.3 ms
Wall time: 26.8 ms
Wall time: 11.9 ms
Wall time: 33.2 ms
Wall time: 80.3 ms
Wall time: 44.7 ms
Wall time: 117 ms
Wall time: 67 ms
Wall time: 50.1 ms
Wall time: 25.3 ms
Wall time: 54.6 ms
Wall time: 221 ms
Wall time: 33.7 ms
Wall time: 128 ms
Wall time: 45.7 ms
Wall time: 113 ms
Wall time: 38.2 ms
Wall time: 66 ms
Wall time: 20.3 ms
Wall time: 64.5 ms
Wall time: 20.4 ms
Wall time: 35.2 ms
Wall time: 109 ms
Wall time: 29.7 ms
Wall time: 83.3 ms
Wall time: 97.7 ms
Wall time: 9.41 ms
Wall time: 47.6 ms
Wall time: 41.6 ms
Wall time: 53.1 ms
Wall time: 32.7 ms
Wall time: 54.1 ms
Wall time: 50.1 ms
Wall time: 66 ms
Wall time: 166 ms
Wall time: 25.8 ms
Wall time: 75.9 ms
Wall time: 55.1 ms
Wall time: 64.5 ms
Wall time: 77.9 ms
Wall time: 129 ms
Wall time: 92.3 ms
Wall time: 39.2 ms
Wall time: 89.8 ms
Wall tim

In [337]:
#다시 짜자 좀 쉬고...
a = list(datecorpus.keys())
predate = datecorpus[a[0]]
for i in range(1, len(a)):
    for c_word in datecorpus[a[i]]['pmi']:
        print(type(c_word))
        for pre_c_word in predate['pmi']:
            if str(pre_c_word[0]) != str(c_word[0]):
                datecorpus[a[i]]['pmi'].append(pre_c_word)
                type(pre_c_word)
    predate = datecorpus[a[i]]
        

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class 'tuple'>
<class '

TypeError: 'float' object is not subscriptable

In [338]:
datecorpus[a[i]]['pmi']

[('투파이브', 7.777373602657861),
 ('캔슬링', 7.777373602657861),
 ('Hz이하', 7.777373602657861),
 ('북부연방', 7.777373602657861),
 ('연방지방법원', 7.777373602657861),
 ('지방법원배심', 7.777373602657861),
 ('쿡CEO', 7.777373602657861),
 ('가구및', 7.777373602657861),
 ('용데이터', 7.777373602657861),
 ('mm단위', 7.777373602657861),
 ('리더십아래', 7.777373602657861),
 ('및주방', 7.777373602657861),
 ('즈역시', 7.777373602657861),
 ('데이터송수', 7.777373602657861),
 ('새로운카테고리', 7.777373602657861),
 ('아직출시일', 7.777373602657861),
 ('생참치', 7.777373602657861),
 ('이면든든한', 7.777373602657861),
 ('끼식사', 7.777373602657861),
 ('무조건포장', 7.777373602657861),
 ('곧수백', 7.777373602657861),
 ('입소문', 7.777373602657861),
 ('빅아일랜드', 7.777373602657861),
 ('불만글', 7.777373602657861),
 ('다른레스토랑', 7.777373602657861),
 ('자연친', 7.777373602657861),
 ('라운드프리', 7.777373602657861),
 ('각종음식', 7.777373602657861),
 ('조명도마련', 7.777373602657861),
 ('보증기간', 7.777373602657861),
 ('정확히측정', 7.777373602657861),
 ('대한설명', 7.777373602657861),
 ('거나새롭', 7.777373602657861),
 (

In [325]:
datecorpus['2015-05-04']['pmi']

[('투파이브', 6.677083461247136),
 ('600명', 6.677083461247136),
 ('쿡CEO', 6.677083461247136),
 ('용데이터', 6.677083461247136),
 ('리더십아래', 6.677083461247136),
 ('따라서선', 6.677083461247136),
 ('사전주문', 6.677083461247136),
 ('모든기능', 6.677083461247136),
 ('새로운카테고리', 6.677083461247136),
 ('아직출시일', 6.677083461247136),
 ('생참치', 6.677083461247136),
 ('이면든든한', 6.677083461247136),
 ('끼식사', 6.677083461247136),
 ('무조건포장', 6.677083461247136),
 ('어떠한서비스', 6.677083461247136),
 ('곧수백', 6.677083461247136),
 ('입소문', 6.677083461247136),
 ('광고비용', 6.677083461247136),
 ('빅아일랜드', 6.677083461247136),
 ('홍보효과', 6.677083461247136),
 ('의리더십', 5.983936280687191),
 ('손님중', 5.983936280687191),
 ('등주요', 5.983936280687191),
 ('50접시', 5.983936280687191),
 ('CEO의', 5.983936280687191),
 ('리버즈', 5.983936280687191),
 ('200접시', 5.983936280687191),
 ('미국헐리우드', 5.983936280687191),
 ('사용시간', 5.983936280687191),
 ('너무비싸다', 5.983936280687191),
 ('스콘첼레스', 5.578471172579026),
 ('문앞', 5.578471172579026),
 ('나무테이블', 5.578471172579026),
 ('

In [309]:
datecorpus['2015-05-06']['pmi']

[('투파이브', 7.777373602657861),
 ('캔슬링', 7.777373602657861),
 ('Hz이하', 7.777373602657861),
 ('북부연방', 7.777373602657861),
 ('연방지방법원', 7.777373602657861),
 ('지방법원배심', 7.777373602657861),
 ('쿡CEO', 7.777373602657861),
 ('가구및', 7.777373602657861),
 ('용데이터', 7.777373602657861),
 ('mm단위', 7.777373602657861),
 ('리더십아래', 7.777373602657861),
 ('및주방', 7.777373602657861),
 ('즈역시', 7.777373602657861),
 ('데이터송수', 7.777373602657861),
 ('새로운카테고리', 7.777373602657861),
 ('아직출시일', 7.777373602657861),
 ('생참치', 7.777373602657861),
 ('이면든든한', 7.777373602657861),
 ('끼식사', 7.777373602657861),
 ('무조건포장', 7.777373602657861),
 ('곧수백', 7.777373602657861),
 ('입소문', 7.777373602657861),
 ('빅아일랜드', 7.777373602657861),
 ('불만글', 7.777373602657861),
 ('다른레스토랑', 7.777373602657861),
 ('자연친', 7.777373602657861),
 ('라운드프리', 7.777373602657861),
 ('각종음식', 7.777373602657861),
 ('조명도마련', 7.777373602657861),
 ('보증기간', 7.777373602657861),
 ('정확히측정', 7.777373602657861),
 ('대한설명', 7.777373602657861),
 ('거나새롭', 7.777373602657861),
 (

In [301]:
a = list(datecorpus.keys())
a

['2015-05-04',
 '2015-05-06',
 '2015-05-07',
 '2015-05-12',
 '2015-05-14',
 '2015-05-15',
 '2015-05-18',
 '2015-05-19',
 '2015-05-20',
 '2015-05-21',
 '2015-05-22',
 '2015-05-26',
 '2015-05-28',
 '2015-05-29',
 '2015-06-01',
 '2015-06-02',
 '2015-06-03',
 '2015-06-04',
 '2015-06-05',
 '2015-06-08',
 '2015-06-09',
 '2015-06-10',
 '2015-06-11',
 '2015-06-12',
 '2015-06-15',
 '2015-06-16',
 '2015-06-17',
 '2015-06-18',
 '2015-06-19',
 '2015-06-23',
 '2015-06-24',
 '2015-06-25',
 '2015-06-29',
 '2015-06-30',
 '2015-07-01',
 '2015-07-03',
 '2015-07-06',
 '2015-07-08',
 '2015-07-09',
 '2015-07-10',
 '2015-07-13',
 '2015-07-14',
 '2015-07-15',
 '2015-07-16',
 '2015-07-20',
 '2015-07-21',
 '2015-07-22',
 '2015-07-23',
 '2015-07-24',
 '2015-07-27',
 '2015-07-28',
 '2015-07-31',
 '2015-08-03',
 '2015-08-04',
 '2015-08-05',
 '2015-08-06',
 '2015-08-07',
 '2015-08-10',
 '2015-08-11',
 '2015-08-12',
 '2015-08-13',
 '2015-08-14']

In [None]:
import matplotlib.pyplot as plt
plt.plot(year,pop)
plt.show()