# TF-IDF
---

In [2]:
import csv
from konlpy.tag import Kkma
from math import log2, log10
import nltk
import pickle
import re
import sqlite3

In [3]:
# Module-wide Kkma analyzer; compute_tf() calls k.nouns() on it to extract nouns.
k = Kkma()

In [4]:
# English section identifiers, in the order the per-section loops iterate over them.
section_list = ['society', 'politics', 'economic', 'culture', 'digital', 'global']
# Maps each English identifier to the label used when filtering on the `section` column in SQL.
section_dict = {'society':'사회', 'politics':'정치', 'economic':'경제',
               'culture':'문화', 'digital':'IT', 'global':'세계'}

# Dates to process, most recent first; 7-element slices of this list form the weekly windows.
date_list = ['2018-08-23', '2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17',
             '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11', '2018-08-10']

## Hyperparameters
---

In [5]:
# K of the "double normalization K" term frequency: tf = K + (1 - K) * count / max_count.
k_ratio = 0.5

## 주요 변수
---

In [6]:
# Scratch containers for intermediate results; their contents are saved to the DB or to pickle files.
# Each one can be built at more than one scope; see the trailing comments.
a_nouns_tf = {}          # per article
a_noun_max_cnt = {}      # per article
inverted_idx = {}        # all articles, per period, or per section + period
unique_nouns_idf = {}    # all articles, per period, or per section + period

* 주의: 아래 예시는 현재 코드의 형태와 다름. 현재 코드에서는 맨 앞의 section key 없이 a_id(또는 명사)가 최상위 key임. 예시에서 section key를 빼야 함.


**a_nouns_tf**  
{'society': {'da_20180822201040252': {'인천': 0.1, '남동': 0.5, '남동공단': 0.8, ...}  
...  
}}


**a_noun_max_cnt**  
{'society': {'da_20180822201040252': 4, 'da_20180822201030249': 20, 'da_20180822200910228': 3, ...}  
...  
}


**inverted_idx**  
{'society': {'김영호': ['da_20180822194016575'],  
'심상치가': ['da_20180822194312638'],  
'최우수': ['da_20180822192938304', 'da_20180822191858060'], ...}  
...  
}


**unique_nouns_idf**  
{'화재': 4.66, '화재원인': 5.14, '원인': 4.36, ...}

## Create DB
---

In [6]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# One row per (article, term) pair: tf_article holds the term frequency and
# tfidf the TF-IDF score written by a later UPDATE.
# IF NOT EXISTS keeps the cell re-runnable without a blanket `except: pass`,
# so genuine failures (unwritable file, SQL typo, ...) are no longer hidden.
cur.execute("""
    CREATE TABLE IF NOT EXISTS Term(
        a_t_id      INTEGER PRIMARY KEY,
        a_id        TEXT,
        term        TEXT,
        tf_article  REAL,
        tfidf       REAL,
        FOREIGN KEY (a_id) REFERENCES Article(a_id))
""")

conn.commit()

In [7]:
# cur.execute("DROP TABLE Term")

# conn.commit()

## 불용어 제거
---

* by 한홀

In [8]:
# Stop-word tables are built once at definition time instead of on every call.
# Boilerplate that appears in news footers / copyright notices.
_NEWS_STOPWORDS = frozenset({
    "기자", "배포", "금지", "뉴스", "저작권자",
    "기사", "전재", "무단", "무단전재", "구독", "기사보기",
})

# Names of press agencies and portals.
_PRESS_STOPWORDS = frozenset({
    '연합뉴스', '뉴시스', '뉴시스통신사', '통신사',
    '이데일리', '네이버', '다음', '시스', '뉴스1', '뉴스1코리',
})

# Frequent words that carry little topical information.
_GENERIC_STOPWORDS = frozenset({
    '사진', '페이스북', '관련', '웹툰보기', '가기', '만큼',
    '최근', '재인', '올해', '시간', '판단', '추진', '우리', '반영',
    '상황', '호텔', '운영', '주요', '적극', '대상', '때문',
    '확인', '가능', '이야기', '규모', '개월', '종합', '위원회',
    '가운데', '분석', '다양', '문제', '기간', '마련', '지난해', '신청', '한편', '기준',
    '내용', '채널설정', '경우', '방안', '활용', '여러분', '기존', '최대', '스냅', '오전', '대비',
    '위원', '지난달', '이번달', '다음달', '위원장', '센터', '포함', '등에', '사진영상부',
    '구성', '수준', '기대', '공동', '안내', '활동', '첫날', '추가', '분야', '관리',
    '동안', '이용', '모습', '오늘', '논의', '입장', '업계', '내년', '블록', '체인', '실시간',
    '고객', '채널', '보기', '오후', '이번', '이날', '진행', '제공', '예정', '연합', '대표',
    '제보', '이상', '지원', '행사', '관계자', '설정', '계획', '단체', '타임', '이후', '발표',
})


def NewsStopWord(word):
    """Return True when `word` should be discarded as a stop word.

    A word is a stop word when any of the following holds:
      * it can be converted with ``int()`` (i.e. it is a number),
      * it contains the substring '회차',
      * it is exactly one character long,
      * it contains at least one digit,
      * it is listed in one of the news / press / generic stop-word tables.

    Args:
        word: the candidate noun, normally a ``str``.

    Returns:
        bool: True to drop the word, False to keep it.
    """
    try:
        int(word)
    except (TypeError, ValueError):
        # Not a plain number: fall through to the textual checks below.
        pass
    else:
        return True

    if isinstance(word, str) and '회차' in word:
        return True
    if len(word) == 1:
        # Single-character nouns are dropped.
        return True
    if re.search(r'\d+', word) is not None:
        # Anything containing a digit is dropped.
        return True

    return (word in _NEWS_STOPWORDS
            or word in _PRESS_STOPWORDS
            or word in _GENERIC_STOPWORDS)

## Extract Nouns and Compute Term Frequency (TF)
---

In [9]:
def is_number(obj):
    """Return True when `obj` can be converted with ``float()``.

    Args:
        obj: any value; typically a noun string.

    Returns:
        bool: True if ``float(obj)`` succeeds, False otherwise. Values that
        ``float()`` rejects by type (e.g. ``None`` or a list) also yield
        False instead of raising.
    """
    try:
        float(obj)
    except (TypeError, ValueError):
        return False
    return True

In [10]:
def compute_tf(a_id, content):
    """Extract the nouns of one article and store their normalized term frequencies.

    Results are written into two module-level dicts:
      * ``a_nouns_tf[a_id]``     -> {noun: tf}
      * ``a_noun_max_cnt[a_id]`` -> occurrence count of the most frequent noun

    TF uses double normalization K:
    ``tf = k_ratio + (1 - k_ratio) * count / max_count``.

    Args:
        a_id: article identifier used as the key in both dicts.
        content: article text. ``None`` or an empty string yields an empty
            TF dict and a max count of 0 instead of raising.

    Returns:
        None
    """
    a_nouns_tf[a_id] = {}
    a_noun_max_cnt[a_id] = 0

    if not content:
        return

    # Collapse every run of 2+ whitespace characters (newlines included) into
    # one space. The former follow-up substitution on r'[\n]{2,}' could never
    # match after this step, so it was dropped.
    content = re.sub(r'[\s]{2,}', ' ', content)

    nouns_cnt = {}

    # Sentence tokenization -> word tokenization -> noun extraction.
    for sentence in nltk.sent_tokenize(content):
        for word in nltk.word_tokenize(sentence.strip()):
            for noun in k.nouns(word):
                # Skip stop words.
                if NewsStopWord(noun):
                    continue
                nouns_cnt[noun] = nouns_cnt.get(noun, 0) + 1

    if not nouns_cnt:
        return

    max_cnt = max(nouns_cnt.values())
    a_noun_max_cnt[a_id] = max_cnt
    a_nouns_tf[a_id] = {
        noun: k_ratio + (1 - k_ratio) * cnt / max_cnt
        for noun, cnt in nouns_cnt.items()
    }

## Insert Nouns and TFs into Term Table
---

In [31]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Date iteration.
# There are too many articles to handle at once, so they are processed one date at a time.
# NOTE(review): re-running this cell appends the same (a_id, term) rows again;
# clear the Term table first if a clean rebuild is intended.
for date in date_list:
    print(date)
    print('--------------------------------------------------\n')

    # compute_tf() writes into these module-level dicts; start empty for each date.
    a_nouns_tf = {}
    a_noun_max_cnt = {}

    # Bound parameter instead of string formatting.
    cur.execute("SELECT a_id, content FROM Article WHERE date = ?", (date,))

    data = cur.fetchall()

    # Content iteration
    for data_idx, (a_id, content) in enumerate(data):
        if (data_idx % 5000) == 0:
            print('{0:6,} / {1:6,}'.format(data_idx, len(data)))

        compute_tf(a_id=a_id, content=content)

    # Insert data. A failing row is reported and skipped (best effort, as
    # before) instead of being silently swallowed by a bare `except`.
    for a_id, nouns_tf in a_nouns_tf.items():
        for noun, tf in nouns_tf.items():
            try:
                cur.execute("INSERT INTO Term(a_id, term, tf_article) \
                            VALUES(?,?,?)", (a_id, noun, tf))
            except sqlite3.Error as err:
                print('Insert failed for ({0}, {1}): {2}'.format(a_id, noun, err))

    # One commit per date instead of one per inserted row.
    conn.commit()

    print('\n--------------------------------------------------\n\n')

## [Debug] Extract Nouns and Compute Term Frequency (TF)
---

In [11]:
# a_nouns_tf = {}
# a_noun_max_cnt = {}

# a_id = 'da_20180823143529578'
# content = """제19호 태풍 ‘솔릭’이 제주도를 지나 북상하면서 광주 지역 모든 학교 학생들이 조기 하교 했다. 항공편은 모두 결항됐으며 무등산 입산이 통제됐다. 

#            23일 오후 광주 서구 하늘에 제19호 태풍 솔릭이 몰고온 먹구름이 가득하다.
          

# 23일 광주지방기상청에 따르면 제19호 태풍 ‘솔릭’은 이날 낮 12시 현재 제주 서귀포 서쪽 90km 부근 해상에서 시속 4km로 북진 하고 있다. 기상청은 태풍이 이날 오후 6시쯤 목포 서남쪽 80㎞ 해상을 지난뒤 오는 24일 새벽 전북 군산 인근으로 상륙할 것으로 보고 있다. 
# 솔릭이 예상보다 훨씬 느린 속도로 접근하면서 광주 시민들은 ‘조마조마’한 심정으로 피해 예방을 위해 총력을 다했다. 태풍의 영향으로 광주 지역에는 바람이 점차 강해지면서 이날 제주와 김포를 오가는 광주공항의 모든 항공편이 결항됐다. 무등산도 입산이 통제됐다. 
# 태풍이 접근하면서 광주시교육청은 전체 학교를 대상으로 ‘조기 하교’를 결정했다. 광주지역 유치원과 초·중·고는 오후 3시 이전에 조기 하교했다. 고등학교의 아간 자율학습도 금지됐다. 교육청은 학원에도 휴원을 적극 검토하고 하원 시간을 조정하도록 요청했다. 
# 광주·전남은 24일까지 100∼250㎜의 비가 내리고 해안과 지리산에는 400㎜ 넘게 내리는 곳도 있겠다. 광주·전남은 태풍의 중심에서 반경 25m 범위 안에 들어 바람도 강하게 불 것으로 기상청은 내다보고 있다."""

# compute_tf(a_id, content)

# a_nouns_tf

In [67]:
# conn = sqlite3.connect('db/daum.db')
# cur = conn.cursor()

# # Date iteration
# # 기사 수가 너무 많아서 날짜별로 분할하여 처리
# for date in date_list:
#     date = date.replace('-', '.')
#     print(date)
#     print('--------------------------------------------------\n')
    
#     a_nouns_tf = {}
#     a_noun_max_cnt = {}
    
#     cur.execute("SELECT a_ids, contents FROM daum WHERE dates = '{0}'".format(date))

#     data = cur.fetchall()
    
#     # Content iteration
#     for data_idx, d in enumerate(data):
#         if (data_idx % 2000) == 0:
#             print('{0:6,} / {1:6,}'.format(data_idx, len(data)))
        
#         compute_tf(a_id=d[0], content=d[1])
        
#     # Insert data
#     for a_id, nouns_tf in a_nouns_tf.items():
#         for noun, tf in nouns_tf.items():
#             data = (a_id, noun, tf)
#             try:
#                 cur.execute("INSERT INTO Term(a_id, term, tf_article) \
#                             VALUES(?,?,?)", data)
#             except:
#                 pass
#         else:
#             conn.commit()

#     print('\n--------------------------------------------------\n\n')

## Extract Unique Nouns, Invert Index and
## Compute Inverse Document Frequency (IDF)
---

* Inverted index and IDF (전체)

In [32]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Every (article, term) pair stored in Term, regardless of date or section.
cur.execute("SELECT a_id, term FROM Term")
data = cur.fetchall()

# Inverted index: noun -> list of ids of the articles containing it.
inverted_idx = {}
for data_idx, (a_id, noun) in enumerate(data):
    if (data_idx % 100000) == 0:
        print('{0:10,} / {1:10,}'.format(data_idx, len(data)))

    inverted_idx.setdefault(noun, []).append(a_id)

# Total number of articles (N in the IDF formula).
cur.execute("SELECT COUNT(*) FROM Article")
a_size = cur.fetchone()[0]

# IDF = log10(N / document frequency).
unique_nouns_idf = {}
for noun, a_ids in inverted_idx.items():
    unique_nouns_idf[noun] = log10(a_size / len(a_ids))

with open('index/inverted_index/inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_idx, f)

with open('index/unique_nouns_idf/unique_nouns_idf.pkl', 'wb') as f:
    pickle.dump(unique_nouns_idf, f)

         0 /      2,781


In [33]:
# [Debug] Show the first indexed noun, its document frequency and up to ten of its articles.
unique_noun = list(inverted_idx)[0]
posting_list = inverted_idx[unique_noun]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(posting_list))
posting_list[:10]

The number of unique nouns:  2063 

화재
DF:  3


['da_20180823235953274', 'da_20180818235833634', 'da_20180810234631844']

In [34]:
# [Debug] Print the IDF of the first ten nouns.
unique_nouns = tuple(unique_nouns_idf)[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    idf_value = unique_nouns_idf[unique_noun]
    print('{0} | {1:5.3}'.format(unique_noun, idf_value))

The number of IDF unique nouns:  2063 

화재 |  4.66
화재원인 |  5.14
원인 |  4.36
어디 |  4.84
인천 |  4.44
윤태현 |  5.14
태현 |  5.14
인천시 |  5.14
남동 |  5.14
남동구 |  5.14


* Inverted index and IDF (기간)

In [35]:
# [Debug] List every 7-day window that can be sliced out of date_list.
for window_start in range(len(date_list) - 6):
    window_end = window_start + 7
    print(date_list[window_start:window_end])

['2018-08-23', '2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17']
['2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16']
['2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15']
['2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14']
['2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13']
['2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12']
['2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11']
['2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11', '2018-08-10']


In [40]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Period iteration: one 7-day window per start index. Each window is labelled
# with date_list[date_idx], the first date of the slice.
for date_idx in range(len(date_list)-6):
    inverted_idx = {}
    unique_nouns_idf = {}

    data = []
    a_size = 0

    # Iterate over the dates that belong to this window. Dates are passed as
    # bound parameters instead of being formatted into the SQL text.
    for date in date_list[date_idx:date_idx+7]:
        cur.execute("SELECT T.a_id, T.term \
                    FROM Article A, Term T \
                    WHERE A.a_id = T.a_id AND A.date = ?", (date,))

        data.extend(cur.fetchall())

        cur.execute("SELECT COUNT(*) FROM Article WHERE date = ?", (date,))

        a_size += cur.fetchone()[0]

    print('{0} | {1:10,} | {2:10,}'.format(date_list[date_idx], len(data), a_size))

    # Noun iteration: build noun -> list of article ids.
    for data_idx, (a_id, noun) in enumerate(data):
        if (data_idx % 100000) == 0:
            print('{0:10,} / {1:10,}'.format(data_idx, len(data)))

        inverted_idx.setdefault(noun, []).append(a_id)

    # IDF = log10(articles in the window / document frequency).
    for noun, a_ids in inverted_idx.items():
        df = len(a_ids)
        unique_nouns_idf[noun] = log10(a_size / df)

    with open('index/inverted_index/inverted_index_' + date_list[date_idx] + '.pkl', 'wb') as f:
        pickle.dump(inverted_idx, f)

    with open('index/unique_nouns_idf/unique_nouns_idf_' + date_list[date_idx] + '.pkl', 'wb') as f:
        pickle.dump(unique_nouns_idf, f)

2018-08-23 |      1,152 |     71,079
         0 /      1,152
2018-08-22 |      1,292 |     71,510
         0 /      1,292
2018-08-21 |      1,423 |     65,391
         0 /      1,423
2018-08-20 |      1,587 |     65,239
         0 /      1,587
2018-08-19 |      1,488 |     65,999
         0 /      1,488
2018-08-18 |      1,456 |     65,690
         0 /      1,456
2018-08-17 |      1,617 |     65,640
         0 /      1,617
2018-08-16 |      1,629 |     66,006
         0 /      1,629


In [41]:
# [Debug] Reload the inverted index of one window and inspect its first noun.
date = '2018-08-23'
index_path = 'index/inverted_index/inverted_index_{0}.pkl'.format(date)

with open(index_path, 'rb') as f:
    inverted_idx = pickle.load(f)

unique_noun = list(inverted_idx)[0]
posting_list = inverted_idx[unique_noun]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(posting_list))
posting_list[:10]

The number of unique nouns:  944 

화재
DF:  2


['da_20180823235953274', 'da_20180818235833634']

In [42]:
# [Debug] Reload the IDF table of one window and print its first ten entries.
date = '2018-08-23'
idf_path = 'index/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date)

with open(idf_path, 'rb') as f:
    unique_nouns_idf = pickle.load(f)

unique_nouns = tuple(unique_nouns_idf)[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    idf_value = unique_nouns_idf[unique_noun]
    print('{0} | {1:5.3}'.format(unique_noun, idf_value))

The number of IDF unique nouns:  944 

화재 |  4.55
화재원인 |  4.85
원인 |  4.37
어디 |  4.85
인천 |  4.25
윤태현 |  4.85
태현 |  4.85
인천시 |  4.85
남동 |  4.85
남동구 |  4.85


* Inverted index and IDF (section + 기간)

In [43]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Section iteration
for section in section_list:
    # The SQL filter uses the label from section_dict, not the English identifier.
    section_label = section_dict[section]

    # Period iteration: one 7-day window per start index, labelled with
    # date_list[date_idx], the first date of the slice.
    for date_idx in range(len(date_list)-6):
        inverted_idx = {}
        unique_nouns_idf = {}

        data = []
        a_size = 0

        # Iterate over the dates that belong to this window. Section and date
        # are passed as bound parameters instead of being formatted into the SQL.
        for date in date_list[date_idx:date_idx+7]:
            cur.execute("SELECT T.a_id, T.term \
                        FROM Article A, Term T \
                        WHERE A.a_id = T.a_id AND A.section = ? AND A.date = ?",
                        (section_label, date))

            data.extend(cur.fetchall())

            cur.execute("SELECT COUNT(*) FROM Article WHERE section = ? AND date = ?",
                        (section_label, date))

            a_size += cur.fetchone()[0]

        print('{0} | {1} | {2:10,} | {3:10,}'.format(section, date_list[date_idx], len(data), a_size))

        # Noun iteration: build noun -> list of article ids.
        for a_id, noun in data:
            inverted_idx.setdefault(noun, []).append(a_id)

        # IDF = log10(articles in this section and window / document frequency).
        for noun, a_ids in inverted_idx.items():
            df = len(a_ids)
            unique_nouns_idf[noun] = log10(a_size / df)

        with open('index/inverted_index/inverted_index_' + section + '_' + date_list[date_idx] + '.pkl', 'wb') as f:
            pickle.dump(inverted_idx, f)

        with open('index/unique_nouns_idf/unique_nouns_idf_' + section + '_' + date_list[date_idx] + '.pkl', 'wb') as f:
            pickle.dump(unique_nouns_idf, f)

society | 2018-08-23 | 1,152
society | 2018-08-22 | 1,292
society | 2018-08-21 | 1,423
society | 2018-08-20 | 1,587
society | 2018-08-19 | 1,488
society | 2018-08-18 | 1,456
society | 2018-08-17 | 1,617
society | 2018-08-16 | 1,629
politics | 2018-08-23 | 0
politics | 2018-08-22 | 0
politics | 2018-08-21 | 0
politics | 2018-08-20 | 0
politics | 2018-08-19 | 0
politics | 2018-08-18 | 0
politics | 2018-08-17 | 0
politics | 2018-08-16 | 0
economic | 2018-08-23 | 0
economic | 2018-08-22 | 0
economic | 2018-08-21 | 0
economic | 2018-08-20 | 0
economic | 2018-08-19 | 0
economic | 2018-08-18 | 0
economic | 2018-08-17 | 0
economic | 2018-08-16 | 0
culture | 2018-08-23 | 0
culture | 2018-08-22 | 0
culture | 2018-08-21 | 0
culture | 2018-08-20 | 0
culture | 2018-08-19 | 0
culture | 2018-08-18 | 0
culture | 2018-08-17 | 0
culture | 2018-08-16 | 0
digital | 2018-08-23 | 0
digital | 2018-08-22 | 0
digital | 2018-08-21 | 0
digital | 2018-08-20 | 0
digital | 2018-08-19 | 0
digital | 2018-08-18 | 0
di

In [44]:
# [Debug] Reload the inverted index of one section/window and inspect its first noun.
section = 'society'
date = '2018-08-23'
index_path = 'index/inverted_index/inverted_index_{0}_{1}.pkl'.format(section, date)

with open(index_path, 'rb') as f:
    inverted_idx = pickle.load(f)

unique_noun = list(inverted_idx)[0]
posting_list = inverted_idx[unique_noun]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(posting_list))
posting_list[:10]

The number of unique nouns:  944 

화재
DF:  2


['da_20180823235953274', 'da_20180818235833634']

In [46]:
# [Debug] Reload the IDF table of one section/window and print its first ten entries.
section = 'society'
date = '2018-08-23'
idf_path = 'index/unique_nouns_idf/unique_nouns_idf_{0}_{1}.pkl'.format(section, date)

with open(idf_path, 'rb') as f:
    unique_nouns_idf = pickle.load(f)

unique_nouns = tuple(unique_nouns_idf)[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    idf_value = unique_nouns_idf[unique_noun]
    print('{0} | {1:5.3}'.format(unique_noun, idf_value))

The number of IDF unique nouns:  944 

화재 |  4.15
화재원인 |  4.45
원인 |  3.97
어디 |  4.45
인천 |  3.85
윤태현 |  4.45
태현 |  4.45
인천시 |  4.45
남동 |  4.45
남동구 |  4.45


## Insert IDFs into Term Table
---

* DB에는 2018-08-23만 기록 (2018-08-17 - 2018-08-23)
* 기간(1주일)마다 pickle 저장

In [47]:
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

date = '2018-08-23'

# IDF table of the window labelled with `date`.
with open('index/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date), 'rb') as f:
    unique_nouns_idf = pickle.load(f)

for unique_noun_idx, (unique_noun, idf) in enumerate(unique_nouns_idf.items()):
    if (unique_noun_idx % 10000) == 0:
        print('{0:10,} / {1:10,}'.format(unique_noun_idx, len(unique_nouns_idf)))

    # tfidf = tf_article * idf for every Term row of this noun whose article
    # is dated `date`. The values are bound as parameters: nouns come from
    # article text, and a quote character inside one would break (or alter)
    # a statement assembled with str.format. Doing the multiplication in SQL
    # also removes the SELECT-then-UPDATE round trip per row.
    cur.execute("UPDATE Term \
                SET tfidf = tf_article * ? \
                WHERE term = ? \
                  AND a_id IN (SELECT a_id FROM Article WHERE date = ?)",
                (idf, unique_noun, date))

# Single commit once every noun has been processed.
conn.commit()

         0 /        944


# TODO:

In [29]:
# # [Debug]
# conn = sqlite3.connect('db/daum.db')
# cur = conn.cursor()

# unique_nouns_idf = {'태풍': 1., '솔릭': 100., '제주': 10000.}

# for unique_noun_idx, item in enumerate(unique_nouns_idf.items()):
#     if (unique_noun_idx % 10000) == 0:
#         print('{0:10,} / {1:10,}'.format(unique_noun_idx, len(unique_nouns_idf)))
    
#     unique_noun = item[0]
#     idf = item[1]
#     print(unique_noun, idf)
    
#     cur.execute("SELECT a_t_id, tf_article FROM Term WHERE term = '{0}'".format(unique_noun))

#     data = cur.fetchall()
    
#     for d in data:
#         a_t_id = d[0]
#         tf = d[1]
        
#         cur.execute("UPDATE Term \
#                     SET tfidf = {0} \
#                     WHERE a_t_id = {1}"
#                     .format(tf * idf, a_t_id))
#     else:
#         conn.commit()

## Create U_Term Table and
## Insert Unique Nouns and IDFs into the Table
---

* DB에는 2018-08-23만 기록 (2018-08-17 - 2018-08-23)
* 기간(1주일)마다 pickle 저장

In [None]:
# TODO: create the U_Term table and insert one row per unique noun.
# Planned columns: u_term, idf

# TODO:

## Dump Pickles
---

## 위 셀 코딩 중. 아래는 예전 코드이므로 참고만!
---

In [12]:
# Every date except the last six entries of date_list.
window_labels = date_list[:-6]
for date in window_labels:
    print(date)

2018-08-23
2018-08-22
2018-08-21
2018-08-20
2018-08-19
2018-08-18
2018-08-17
2018-08-16


In [4]:
def inverted_index(db_name, table_name, sections=['society', 'politics', 'economic', 'culture', 'digital', 'global'], write=False):
    """Build per-section noun counts and an inverted index from an article table.

    For every requested section the articles are read from
    ``db/<db_name>.db``, nouns are extracted with Kkma, and four structures
    are produced:
      * ``unique_nouns``   - list of distinct nouns in the section
      * ``a_nouns``        - {a_id: {noun: occurrence count}}
      * ``a_noun_max_cnt`` - {a_id: count of the most frequent noun}
      * ``inverted_idx``   - {noun: [a_id, ...]}

    Nouns of length 1 and nouns accepted by ``is_number`` are skipped.

    Args:
        db_name: database file name without directory or extension.
        table_name: table to read ``a_id``, ``content`` and ``section`` from.
            It is formatted into the SQL text (identifiers cannot be bound),
            so it must come from trusted code.
        sections: section identifiers (keys of ``section_dict``) to process.
            The default list is only read, never modified.
        write: when True, dump the structures as CSV and pickle files under
            ``index/original/``.

    Returns:
        None. Progress is printed; results are only persisted when `write` is True.
    """
    k = Kkma()

    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        cur = conn.cursor()

        for section in sections:
            print(section_dict[section])
            print('--------------------------------------------------\n')

            cur.execute("SELECT a_id, content FROM {0} WHERE section = ?".format(table_name),
                        (section_dict[section],))

            contents = cur.fetchall()

            a_nouns = {}
            a_noun_max_cnt = {}

            for content_idx, (a_id, raw_content) in enumerate(contents):
                if (content_idx % 100) == 0:
                    print('{0:6,} / {1:6,}'.format(content_idx, len(contents)))

                nouns_cnt = {}

                # Collapse every run of 2+ whitespace characters (newlines
                # included) into one space; a second pass on r'[\n]{2,}' could
                # never match afterwards and was dropped.
                content = re.sub(r'[\s]{2,}', ' ', raw_content)

                # Sentence tokenization -> word tokenization -> noun extraction.
                for sentence in nltk.sent_tokenize(content):
                    for word in nltk.word_tokenize(sentence.strip()):
                        for noun in k.nouns(word):
                            # Basic stop words: single-character and numeric nouns.
                            # TODO: section-specific stop words.
                            if len(noun) == 1 or is_number(noun):
                                continue
                            nouns_cnt[noun] = nouns_cnt.get(noun, 0) + 1

                a_nouns[a_id] = nouns_cnt
                a_noun_max_cnt[a_id] = max(nouns_cnt.values(), default=0)

            # Build the inverted index in one pass over the per-article counts
            # instead of scanning every article once per unique noun.
            inverted_idx = {}
            for a_id, nouns in a_nouns.items():
                for noun in nouns:
                    inverted_idx.setdefault(noun, []).append(a_id)

            # Distinct nouns of the section, in first-seen order.
            unique_nouns = list(inverted_idx)

            if write:
                print('Start to write files.')

                with open('index/original/' + section + '_unique_nouns.csv', 'w', newline='') as f:
                    csv_writer = csv.writer(f)
                    csv_writer.writerow(unique_nouns)

                with open('index/original/' + section + '_article_nouns.csv', 'w', newline='') as f:
                    csv_writer = csv.writer(f)
                    for a_id, nouns in a_nouns.items():
                        csv_writer.writerow([a_id, nouns])

                with open('index/original/' + section + '_inverted_index.csv', 'w', newline='') as f:
                    csv_writer = csv.writer(f)
                    for noun, a_ids in inverted_idx.items():
                        csv_writer.writerow([noun, a_ids])

                with open('index/original/' + section + '_unique_nouns.pkl', 'wb') as f:
                    pickle.dump(unique_nouns, f)

                with open('index/original/' + section + '_article_nouns.pkl', 'wb') as f:
                    pickle.dump(a_nouns, f)

                with open('index/original/' + section + '_noun_max_count.pkl', 'wb') as f:
                    pickle.dump(a_noun_max_cnt, f)

                with open('index/original/' + section + '_inverted_index.pkl', 'wb') as f:
                    pickle.dump(inverted_idx, f)

            print('\n--------------------------------------------------\n\n')
    finally:
        # Release the database handle even when extraction or writing fails.
        conn.close()

In [77]:
# Run the legacy indexer on db/daum.db (table `daum`) for all default sections and write the CSV/pickle outputs.
inverted_index('daum', 'daum', write=True)

사회
--------------------------------------------------

     0 /    227
   100 /    227
   200 /    227
Start to write files.

--------------------------------------------------


정치
--------------------------------------------------

     0 /    257
   100 /    257
   200 /    257
Start to write files.

--------------------------------------------------


경제
--------------------------------------------------

     0 /    229
   100 /    229
   200 /    229
Start to write files.

--------------------------------------------------


문화
--------------------------------------------------

     0 /    199
   100 /    199
Start to write files.

--------------------------------------------------


IT
--------------------------------------------------

     0 /    237
   100 /    237
   200 /    237
Start to write files.

--------------------------------------------------


세계
--------------------------------------------------

     0 /     65
Start to write files.

---------------------------

In [59]:
# inverted_index('daum', 'daum', sections=['politics'])

## [Debug] Extract Nouns and Index Articles
---

In [19]:
# Politics only (section_list[1:2]): reload the four structures written by inverted_index().
for section in section_list[1:2]:
    prefix = 'index/original/' + section

    with open(prefix + '_unique_nouns.pkl', 'rb') as f:
        unique_nouns = pickle.load(f)

    with open(prefix + '_article_nouns.pkl', 'rb') as f:
        a_nouns = pickle.load(f)

    with open(prefix + '_noun_max_count.pkl', 'rb') as f:
        a_noun_max_cnt = pickle.load(f)

    with open(prefix + '_inverted_index.pkl', 'rb') as f:
        inverted_idx = pickle.load(f)

In [21]:
conn = sqlite3.connect('db/daum.db')
cur = conn.cursor()

# Fetch a single politics article to compare against the extracted nouns.
first_row = cur.execute("SELECT a_id, content FROM daum WHERE section = '정치'").fetchone()
a_id, content = first_row

print(a_id, '\n')
print(content)

da_20180822211258392 

6·13 지방선거에 바른미래당 서울시장 후보로 나섰다가 참패한 뒤 독일 등 해외에 머물겠다고 했던 안철수 전 의원이 서울 마포구에서 포착됐다.
아주경제는 지난 21일 마포구의 한 사무실에서 기자와 마주치자 도망치는 안 전 의원의 모습(사진)을 22일 공개했다. 이 매체가 공개한 동영상에는 안 전 의원이 기자를 피해 황급히 계단을 내려가는 장면이 담겼다. 기자가 “죄를 지으신 게 아니지 않느냐”며 거듭 취재를 요청했지만 안 전 의원은 일절 답하지 않고 계단 아래쪽으로 뛰어 내려갔다.
지난달 12일 안 전 의원은 기자회견을 열고 “정치 일선에서 물러나 통찰과 채움의 시간을 갖고자 한다”며 “대한민국이 당면한 시대적 난제를 앞서 해결하고 있는 독일에서 해결의 실마리를 얻겠다”고 말했었다. 그런 그가 당대표 선거가 한창인 지금 서울에 머물고 있는 게 의아하다는 반응도 나온다.
안 전 의원이 서울에서 포착됐다는 보도에 대해 이준석 바른미래당 당대표 후보는 페이스북에 “이런 상황에서 음험한 계략을 꾸미는 분이 아니고 도망가실 분도 아니다. 그냥 바쁘셔서 그러셨을 거다”라고 썼다.


In [22]:
# Sort in place so that slices of the list are in lexicographic order.
unique_nouns.sort()

# Number of distinct nouns loaded for the section.
len(unique_nouns)

8198

In [37]:
# First ten entries of the noun list.
unique_nouns[0:10]

['0.2㎜',
 '0.4㎜',
 '0.6㎜',
 '0.86배',
 '0.95배',
 '08월',
 '1.43배',
 '1.4배',
 '1.5배',
 '10.5㎜']

In [24]:
# Number of articles that have a noun-count entry.
len(a_nouns)

257

In [47]:
# Print, for the first article only, every noun that occurs at least twice.
for a_id, nouns in a_nouns.items():
    print(a_id)
    print('Max count: ', a_noun_max_cnt[a_id])
    print('------------------------------\n')

    # The loop variable is deliberately not called `k`: that name is the
    # module-level Kkma instance used by compute_tf(), and rebinding it here
    # would break noun extraction in any cell run afterwards.
    for noun in sorted(nouns):
        # Only nouns that appear two or more times.
        if nouns[noun] >= 2:
            print('{0} : {1:,}'.format(noun, nouns[noun]))

    print('\n------------------------------\n\n')

    break

da_20180822211258392
Max count:  6
------------------------------

계단 : 2
공개 : 2
기자 : 3
당대표 : 2
대표 : 2
독일 : 2
마포 : 2
마포구 : 2
미래 : 2
미래당 : 2
서울 : 4
의원 : 6
포착 : 2
해결 : 2
후보 : 2

------------------------------




In [48]:
# Print the first ten nouns that appear in at least ten articles.
cnt = 0

for noun in inverted_idx:
    if cnt == 10:
        break

    article_total = len(inverted_idx[noun])
    if article_total < 10:
        continue

    print('{0} : {1:,}'.format(noun, article_total))
    cnt += 1

10시 : 11
제안 : 13
기관 : 35
조카 : 10
박자 : 10
압박 : 19
여동생 : 13
국방 : 26
방안 : 35
마련 : 28


## TF, DF and TF-IDF
---

* Parameters

In [5]:
# K of the double-normalization-K formula used by the TF and DF cells in this section.
k_ratio = 0.5

* Load data

In [6]:
# One dict per structure, each keyed by section identifier.
unique_nouns, a_nouns, a_noun_max_cnt, inverted_idx = {}, {}, {}, {}

# File-name suffix -> dict that receives the unpickled object for each section.
pickle_targets = (
    ('_unique_nouns.pkl', unique_nouns),
    ('_article_nouns.pkl', a_nouns),
    ('_noun_max_count.pkl', a_noun_max_cnt),
    ('_inverted_index.pkl', inverted_idx),
)

for section in section_list:
    for suffix, target in pickle_targets:
        with open('index/stopwords_removal/' + section + suffix, 'rb') as f:
            target[section] = pickle.load(f)

In [7]:
# Number of distinct nouns loaded for the politics section.
section = 'politics'

len(unique_nouns[section])

7410

## 기사별 TF
---

In [8]:
# a_tf[section][a_id][noun] -> term frequency with double normalization K.
a_tf = {}

for section in section_list:
    print(section)
    print('------------------------------\n')

    a_tf[section] = {}

    for a_id, nouns in a_nouns[section].items():
        # Double normalization K
        tf_temp = {
            noun: k_ratio + (1 - k_ratio) * (count / a_noun_max_cnt[section][a_id])
            for noun, count in nouns.items()
        }
        a_tf[section][a_id] = tf_temp

        # Show the calculation for the first noun of the article whose TF exceeds 0.7, if any.
        high_tf_noun = next((noun for noun, tf in tf_temp.items() if tf > 0.7), None)
        if high_tf_noun is not None:
            print(a_id)
            print("{0} | {1} + {2} * ({3} / {4}) = {5}\n".format(high_tf_noun, k_ratio, (1 - k_ratio), nouns[high_tf_noun], a_noun_max_cnt[section][a_id], tf_temp[high_tf_noun]))

    print(len(a_tf[section]))
    print('\n------------------------------\n\n')



In [96]:
# section = 'politics'
# a_id = tuple(a_tf[section].keys())[0]

# a_tf[section][a_id]

## 섹션별 DF

In [9]:
# s_df[section][noun]  -> document frequency with double normalization K.
# a_max_cnt[section]   -> largest number of articles sharing a single noun.
s_df = {}
a_max_cnt = {}

for section in section_list:
    print(section)
    print('------------------------------\n')

    # Number of articles in the section
    a_size = len(a_tf[section])
    s_df[section] = {}

    section_index = inverted_idx[section]
    a_max_cnt[section] = max((len(a_ids) for a_ids in section_index.values()), default=0)

    for noun, a_ids in section_index.items():
        article_total = len(a_ids)

        # Double normalization K
        s_df[section][noun] = k_ratio + (1 - k_ratio) * (article_total / a_max_cnt[section])

        if s_df[section][noun] > 0.7:
            print("{0} | {1} + {2} * ({3} / {4}) = {5}".format(noun, k_ratio, (1 - k_ratio), article_total, a_max_cnt[section], s_df[section][noun]))

    print('\n', len(s_df[section]))
    print('\n------------------------------\n\n')

society
------------------------------

바람 | 0.5 + 0.5 * (32 / 66) = 0.7424242424242424
혐의 | 0.5 + 0.5 * (28 / 66) = 0.7121212121212122
당시 | 0.5 + 0.5 * (28 / 66) = 0.7121212121212122
필요 | 0.5 + 0.5 * (27 / 66) = 0.7045454545454546
태풍 | 0.5 + 0.5 * (66 / 66) = 1.0
한반도 | 0.5 + 0.5 * (27 / 66) = 0.7045454545454546
솔릭 | 0.5 + 0.5 * (60 / 66) = 0.9545454545454546
경찰 | 0.5 + 0.5 * (43 / 66) = 0.8257575757575757
예상 | 0.5 + 0.5 * (34 / 66) = 0.7575757575757576
안전 | 0.5 + 0.5 * (27 / 66) = 0.7045454545454546
북상 | 0.5 + 0.5 * (35 / 66) = 0.7651515151515151
강풍 | 0.5 + 0.5 * (33 / 66) = 0.75
정부 | 0.5 + 0.5 * (31 / 66) = 0.7348484848484849
앵커 | 0.5 + 0.5 * (41 / 66) = 0.8106060606060606
제주 | 0.5 + 0.5 * (31 / 66) = 0.7348484848484849
우려 | 0.5 + 0.5 * (36 / 66) = 0.7727272727272727
서울 | 0.5 + 0.5 * (56 / 66) = 0.9242424242424243
경기 | 0.5 + 0.5 * (41 / 66) = 0.8106060606060606
조사 | 0.5 + 0.5 * (43 / 66) = 0.8257575757575757
한국 | 0.5 + 0.5 * (35 / 66) = 0.7651515151515151
비상 | 0.5 + 0.5 * (27 / 66) =

In [99]:
# section = 'politics'
# # noun = tuple(s_df[section].keys())[0]

# s_df[section]