# TF-IDF
---

In [1]:
import csv
from konlpy.tag import Kkma
from math import log2, log10
import nltk
import pickle
import re
import sqlite3

In [2]:
# Shared KoNLPy Kkma analyzer instance; compute_tf() calls k.nouns() on each token.
k = Kkma()

In [3]:
# English section keys; used in loops and in the per-section pickle file names.
section_list = ['society', 'politics', 'economic', 'culture', 'digital', 'global']
# English section key -> label used when filtering Article.section in SQL queries.
section_dict = {'society':'사회', 'politics':'정치', 'economic':'경제',
               'culture':'문화', 'digital':'IT', 'global':'세계'}

# 14 consecutive dates in descending order (2018-08-23 down to 2018-08-10).
date_list = ['2018-08-23', '2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17',
             '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11', '2018-08-10']

## 주요 변수
---

In [4]:
# Variables that hold data temporarily; the data is persisted to the DB or to pickle files.
# Each one can be built for several scopes; see the trailing comments.
a_nouns_tf = {}          # per article
a_noun_max_cnt = {}      # per article
inverted_idx = {}        # whole corpus, period, section + period
unique_nouns_idf = {}    # whole corpus, period, section + period

* 이 형태 아님. 맨 앞에 section으로 된 key 빼야 함.


**a_nouns_tf**  
{'society': {'da_20180822201040252': {'인천': 0.1, '남동': 0.5, '남동공단': 0.8, ...}  
...  
}}


**a_noun_max_cnt**  
{'society': {'da_20180822201040252': 4, 'da_20180822201030249': 20, 'da_20180822200910228': 3, ...}  
...  
}


**inverted_idx**  
{'society': {'김영호': ['da_20180822194016575'],  
'심상치가': ['da_20180822194312638'],  
'최우수': ['da_20180822192938304', 'da_20180822191858060'], ...}  
...  
}


**unique_nouns_idf**

## Create DB
---

In [5]:
# Create the Term and U_Term tables in db/news_db.db if they do not exist yet.
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# IF NOT EXISTS keeps the cell re-runnable without a blanket try/except, so
# genuine failures (bad path, locked database, malformed SQL) are no longer
# silently swallowed.
cur.execute("""
    CREATE TABLE IF NOT EXISTS Term(
        a_t_id      INTEGER PRIMARY KEY,
        a_id        TEXT,
        term        TEXT,
        tf_article  REAL,
        tfidf       REAL,
        FOREIGN KEY (a_id) REFERENCES Article(a_id))
""")

cur.execute("""
    CREATE TABLE IF NOT EXISTS U_Term(
        u_term      TEXT PRIMARY KEY,
        idf         REAL)
""")

conn.commit()

In [6]:
# cur.execute("DROP TABLE Term")

# conn.commit()

## 불용어 제거
---

* by 한홀

In [7]:
# Stop-word vocabularies, built once at definition time instead of on every call.
_NEWS_STOPWORDS = frozenset({
    "기자", "배포", "금지", "뉴스", "저작권자",
    "기사", "전재", "무단", "무단전재", "구독", "기사보기",
})

_PRESS_STOPWORDS = frozenset({
    '연합뉴스', '뉴시스', '뉴시스통신사', '통신사',
    '이데일리', '네이버', '다음', '시스', '뉴스1', '뉴스1코리',
})

_GENERIC_STOPWORDS = frozenset({
    '사진', '페이스북', '관련', '웹툰보기', '가기', '만큼',
    '최근', '재인', '올해', '시간', '판단', '추진', '우리', '반영',
    '상황', '호텔', '운영', '주요', '적극', '대상', '때문',
    '확인', '가능', '이야기', '규모', '개월', '종합', '위원회',
    '가운데', '분석', '다양', '문제', '기간', '마련', '지난해', '신청', '한편', '기준',
    '내용', '채널설정', '경우', '방안', '활용', '여러분', '기존', '최대', '스냅', '오전', '대비',
    '위원', '지난달', '이번달', '다음달', '위원장', '센터', '포함', '등에', '사진영상부',
    '구성', '수준', '기대', '공동', '안내', '활동', '첫날', '추가', '분야', '관리',
    '동안', '이용', '모습', '오늘', '논의', '입장', '업계', '내년', '블록', '체인', '실시간',
    '고객', '채널', '보기', '오후', '이번', '이날', '진행', '제공', '예정', '연합', '대표',
    '제보', '이상', '지원', '행사', '관계자', '설정', '계획', '단체', '타임', '이후', '발표',
})

_ALL_STOPWORDS = _NEWS_STOPWORDS | _PRESS_STOPWORDS | _GENERIC_STOPWORDS


def NewsStopWord(word):
    """Return True when *word* should be discarded as a stop word.

    A word is a stop word when any of the following holds:

    * ``int(word)`` succeeds (the word is a number),
    * it is a string containing '회차',
    * it is exactly one character long,
    * it contains at least one digit,
    * it appears in one of the news / press / generic stop-word sets.

    Args:
        word: Candidate noun, normally a ``str``.

    Returns:
        bool: True if the word must be dropped, False if it should be kept.
    """
    try:
        int(word)
    except (TypeError, ValueError):
        # Not a number: fall through to the textual checks below.
        pass
    else:
        return True

    if isinstance(word, str) and '회차' in word:
        return True
    if len(word) == 1:
        # Single-character nouns are dropped.
        return True
    if re.search(r'\d+', word) is not None:
        # Any word containing a digit is dropped.
        return True

    return word in _ALL_STOPWORDS

## Extract Nouns and Compute Term Frequency (TF)
---

In [8]:
def is_number(obj):
    """Return True if *obj* can be converted with ``float()``, else False.

    ``float()`` raises ``ValueError`` for unparsable strings and ``TypeError``
    for unsupported types such as ``None`` or a list; both mean "not a
    number" here, so neither is allowed to propagate.
    """
    try:
        float(obj)
    except (TypeError, ValueError):
        return False
    return True

In [9]:
def compute_tf(content, k_ratio=0.5):
    """Compute the augmented term frequency of every noun in one article.

    The text is split into sentences and tokens with NLTK, nouns are extracted
    from each token with the module-level Kkma instance ``k``, and nouns
    rejected by ``NewsStopWord`` are skipped. For each remaining noun:

        tf = k_ratio + (1 - k_ratio) * count / max_count

    where ``max_count`` is the highest count of any noun in the article.

    Args:
        content: Article body text.
        k_ratio: Smoothing term of the formula above (default 0.5).

    Returns:
        dict: noun -> tf, in order of first appearance. Empty when no noun
        survives the stop-word filter.
    """
    # Collapse every run of two or more whitespace characters (newlines
    # included) into a single space. A separate pass for repeated newlines is
    # unnecessary: no consecutive whitespace is left after this substitution.
    content = re.sub(r'[\s]{2,}', ' ', content)

    nouns_cnt = {}

    # Sentence-level tokenization
    for sentence in nltk.sent_tokenize(content):
        # Word-level tokenization
        for word in nltk.word_tokenize(sentence.strip()):
            # Noun extraction
            for noun in k.nouns(word):
                # Stop-word removal
                if NewsStopWord(noun):
                    continue
                nouns_cnt[noun] = nouns_cnt.get(noun, 0) + 1

    if not nouns_cnt:
        return {}

    max_cnt = max(nouns_cnt.values())

    return {noun: k_ratio + (1 - k_ratio) * cnt / max_cnt
            for noun, cnt in nouns_cnt.items()}

In [10]:
# def compute_tf(a_id, content):
#     a_nouns_tf[a_id] = {}
#     nouns_cnt = {}
#     a_noun_max_cnt[a_id] = 0

#     # 연속된 공백 및 개행 제거
#     content = re.sub(r'[\s]{2,}', ' ', content)
#     content = re.sub(r'[\n]{2,}', '\n', content)

#     # 문장 단위 토큰화
#     for idx, sentence in enumerate(nltk.sent_tokenize(content)):
#         nouns_temp = []

#         # 단어 단위 토큰화
#         for word in nltk.word_tokenize(sentence.strip()):
#             # 명사 추출
#             nouns = k.nouns(word)

#             for noun in nouns:
#                 # 불용어 제거
#                 if NewsStopWord(noun):
#                     continue

#                 if noun in nouns_cnt.keys():
#                     nouns_cnt[noun] += 1
#                 else:
#                     nouns_cnt[noun] = 1

#                 if a_noun_max_cnt[a_id] < nouns_cnt[noun]:
#                     a_noun_max_cnt[a_id] = nouns_cnt[noun]

# #             # 기본 불용어 제거
# #             for stop_word in stop_words:
# #                 nouns.remove(stop_word)

# #             nouns_temp.extend(nouns)
# #             nouns_temp = list(set(nouns_temp))

# #         unique_nouns.extend(nouns_temp)
# #         unique_nouns = list(set(unique_nouns))

#     for noun in nouns_cnt.keys():
#         a_nouns_tf[a_id][noun] = k_ratio + (1 - k_ratio) * nouns_cnt[noun] / a_noun_max_cnt[a_id]

## [Debug] Extract Nouns and Compute Term Frequency (TF)
---

In [15]:
# content = """제19호 태풍 ‘솔릭’이 제주도를 지나 북상하면서 광주 지역 모든 학교 학생들이 조기 하교 했다. 항공편은 모두 결항됐으며 무등산 입산이 통제됐다. 

#            23일 오후 광주 서구 하늘에 제19호 태풍 솔릭이 몰고온 먹구름이 가득하다.
          

# 23일 광주지방기상청에 따르면 제19호 태풍 ‘솔릭’은 이날 낮 12시 현재 제주 서귀포 서쪽 90km 부근 해상에서 시속 4km로 북진 하고 있다. 기상청은 태풍이 이날 오후 6시쯤 목포 서남쪽 80㎞ 해상을 지난뒤 오는 24일 새벽 전북 군산 인근으로 상륙할 것으로 보고 있다. 
# 솔릭이 예상보다 훨씬 느린 속도로 접근하면서 광주 시민들은 ‘조마조마’한 심정으로 피해 예방을 위해 총력을 다했다. 태풍의 영향으로 광주 지역에는 바람이 점차 강해지면서 이날 제주와 김포를 오가는 광주공항의 모든 항공편이 결항됐다. 무등산도 입산이 통제됐다. 
# 태풍이 접근하면서 광주시교육청은 전체 학교를 대상으로 ‘조기 하교’를 결정했다. 광주지역 유치원과 초·중·고는 오후 3시 이전에 조기 하교했다. 고등학교의 아간 자율학습도 금지됐다. 교육청은 학원에도 휴원을 적극 검토하고 하원 시간을 조정하도록 요청했다. 
# 광주·전남은 24일까지 100∼250㎜의 비가 내리고 해안과 지리산에는 400㎜ 넘게 내리는 곳도 있겠다. 광주·전남은 태풍의 중심에서 반경 25m 범위 안에 들어 바람도 강하게 불 것으로 기상청은 내다보고 있다."""

# a_nouns_tf = compute_tf(content)

# a_nouns_tf

## Insert Nouns and TFs into Term Table
---

* DB에는 2018-08-23만 기록 (2018-08-17 - 2018-08-23)

In [None]:
# Compute the TF of every article for the first 7 entries of date_list and
# insert one (a_id, term, tf_article) row per noun into the Term table.
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Date iteration (process one date at a time)
for date in date_list[:7]:
    print(date)
    print('--------------------------------------------------\n')

    # Bind the date as a parameter instead of formatting it into the SQL text.
    cur.execute("SELECT a_id, content FROM Article WHERE date = ?", (date,))
    data = cur.fetchall()

    failed_inserts = 0

    # Content iteration
    for data_idx, d in enumerate(data):
        if (data_idx % 5000) == 0:
            # Progress line: article index / article count | current Term row count
            cur.execute("SELECT COUNT(*) FROM Term")
            t_size = cur.fetchone()[0]

            print('{0:6,} / {1:6,} | {2:13,}'.format(data_idx, len(data), t_size))

        # Compute TF
        a_nouns_tf = compute_tf(content=d[1])

        # Insert data. Still best effort, but only database errors are
        # tolerated and they are counted instead of vanishing silently.
        for noun, tf in a_nouns_tf.items():
            try:
                cur.execute("INSERT INTO Term(a_id, term, tf_article) VALUES(?,?,?)",
                            (d[0], noun, tf))
            except sqlite3.Error:
                failed_inserts += 1

        # Commit after each article.
        conn.commit()

    if failed_inserts:
        print('{0:,} Term rows could not be inserted'.format(failed_inserts))

    print('\n--------------------------------------------------\n\n')

In [5]:
# # 에러 발생 또는 중간에 끊을 시 DB의 특정 데이터 삭제
# # date = '2018-08-13'

# for date in date_list[7:]:
#     conn = sqlite3.connect('db/news_db.db')
#     cur = conn.cursor()

#     cur.execute("DELETE FROM Term \
#                 WHERE EXISTS (SELECT * \
#                               FROM Article \
#                               WHERE Article.date = '{0}' AND Article.a_id = Term.a_id )".format(date))

#     conn.commit()

실행 시간이 너무 오래 걸려서, 일부 날짜는 다른 컴퓨터에서 돌려 pickle로 저장해 두었다가 나중에 Term 테이블에 합침.

In [13]:
# conn = sqlite3.connect('db/news_db.db')
# cur = conn.cursor()

# # Date iteration (날짜별로 나눠서 처리)
# for date in date_list[-4:-3]:
#     print(date)
#     print('--------------------------------------------------\n')
    
#     records = []
    
#     cur.execute("SELECT a_id, content FROM Article WHERE date = '{0}'".format(date))
#     data = cur.fetchall()

#     # Content iteration
#     for data_idx, d in enumerate(data):
#         if (data_idx % 5000) == 0:
#             cur.execute("SELECT COUNT(*) FROM Term")
#             t_size = cur.fetchone()[0]
            
#             print('{0:6,} / {1:6,} | {2:13,}'.format(data_idx, len(data), t_size))
            
#         # TF 계산
#         a_nouns_tf = compute_tf(content=d[1])
        
#         # Insert data
#         for noun, tf in a_nouns_tf.items():
#             record = (d[0], noun, tf)
#             records.append(record)
# #             try:
# #                 cur.execute("INSERT INTO Term(a_id, term, tf_article) \
# #                             VALUES(?,?,?)", record)
# #             except:
# #                 pass
# #         else:
# #             conn.commit()

#     with open('temp/records_{0}.pkl'.format(date), 'wb') as f:
#         pickle.dump(records, f)

#     print('\n--------------------------------------------------\n\n')

In [21]:
# [Debug] Load the pickled (a_id, term, tf_article) records of one date,
# print how many there are and show the first ten.
date = '2018-08-13'

with open('temp/records_{0}.pkl'.format(date), 'rb') as f:
    records = pickle.load(f)

print('{0:,}\n'.format(len(records)))
    
for d in records[:10]:
    print(d)

1,256,343

('da_20180813235953261', '충남', 0.75)
('da_20180813235953261', '천안', 0.8125)
('da_20180813235953261', '현금', 1.0)
('da_20180813235953261', '수송', 0.875)
('da_20180813235953261', '현금수송업체', 0.5625)
('da_20180813235953261', '업체', 0.6875)
('da_20180813235953261', '직원', 0.625)
('da_20180813235953261', '보령', 0.625)
('da_20180813235953261', '검거', 0.625)
('da_20180813235953261', '천안서북경찰서', 0.6875)


In [12]:
# # Insert pickled data into Term table
# for date in date_list[-4:]:    # ['2018-08-13', '2018-08-12', '2018-08-11', '2018-08-10']
#     with open('temp/records_{0}.pkl'.format(date), 'rb') as f:
#         records = pickle.load(f)

#     print('{0} | {1}'.format(date, len(records))
#     print('--------------------------------------------------\n')
        
#     for record in records:
#         try:
#             cur.execute("INSERT INTO Term(a_id, term, tf_article) \
#                         VALUES(?,?,?)", record)
#         except:
#             pass
#     else:
#         conn.commit()
            
#     print('--------------------------------------------------\n\n')

**1,200만 건짜리 Term 테이블을 사용해야 함. (현재 DB의 Term 테이블은 약 600만 건으로 축소되어 있음.)**  
**1,200만 건짜리 Term 테이블을 pickle로 저장해서 사용할 것**

## Extract Unique Nouns, Invert Index and
## Compute Inverse Document Frequency (IDF)
---

* Inverted index and IDF (전체)

In [15]:
# Build the corpus-wide inverted index (noun -> list of article ids) and the
# IDF of every unique noun from the Term table, then pickle both.
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

inverted_idx = {}

cur.execute("SELECT a_id, term FROM Term")
data = cur.fetchall()

# Noun iteration
for data_idx, (a_id, noun) in enumerate(data):
    if (data_idx % 1000000) == 0:
        print('{0:10,} / {1:10,}'.format(data_idx, len(data)))

    inverted_idx.setdefault(noun, []).append(a_id)

# N: total number of articles
cur.execute("SELECT COUNT(*) FROM Article")
a_size = cur.fetchone()[0]

# IDF = log10(N / DF), where DF is the number of Term rows recorded for the noun.
unique_nouns_idf = {noun: log10(a_size / len(a_ids))
                    for noun, a_ids in inverted_idx.items()}

with open('db/inverted_index/inverted_index.pkl', 'wb') as f:
    pickle.dump(inverted_idx, f)

with open('db/unique_nouns_idf/unique_nouns_idf.pkl', 'wb') as f:
    pickle.dump(unique_nouns_idf, f)

In [16]:
# [Debug] Inverted index: take the first noun in the index, print the number
# of unique nouns, the noun and its document frequency, then display the
# first ten article ids recorded for it.
unique_noun = tuple(inverted_idx.keys())[0]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(inverted_idx[unique_noun]))
inverted_idx[unique_noun][:10]

The number of unique nouns:  382153 

화재
DF:  3866


['da_20180823235953274',
 'da_20180823224951448',
 'da_20180823224523381',
 'da_20180823213907297',
 'da_20180823213513245',
 'da_20180823212259097',
 'da_20180823212032071',
 'da_20180823211324987',
 'da_20180823205613743',
 'da_20180823202638291']

In [17]:
# [Debug] Unique nouns IDF: print the number of nouns that have an IDF and
# the IDF of the first ten, formatted to three significant digits.
unique_nouns = tuple(unique_nouns_idf.keys())[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    print('{0} | {1:5.3}'.format(unique_noun, unique_nouns_idf[unique_noun]))

The number of IDF unique nouns:  382153 

화재 |  1.55
화재원인 |  2.68
원인 |  1.27
어디 |   1.7
인천 |  1.43
윤태현 |   3.4
태현 |  3.01
인천시 |  2.13
남동 |  2.16
남동구 |  2.53


* Inverted index and IDF (기간)

In [18]:
# [Debug] Print every 7-entry sliding window of date_list; each window
# starts at date_list[date_idx].
for date_idx in range(len(date_list)-6):
    print(date_list[date_idx:date_idx+7])

['2018-08-23', '2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17']
['2018-08-22', '2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16']
['2018-08-21', '2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15']
['2018-08-20', '2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14']
['2018-08-19', '2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13']
['2018-08-18', '2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12']
['2018-08-17', '2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11']
['2018-08-16', '2018-08-15', '2018-08-14', '2018-08-13', '2018-08-12', '2018-08-11', '2018-08-10']


In [20]:
# For every 7-entry window of date_list, build the inverted index
# (noun -> list of article ids) and the IDF table restricted to that window,
# and pickle both under the window's first date.
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Date (window) iteration
for date_idx in range(len(date_list)-6):
    inverted_idx = {}

    data = []
    a_size = 0

    # Iterate over each date that belongs to the window
    for date in date_list[date_idx:date_idx+7]:
        # Dates are bound as parameters instead of being formatted into the SQL.
        cur.execute("SELECT T.a_id, T.term "
                    "FROM Article A, Term T "
                    "WHERE A.a_id = T.a_id AND A.date = ?", (date,))
        data.extend(cur.fetchall())

        cur.execute("SELECT COUNT(*) FROM Article WHERE date = ?", (date,))
        a_size += cur.fetchone()[0]

    print('{0} | {1:10,} | {2:10,}'.format(date_list[date_idx], len(data), a_size))

    # Noun iteration
    for data_idx, (a_id, noun) in enumerate(data):
        if (data_idx % 1000000) == 0:
            print('{0:10,} / {1:10,}'.format(data_idx, len(data)))

        inverted_idx.setdefault(noun, []).append(a_id)

    # IDF = log10(N / DF) with N = number of articles in the window and
    # DF = number of Term rows recorded for the noun in the window.
    unique_nouns_idf = {noun: log10(a_size / len(a_ids))
                        for noun, a_ids in inverted_idx.items()}

    with open('db/inverted_index/inverted_index_' + date_list[date_idx] + '.pkl', 'wb') as f:
        pickle.dump(inverted_idx, f)

    with open('db/unique_nouns_idf/unique_nouns_idf_' + date_list[date_idx] + '.pkl', 'wb') as f:
        pickle.dump(unique_nouns_idf, f)

2018-08-23 |  6,680,675 |     71,899
         0 /  6,680,675
 1,000,000 /  6,680,675
 2,000,000 /  6,680,675
 3,000,000 /  6,680,675
 4,000,000 /  6,680,675
 5,000,000 /  6,680,675
 6,000,000 /  6,680,675
2018-08-22 |  6,720,968 |     72,354
         0 /  6,720,968
 1,000,000 /  6,720,968
 2,000,000 /  6,720,968
 3,000,000 /  6,720,968
 4,000,000 /  6,720,968
 5,000,000 /  6,720,968
 6,000,000 /  6,720,968
2018-08-21 |  6,128,741 |     66,111
         0 /  6,128,741
 1,000,000 /  6,128,741
 2,000,000 /  6,128,741
 3,000,000 /  6,128,741
 4,000,000 /  6,128,741
 5,000,000 /  6,128,741
 6,000,000 /  6,128,741
2018-08-20 |  6,072,147 |     65,990
         0 /  6,072,147
 1,000,000 /  6,072,147
 2,000,000 /  6,072,147
 3,000,000 /  6,072,147
 4,000,000 /  6,072,147
 5,000,000 /  6,072,147
 6,000,000 /  6,072,147
2018-08-19 |  6,153,353 |     66,748
         0 /  6,153,353
 1,000,000 /  6,153,353
 2,000,000 /  6,153,353
 3,000,000 /  6,153,353
 4,000,000 /  6,153,353
 5,000,000 /  6,153,353

In [21]:
# [Debug] Inverted index: load the pickled index of the window keyed by
# `date`, print the number of unique nouns plus the first noun and its
# document frequency, then display the first ten article ids recorded for it.
date = '2018-08-23'

with open('db/inverted_index/inverted_index_{0}.pkl'.format(date), 'rb') as f:
    inverted_idx = pickle.load(f)

unique_noun = tuple(inverted_idx.keys())[0]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(inverted_idx[unique_noun]))
inverted_idx[unique_noun][:10]

The number of unique nouns:  258680 

화재
DF:  1703


['da_20180823235953274',
 'da_20180823224951448',
 'da_20180823224523381',
 'da_20180823213907297',
 'da_20180823213513245',
 'da_20180823212259097',
 'da_20180823212032071',
 'da_20180823211324987',
 'da_20180823205613743',
 'da_20180823202638291']

In [22]:
# [Debug] Unique nouns IDF: load the pickled IDF table of the window keyed by
# `date` and print the IDF of its first ten nouns.
date = '2018-08-23'

with open('db/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date), 'rb') as f:
    unique_nouns_idf = pickle.load(f)
    
unique_nouns = tuple(unique_nouns_idf.keys())[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    print('{0} | {1:5.3}'.format(unique_noun, unique_nouns_idf[unique_noun]))

The number of IDF unique nouns:  258680 

화재 |  1.63
화재원인 |  2.71
원인 |  1.27
어디 |  1.71
인천 |  1.39
윤태현 |  3.39
태현 |   3.0
인천시 |  2.04
남동 |  2.02
남동구 |  2.32


* Inverted index and IDF (section + 기간)

In [24]:
# For every section and every 7-entry window of date_list, build the inverted
# index (noun -> list of article ids) and the IDF table restricted to that
# section and window, and pickle both under "<section>_<first date>".
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Section iteration
for section in section_list:
    # Article.section is filtered with the label from section_dict, not the English key.
    section_label = section_dict[section]

    # Date (window) iteration
    for date_idx in range(len(date_list)-6):
        inverted_idx = {}

        data = []
        a_size = 0

        # Iterate over each date that belongs to the window
        for date in date_list[date_idx:date_idx+7]:
            # Section and date are bound as parameters instead of being
            # formatted into the SQL text.
            cur.execute("SELECT T.a_id, T.term "
                        "FROM Article A, Term T "
                        "WHERE A.a_id = T.a_id AND A.section = ? AND A.date = ?",
                        (section_label, date))
            data.extend(cur.fetchall())

            cur.execute("SELECT COUNT(*) FROM Article WHERE section = ? AND date = ?",
                        (section_label, date))
            a_size += cur.fetchone()[0]

        print('{0} | {1} | {2:10,} | {3:10,}'.format(section, date_list[date_idx], len(data), a_size))

        # Noun iteration
        for a_id, noun in data:
            inverted_idx.setdefault(noun, []).append(a_id)

        # IDF = log10(N / DF) with N = number of articles of this section in
        # the window and DF = number of Term rows recorded for the noun.
        unique_nouns_idf = {noun: log10(a_size / len(a_ids))
                            for noun, a_ids in inverted_idx.items()}

        with open('db/inverted_index/inverted_index_' + section + '_' + date_list[date_idx] + '.pkl', 'wb') as f:
            pickle.dump(inverted_idx, f)

        with open('db/unique_nouns_idf/unique_nouns_idf_' + section + '_' + date_list[date_idx] + '.pkl', 'wb') as f:
            pickle.dump(unique_nouns_idf, f)

society | 2018-08-23 |  2,461,835 |     28,067
society | 2018-08-22 |  2,400,452 |     27,602
society | 2018-08-21 |  2,174,006 |     25,113
society | 2018-08-20 |  2,136,096 |     24,854
society | 2018-08-19 |  2,156,734 |     25,098
society | 2018-08-18 |  2,164,869 |     25,250
society | 2018-08-17 |  2,149,636 |     25,200
society | 2018-08-16 |  2,128,529 |     25,246
politics | 2018-08-23 |    875,952 |      8,721
politics | 2018-08-22 |    917,709 |      9,096
politics | 2018-08-21 |    878,862 |      8,614
politics | 2018-08-20 |    859,569 |      8,477
politics | 2018-08-19 |    867,782 |      8,595
politics | 2018-08-18 |    852,074 |      8,419
politics | 2018-08-17 |    852,605 |      8,483
politics | 2018-08-16 |    870,502 |      8,625
economic | 2018-08-23 |  2,018,170 |     20,532
economic | 2018-08-22 |  2,096,314 |     21,496
economic | 2018-08-21 |  1,862,044 |     19,197
economic | 2018-08-20 |  1,887,706 |     19,833
economic | 2018-08-19 |  1,943,195 |     20,493


In [9]:
# [Debug] Inverted index: load the pickled index of one section and window,
# print the number of unique nouns plus the first noun and its document
# frequency, then display the first ten article ids recorded for it.
section = 'society'
date = '2018-08-23'

with open('db/inverted_index/inverted_index_{0}_{1}.pkl'.format(section, date), 'rb') as f:
    inverted_idx = pickle.load(f)
    
unique_noun = tuple(inverted_idx.keys())[0]

print('The number of unique nouns: ', len(inverted_idx), '\n')
print(unique_noun)
print('DF: ', len(inverted_idx[unique_noun]))
inverted_idx[unique_noun][:10]

The number of unique nouns:  143850 

화재
DF:  1089


['da_20180823235953274',
 'da_20180823224951448',
 'da_20180823224523381',
 'da_20180823213907297',
 'da_20180823213513245',
 'da_20180823212259097',
 'da_20180823212032071',
 'da_20180823211324987',
 'da_20180823205613743',
 'da_20180823202638291']

In [26]:
# [Debug] Unique nouns IDF: load the pickled IDF table of one section and
# window and print the IDF of its first ten nouns.
section = 'society'
date = '2018-08-23'

with open('db/unique_nouns_idf/unique_nouns_idf_{0}_{1}.pkl'.format(section, date), 'rb') as f:
    unique_nouns_idf = pickle.load(f)
    
unique_nouns = tuple(unique_nouns_idf.keys())[:10]

print('The number of IDF unique nouns: ', len(unique_nouns_idf), '\n')
for unique_noun in unique_nouns:
    print('{0} | {1:5.3}'.format(unique_noun, unique_nouns_idf[unique_noun]))

The number of IDF unique nouns:  143850 

화재 |  1.41
화재원인 |  2.44
원인 |  1.21
어디 |  1.87
인천 |  1.21
윤태현 |  3.11
태현 |  3.02
인천시 |  1.72
남동 |  1.74
남동구 |  1.95


## Insert TF-IDFs into Term Table
---

* 일단 별도의 테이블에 1,200만 단어 전부 저장

In [4]:
# # Article table 삭제
# conn = sqlite3.connect('db/news_db_Term1200.db')
# cur = conn.cursor()

# cur.execute("DROP TABLE Article")
# conn.commit()

In [10]:
# Fill Term.tfidf for every row: tfidf = tf_article * IDF, with the IDF taken
# from the corpus-wide pickle.
conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

with open('db/unique_nouns_idf/unique_nouns_idf.pkl', 'rb') as f:
    unique_nouns_idf = pickle.load(f)

cur.execute("SELECT a_t_id, term, tf_article FROM Term")
data = cur.fetchall()

failed_updates = 0

for a_t_id, noun, tf in data:
    idf = unique_nouns_idf[noun]

    # Progress line, keyed on the row id
    if ((a_t_id - 1) % 1000000) == 0:
        print('{0:10,} / {1:10,}'.format(a_t_id - 1, len(data)))

    # Bind the values as parameters: formatting a float into the SQL text
    # yields invalid SQL for inf/nan. Only database errors are tolerated,
    # and they are counted instead of being silently dropped.
    try:
        cur.execute("UPDATE Term SET tfidf = ? WHERE a_t_id = ?",
                    (tf * idf, a_t_id))
    except sqlite3.Error:
        failed_updates += 1

conn.commit()

if failed_updates:
    print('{0:,} Term rows could not be updated'.format(failed_updates))

         0 /  6,680,675
 1,000,000 /  6,680,675
 2,000,000 /  6,680,675
 3,000,000 /  6,680,675
 4,000,000 /  6,680,675
 5,000,000 /  6,680,675
 6,000,000 /  6,680,675


* DB에는 2018-08-23만 기록 (2018-08-17 - 2018-08-23)
* 2주치 IDF 사용


※ UPDATE 쓰지 말자. 되도록이면 한 번에 INSERT. 특히 records 수가 많다면 WHERE 쓸 생각 절대 하지 말 것.

In [7]:
# For every date, collect the Term rows of that date's articles into
# pkl_Term_i: a_t_id -> remaining Term columns.
# NOTE(review): pkl_Term_i is rebuilt for each date but never written out in
# this cell; confirm whether a pickle.dump per date is still missing.
conn = sqlite3.connect('db/news_db_Term1200.db')
cur = conn.cursor()

# Date iteration
for date in date_list:
    print(date)
    print('--------------------------------------------------\n')

    pkl_Term_i = {}

    cur.execute("SELECT a_id FROM Article WHERE date = ?", (date,))
    a_ids = cur.fetchall()

    for a_id_idx, a_id in enumerate(a_ids):
        if (a_id_idx % 1000) == 0:
            print('{0:10,} / {1:10,}'.format(a_id_idx, len(a_ids)))

        cur.execute("SELECT * FROM Term WHERE a_id = ?", (a_id[0],))
        data = cur.fetchall()

        # Store EVERY row of the article. The assignment used to sit outside
        # this loop, so only the last row per article was kept, and an article
        # with no rows raised NameError or reused the previous article's values.
        for d in data:
            pkl_Term_i[d[0]] = d[1:]

    print('--------------------------------------------------\n\n')

2018-08-23
--------------------------------------------------

         0 /     13,471


KeyboardInterrupt: 

In [6]:
# conn = sqlite3.connect('db/news_db.db')
# cur = conn.cursor()

# # Date iteration
# for date in date_list[:7]:
#     print(date)
#     print('--------------------------------------------------\n')

#     unique_nouns_idf = {}
#     with open('db/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date), 'rb') as f:
#         unique_nouns_idf = pickle.load(f)

#     # TODO: inverted_idx에서 a_ids 가져오는 것으로 대체
#     cur.execute("SELECT a_id FROM Article \
#                  WHERE date = '{0}'".format(date))
#     a_ids = cur.fetchall()

#     for a_id_idx, a_id in enumerate(a_ids):
#         if (a_id_idx % 1000) == 0:
#             print('{0:10,} / {1:10,}'.format(a_id_idx, len(a_ids)))

#         cur.execute("SELECT T.a_t_id, T.term, T.tf_article \
#                     FROM Term T \
#                     WHERE T.a_id = '{0}'".format(a_id[0]))
#         data = cur.fetchall()

#         for d in data:
#             a_t_id = d[0]
#             noun = d[1]
#             tf = d[2]
#             idf = unique_nouns_idf[noun]

# #             print(a_t_id, tf * idf)
#             try:
#                 cur.execute("UPDATE Term \
#                             SET tfidf = {0} \
#                             WHERE a_t_id = {1}"
#                             .format(tf * idf, a_t_id))
#             except:
#                 pass
#         else:
#             conn.commit()
            
#     print('--------------------------------------------------\n\n')

In [29]:
# # [Debug]
# conn = sqlite3.connect('db/daum.db')
# cur = conn.cursor()

# unique_nouns_idf = {'태풍': 1., '솔릭': 100., '제주': 10000.}

# for unique_noun_idx, item in enumerate(unique_nouns_idf.items()):
#     if (unique_noun_idx % 10000) == 0:
#         print('{0:10,} / {1:10,}'.format(unique_noun_idx, len(unique_nouns_idf)))
    
#     unique_noun = item[0]
#     idf = item[1]
#     print(unique_noun, idf)
    
#     cur.execute("SELECT a_t_id, tf_article FROM Term WHERE term = '{0}'".format(unique_noun))

#     data = cur.fetchall()
    
#     for d in data:
#         a_t_id = d[0]
#         tf = d[1]
        
#         cur.execute("UPDATE Term \
#                     SET tfidf = {0} \
#                     WHERE a_t_id = {1}"
#                     .format(tf * idf, a_t_id))
#     else:
#         conn.commit()

* 기간(1주일)마다 pickle 저장

In [14]:
# conn = sqlite3.connect('db/news_db.db')
# cur = conn.cursor()

# # Date (기간) iteration
# for date_idx in range(len(date_list)-6):   
#     unique_nouns_idf = {}
#     with open('db/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date_list[date_idx]), 'rb') as f:
#         unique_nouns_idf = pickle.load(f)
#     pkl_Term_i = {}
    
#     print(date_list[date_idx])

#     for unique_noun_idx, item in enumerate(unique_nouns_idf.items()):
#         if (unique_noun_idx % 10000) == 0:
#             print('{0:10,} / {1:10,}'.format(unique_noun_idx, len(unique_nouns_idf)))

#         unique_noun = item[0]
#         idf = item[1]

#         cur.execute("SELECT T.a_t_id, T.a_id, T.term, T.tf_article \
#                     FROM Article A, Term T \
#                     WHERE A.a_id = T.a_id AND A.date = '{0}' AND T.term = '{1}'".format(date, unique_noun))

#         data = cur.fetchall()

#         for d in data:
#             a_t_id = d[0]
#             a_id = d[1]
#             noun = d[2]
#             tf = d[3]
#             tfidf = tf * idf

#             pkl_Term_i[a_t_id] = (a_id, noun, tf, tfidf)
                
#     with open('db/Term/Term_' + date_list[date_idx] + '.pkl', 'wb') as f:
#         pickle.dump(pkl_Term_i, f)

2018-08-23
         0 /        944
2018-08-22
         0 /      1,082
2018-08-21
         0 /      1,170
2018-08-20
         0 /      1,267
2018-08-19
         0 /      1,213
2018-08-18
         0 /      1,198
2018-08-17
         0 /      1,336
2018-08-16
         0 /      1,330


In [22]:
# [Debug] Term_i TF-IDF: load the pickled Term rows of the window keyed by
# `date`, print how many entries it holds and show the first ten.
date = '2018-08-23'

with open('db/Term/Term_{0}.pkl'.format(date), 'rb') as f:
    pkl_Term_i = pickle.load(f)
    
a_t_ids = tuple(pkl_Term_i.keys())[:10]

print('The number of nouns: ', len(pkl_Term_i), '\n')
print('a_t_id | [a_id, term, tf_article, tfidf]')
print('--------------------------------------------------')
for a_t_id in a_t_ids:
    print('{0} | {1}'.format(a_t_id, pkl_Term_i[a_t_id]))

The number of nouns:  54 

a_t_id | [a_id, term, tf_article, tfidf]
--------------------------------------------------
1587 | ['da_20180815235603323', '현장', 0.55, 2.502891222417879]
1769 | ['da_20180815235400307', '경찰', 0.5714285714285714, 2.2895104395067296]
1510 | ['da_20180815235603323', '사건', 0.55, 2.668457720033069]
1638 | ['da_20180815235510313', '사건', 0.7142857142857143, 3.465529506536453]
1715 | ['da_20180815235400307', '처리', 0.5714285714285714, 2.772423605229162]
1696 | ['da_20180815235400307', '조사', 1.0, 4.07359005876739]
1656 | ['da_20180815235510313', '중인', 0.5714285714285714, 2.6004064648497445]
1641 | ['da_20180815235510313', '수사', 0.6428571428571428, 2.81225574927731]
1712 | ['da_20180815235400307', '수사', 0.5714285714285714, 2.499782888246498]
1758 | ['da_20180815235400307', '건물', 0.5714285714285714, 2.772423605229162]


## Insert Unique Nouns and IDFs into U_Term Table
---

* DB에는 2018-08-23만 기록 (2018-08-17 - 2018-08-23)
* 2주치 IDF 사용

In [None]:
# Insert into U_Term every noun that occurs in the window keyed by `date`,
# together with its IDF from the corpus-wide pickle (the notebook calls this
# the two-week IDF).
date = '2018-08-23'

conn = sqlite3.connect('db/news_db.db')
cur = conn.cursor()

# Corpus-wide unique nouns
with open('db/unique_nouns_idf/unique_nouns_idf.pkl', 'rb') as f:
    unique_nouns_idf = pickle.load(f)

# Unique nouns of the selected window
with open('db/unique_nouns_idf/unique_nouns_idf_{0}.pkl'.format(date), 'rb') as f:
    unique_nouns_idf_clip = pickle.load(f)

# Keep only nouns present in the window; nouns outside it are skipped
# (an earlier variant inserted them without an IDF).
records = [(unique_noun, idf)
           for unique_noun, idf in unique_nouns_idf.items()
           if unique_noun in unique_nouns_idf_clip]

# OR IGNORE skips nouns already stored (u_term is the primary key), so the
# cell can be re-run; any other database error is raised instead of hidden.
cur.executemany("INSERT OR IGNORE INTO U_Term(u_term, idf) VALUES(?,?)", records)
conn.commit()

* 기간(1주일)마다 pickle 저장 (앞에서 이미 저장함.)
   * Path: db/unique_nouns_idf/unique_nouns_idf_**[기준 날짜]**.pkl
      * e.g. db/unique_nouns_idf/unique_nouns_idf_2018-08-23.pkl

# TODO: