# Preprocessing
---

In [1]:
import pandas as pd
import re
import sqlite3

In [2]:
# Korean display name for each news-source identifier.
press_dict = {
    'daum': '다음',
    'naver': '네이버',
    'seoulilbo': '서울일보',
    'dtoday': '일간투데이',
    'asiailbo': '아시아일보',
    'labortoday': '매일노동뉴스',
    'm-i': '매일일보',
    'ekn': '에너지경제',
    'busan': '부산일보',
    'imaeil': '매일신문',
    'kookje': '국제신문',
    'yeongnam': '영남일보',
}

# Short identifiers of the news sources, in a fixed order (same keys as press_dict).
press_list = [
    'daum',
    'naver',
    'seoulilbo',
    'dtoday',
    'asiailbo',
    'labortoday',
    'm-i',
    'ekn',
    'busan',
    'imaeil',
    'kookje',
    'yeongnam',
]

# Press names to keep in the `daum` table: delete_excluded_press('daum', 'daum', ...)
# removes every row whose `press` value is not in this list.
daum_press_list = ['EBS', 'IT동아', 'JTBC', 'KBS', 'KTV',
                   'MBC', 'MBN', 'SBS', 'SBS CNBC', 'YTN',
                   'ZDNet Korea', 'bnt뉴스', '게임동아', '게임톡', '경향신문',
                   '국민일보', '기자협회보', '노컷뉴스', '뉴스1', '뉴시스',
                   '데일리안', '동아사이언스', '동아일보', '디지털타임스', '로이터',
                   '매경게임진', '매일경제', '머니S', '머니투데이', '문화일보',
                   '미디어오늘', '서울경제', '서울신문', '세계일보', '아시아경제',
                   '아이뉴스24', '연합뉴스', '연합뉴스TV', '오마이뉴스', '오토타임즈',
                   '이데일리', '전자신문', '조선비즈', '조선일보', '중앙일보',
                   '채널A', '코리아헤럴드', '쿠키뉴스', '파이낸셜뉴스', '포토친구',
                   '프레시안', '한겨레', '한국경제', '한국경제TV', '한국일보',
                   '헤럴드경제']
# Press names to keep in the `naver` table (same role as daum_press_list, used by
# delete_excluded_press('naver', 'naver', ...)).
naver_press_list = ['JTBC', 'KBS', 'MBC', 'MBN', 'SBS',
                    'SBS CNBC', 'TV조선', 'YTN', 'ZDNet Korea', '강원일보',
                    '경향신문', '국민일보', '기자협회보', '노컷뉴스', '뉴스1',
                    '뉴시스', '데일리안', '동아사이언스', '동아일보', '디지털데일리',
                    '디지털타임스', '로이터', '매일경제', '매일신문', '머니S',
                    '머니투데이', '문화일보', '미디어오늘', '부산일보', '블로터',
                    '서울경제', '서울신문', '세계일보', '아시아경제', '아이뉴스24',
                    '여성신문', '연합뉴스', '연합뉴스TV', '오마이뉴스', '이데일리',
                    '일다', '전자신문', '조선비즈', '조선일보', '조세일보',
                    '중앙일보', '참세상', '채널A', '코리아헤럴드', '코메디닷컴',
                    '파이낸셜뉴스', '프레시안', '한겨레', '한국경제', '한국경제TV',
                    '한국일보', '헤럴드경제', '헬스조선']

# Korean label for each section key.
section_dict = {
    'society': '사회',
    'politics': '정치',
    'economic': '경제',
    'culture': '문화',
    'digital': 'IT',
    'global': '세계',
}

# Section keys in a fixed order (same keys as section_dict and as each base_urls entry).
section_list = [
    'society',
    'politics',
    'economic',
    'culture',
    'digital',
    'global',
]

# Listing-page URL for each press and section: base_urls[press][section].
# The outer keys are press identifiers from press_list, the inner keys are the
# section keys from section_list. An empty string marks a section for which no
# URL is configured.
# NOTE(review): 'naver' is in press_list but has no entry here — confirm intentional.
# This table is not referenced by any other cell visible in this notebook.
base_urls = {'daum':
             {'society':'http://media.daum.net/breakingnews/society',
              'politics':'http://media.daum.net/breakingnews/politics',
              'economic':'http://media.daum.net/breakingnews/economic',
              'culture':'http://media.daum.net/breakingnews/culture',
              'digital':'http://media.daum.net/breakingnews/digital',
              'global':'http://media.daum.net/breakingnews/foreign'
             },
             'seoulilbo':
             {'society':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N10&view_type=sm',
              'politics':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N8&view_type=sm',
              'economic':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N9&view_type=sm',
              'culture':'http://www.seoulilbo.com/news/articleList.html?sc_section_code=S1N11&view_type=sm',
              'digital':'',
              'global':''
             },
             'dtoday':
             {'society':'',
              'politics':'http://www.dtoday.co.kr/news/articleList.html?sc_section_code=S1N1&view_type=sm',
              'economic':'http://www.dtoday.co.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'culture':'',
              'digital':'',
              'global':''
             },
             'asiailbo':
             {'society':'http://www.asiailbo.co.kr/etnews/?cid=21030000',
              'politics':'http://www.asiailbo.co.kr/etnews/?cid=21010000',
              'economic':'http://www.asiailbo.co.kr/etnews/?cid=21020000',
              'culture':'http://www.asiailbo.co.kr/etnews/?cid=21040000',
              'digital':'',
              'global':''
             },
             'labortoday':
             {'society':'http://www.labortoday.co.kr/news/articleList.html?sc_section_code=S1N3&view_type=sm',
              'politics':'http://www.labortoday.co.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'economic':'',    # politics, economy (presumably one combined section on this site — TODO confirm)
              'culture':'',
              'digital':'',
              'global':''
             },
             'm-i':
             {'society':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N3&view_type=sm',
              'politics':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N1&view_type=tm',
              'economic':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N2&view_type=sm',
              'culture':'http://www.m-i.kr/news/articleList.html?sc_section_code=S1N22&view_type=tm',
              'digital':'',
              'global':''
             },
             'ekn':
             {'society':'http://www.ekn.kr/news/section_list_all.html?sec_no=25',
              'politics':'',    # politics, society (presumably one combined section on this site — TODO confirm)
              'economic':'http://www.ekn.kr/news/section_list_all.html?sec_no=130',
              'culture':'',
              'digital':'',
              'global':''
             },
             'busan':
             {'society':'http://news20.busan.com/news/social.jsp',
              'politics':'http://news20.busan.com/news/politics.jsp',
              'economic':'http://news20.busan.com/EconomyAndOcean/econocean.jsp',
              'culture':'http://news20.busan.com/news/culture.jsp',
              'digital':'',
              'global':''
             },
             'imaeil':
             {'society':'http://news.imaeil.com/SocietyAll/',
              'politics':'http://news.imaeil.com/PoliticsAll/',
              'economic':'http://news.imaeil.com/EconomyAll/',
              'culture':'http://news.imaeil.com/CultureAll/',
              'digital':'',
              'global':'http://news.imaeil.com/InternationalAll/'
             },
             'kookje':
             {'society':'http://www.kookje.co.kr/sub.htm?code=0300&vHeadTitle=%BB%E7%C8%B8',
              'politics':'http://www.kookje.co.kr/sub.htm?code=0100&vHeadTitle=%C1%A4%C4%A1',
              'economic':'http://www.kookje.co.kr/sub.htm?code=0200&vHeadTitle=%B0%E6%C1%A6',
              'culture':'http://www.kookje.co.kr/sub.htm?code=0500&vHeadTitle=%B9%AE%C8%AD',
              'digital':'http://www.kookje.co.kr/sub.htm?code=0800&vHeadTitle=IT%B0%FA%C7%D0',
              'global':'http://www.kookje.co.kr/sub.htm?code=0400&vHeadTitle=%B1%B9%C1%A6'
             },
             'yeongnam':
             {'society':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=04',
              'politics':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=02',
              'economic':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=03',
              'culture':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=08',
              'digital':'',    # education, science (presumably the closest section on this site — TODO confirm)
              'global':'http://www.yeongnam.com/mnews/newsview.do?mode=subMain&cId=06'
             }
            }

# Header mapping carrying a desktop Chrome user-agent string; presumably sent with
# HTTP requests by the crawling code — it is not used by any cell visible here.
headers = {"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

In [3]:
# Named regular-expression patterns. 'email' matches an address-like token:
# one or more local-part characters, '@', a run of letters/digits/hyphens, a
# literal dot, then one or more letters/digits/hyphens/dots.
# The pattern is not applied by any cell visible in this notebook.
reg_ex = {'email':r'[a-zA-Z0-9.!#$%&\'*+/=?^_`{|}~-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'}

In [4]:
# Disabled one-off maintenance snippet, kept for reference: deletes the rows of the
# `daum` table whose `sections` value matches '세계' (the label section_dict maps
# 'global' to).
# NOTE(review): no module-level `conn`/`cur` exists at this point in the notebook,
# so a connection has to be opened first if this is ever re-enabled.
# # Delete
# cur.execute("DELETE FROM daum WHERE sections LIKE '세계'")

# conn.commit()

In [5]:
# Disabled one-off migration snippet, kept for reference. It renames the existing
# `daum` table to `daum_temp`, recreates `daum` with the columns `press` and
# `sections`, and copies every row across, mapping press_ko -> press and
# section_ko -> sections. `daum_temp` is not dropped by this snippet.
# NOTE(review): no module-level `conn`/`cur` exists at this point in the notebook,
# so a connection has to be opened first if this is ever re-enabled.
# # Rename column
# cur.execute("ALTER TABLE daum RENAME TO daum_temp")

# cur.execute("CREATE TABLE daum(a_ids TEXT primary key, \
#             dates DATE, times TIME, titles TEXT, contents TEXT, \
#             press TEXT, authors TEXT, sections TEXT, urls TEXT)")

# cur.execute("INSERT INTO daum (a_ids, dates, times, titles, contents, press, authors, sections, urls) \
#             SELECT a_ids, dates, times, titles, contents, press_ko, authors, section_ko, urls \
#             FROM daum_temp")

# conn.commit()

## Null Data 제거
---

* dates, times, titles, press 컬럼 중 하나라도 NULL이거나 빈 문자열(공백만 있는 경우 포함)인 행을 삭제한다.

In [6]:
def delete_null_data(db_name, table_name):
    """Delete rows whose key columns are missing.

    A row is removed when any of ``dates``, ``times``, ``titles`` or ``press``
    is NULL or is an empty string after SQLite's ``TRIM``.

    Parameters
    ----------
    db_name : str
        Base name of the database file; ``db/<db_name>.db`` is opened.
    table_name : str
        Table to clean. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so pass trusted names only.

    Returns
    -------
    None

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or a statement fails; in that case
        none of the deletes are committed.
    """
    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        # One transaction for all four deletes: commit on success, roll back on error.
        with conn:
            for col in ('dates', 'times', 'titles', 'press'):
                conn.execute(
                    "DELETE FROM {0} WHERE {1} IS NULL OR TRIM({1}) = ''".format(table_name, col))
    finally:
        # Always release the database file; the connection is local to this call.
        conn.close()

In [7]:
# Remove rows with missing dates/times/titles/press from table `daum` in db/daum.db.
delete_null_data('daum', 'daum')

In [8]:
# Remove rows with missing dates/times/titles/press from table `naver` in db/naver.db.
delete_null_data('naver', 'naver')

## Daum, Naver에서 리스트에 없는 언론사 제외
---

In [9]:
def delete_excluded_press(db_name, table_name, press_list):
    """Delete rows whose ``press`` value is not in an allow-list.

    The press names are passed to SQLite as bound parameters, so a list with a
    single name and names containing quote characters are handled correctly.

    Parameters
    ----------
    db_name : str
        Base name of the database file; ``db/<db_name>.db`` is opened.
    table_name : str
        Table to clean. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so pass trusted names only.
    press_list : iterable of str
        Press names to keep. An empty iterable keeps nothing: every row is
        deleted.

    Returns
    -------
    None

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or the statement fails; nothing is
        committed in that case.
    """
    allowed = list(press_list)
    # One '?' per allowed name, e.g. "?, ?, ?" for three names.
    placeholders = ', '.join('?' * len(allowed))

    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        # Commit on success, roll back on error.
        with conn:
            conn.execute(
                "DELETE FROM {0} WHERE press NOT IN ({1})".format(table_name, placeholders),
                allowed)
    finally:
        # Always release the database file; the connection is local to this call.
        conn.close()

In [10]:
# Keep only the presses named in daum_press_list in table `daum` (db/daum.db).
delete_excluded_press('daum', 'daum', daum_press_list)

In [11]:
# Keep only the presses named in naver_press_list in table `naver` (db/naver.db).
delete_excluded_press('naver', 'naver', naver_press_list)

## Contents 길이가 100 미만일 경우 제거
---

In [12]:
# Content-length analysis
def analysis_contents_length(db_name, table_name):
    """Load every ``contents`` value of a table together with its length.

    Parameters
    ----------
    db_name : str
        Base name of the database file; ``db/<db_name>.db`` is opened.
    table_name : str
        Table to read. It is concatenated into the SQL text, so pass trusted
        names only.

    Returns
    -------
    pandas.DataFrame
        One row per table row with two columns: ``contents`` (the stored
        value) and ``contents_length`` (``len()`` of that value). A NULL
        ``contents`` yields a missing length instead of raising.

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or the query fails.
    """
    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        rows = conn.execute("SELECT contents FROM " + table_name).fetchall()
    finally:
        # Read-only query: nothing to commit, just release the database file.
        conn.close()

    contents = [row[0] for row in rows]
    # Compute the lengths up front and build the frame in one go. Assigning
    # through `df.contents_length.iloc[idx] = ...` is chained assignment, which
    # pandas warns about and which does not write through under copy-on-write.
    lengths = [None if text is None else len(text) for text in contents]

    return pd.DataFrame({'contents': contents, 'contents_length': lengths})

In [13]:
# Load the Daum article bodies with their lengths and preview the first rows.
df = analysis_contents_length('daum', 'daum')

df.head()    

Unnamed: 0,contents,contents_length
0,이런 가운데 인천 남동공단 화재로 숨진 직원들의 안타까운 사연들도 알려졌습니다. \...,830
1,한층 강해진 파도 (서귀포=연합뉴스) 박지호 기자 = 제19호 태풍 솔릭의 접근으로...,1775
2,【목포=뉴시스】 신대희 기자 = 전남 목포경찰서는 22일 함께 낚시하던 지인을 흉기...,351
3,[한국경제TV 김현경 기자]\n\n\n\n제19호 태풍 '솔릭'이 제주 서귀포 남쪽...,818
4,<앵커>\n박근혜 정부 시절 사법부가 '일제 강제징용 피해자 재판'에 개입했다는 의...,744


In [14]:
# Row count, dtypes and non-null counts of the Daum length frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1354 entries, 0 to 1353
Data columns (total 2 columns):
contents           1354 non-null object
contents_length    1354 non-null int64
dtypes: int64(1), object(1)
memory usage: 21.2+ KB


In [15]:
# Summary statistics with extra low-end percentiles (1%-5%, 10%) of the Daum
# content lengths; presumably used to choose the minimum-length cut-off applied later.
df.describe(percentiles=(0.01, 0.02, 0.03, 0.04, 0.05, 0.1))

Unnamed: 0,contents_length
count,1354.0
mean,845.5
std,748.710942
min,0.0
1%,15.01
2%,63.42
3%,84.0
4%,91.0
5%,95.0
10%,120.3


In [16]:
# Same analysis for Naver. Note that `df` is rebound here, so from this cell on it
# holds the Naver frame rather than the Daum one.
df = analysis_contents_length('naver', 'naver')

df.head()

Unnamed: 0,contents,contents_length
0,"고용쇼크 속 장하성-김동연 이어지는 불화설\n靑 ""실적 보이라는 경고""…張, 마지막...",1945
1,문재인 대통령은 20일 대통령 소속 군사망사고진상규명위원회 위원장에 이인람 법무법인...,665
2,【서울=뉴시스】 문재인 대통령은 20일 군사망사고진상규명위원회 위원장으로 이인람(6...,267
3,[ 손성태 기자 ]\n\n문재인 대통령은 20일 대통령 소속 군사망사고진상규명위원회...,567
4,"문 대통령 ""고용 악화 마음 무겁다""\n경제팀 경질·정책 수정은 '일축'\n\n[ ...",1454


In [17]:
# Row count, dtypes and non-null counts of the Naver length frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2357 entries, 0 to 2356
Data columns (total 2 columns):
contents           2357 non-null object
contents_length    2357 non-null int64
dtypes: int64(1), object(1)
memory usage: 36.9+ KB


In [18]:
# Summary statistics with extra low-end percentiles (1%-5%, 10%) of the Naver
# content lengths.
df.describe(percentiles=(0.01, 0.02, 0.03, 0.04, 0.05, 0.1))

Unnamed: 0,contents_length
count,2357.0
mean,900.074671
std,654.317877
min,109.0
1%,155.0
2%,168.12
3%,177.0
4%,185.24
5%,195.0
10%,259.6


In [19]:
def delete_short_contents(db_name, table_name, threshold_len):
    """Delete rows whose ``contents`` is shorter than a threshold.

    Rows are removed when ``LENGTH(contents) < threshold_len``. Rows whose
    ``contents`` is NULL are left untouched, because ``LENGTH(NULL)`` is NULL
    and the comparison is then not true.

    Parameters
    ----------
    db_name : str
        Base name of the database file; ``db/<db_name>.db`` is opened.
    table_name : str
        Table to clean. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so pass trusted names only.
    threshold_len : int or float (or a value ``float()`` accepts)
        Minimum length a row must have to be kept.

    Returns
    -------
    None

    Raises
    ------
    ValueError, TypeError
        If ``threshold_len`` cannot be interpreted as a number.
    sqlite3.Error
        If the database cannot be opened or the statement fails; nothing is
        committed in that case.
    """
    # Bind the threshold as a number. A bound *text* value would compare greater
    # than every integer length in SQLite and the statement would wipe the table.
    if isinstance(threshold_len, (int, float)):
        threshold = threshold_len
    else:
        threshold = float(threshold_len)

    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        # Commit on success, roll back on error.
        with conn:
            conn.execute(
                "DELETE FROM {0} WHERE LENGTH(contents) < ?".format(table_name),
                (threshold,))
    finally:
        # Always release the database file; the connection is local to this call.
        conn.close()

In [20]:
# Delete Daum rows with LENGTH(contents) < 100, then reload the lengths and
# summarise them to confirm the new minimum.
delete_short_contents('daum', 'daum', 100)

df = analysis_contents_length('daum', 'daum')

df.describe()

Unnamed: 0,contents_length
count,1274.0
mean,894.332025
std,745.205797
min,100.0
25%,387.25
50%,759.0
75%,1164.0
max,8172.0


In [21]:
# Delete Naver rows with LENGTH(contents) < 100, then reload the lengths and
# summarise them to confirm the new minimum.
delete_short_contents('naver', 'naver', 100)

df = analysis_contents_length('naver', 'naver')

df.describe()

Unnamed: 0,contents_length
count,2357.0
mean,900.074671
std,654.317877
min,109.0
25%,429.0
50%,758.0
75%,1183.0
max,10210.0


## Contents가 같을 경우 제거
---

In [22]:
# Count the distinct article bodies in the Daum table, to compare against the
# total row count before de-duplicating.
conn = sqlite3.connect('db/daum.db')
try:
    cur = conn.cursor()
    cur.execute("SELECT DISTINCT contents FROM daum")
    daum_distinct_contents = len(cur.fetchall())
finally:
    # Read-only check: nothing to commit. Close the connection here because the
    # name `conn` is rebound by a later cell, which would otherwise leave this
    # connection open for the rest of the session.
    conn.close()

daum_distinct_contents

1214

In [23]:
# Count the distinct article bodies in the Naver table, to compare against the
# total row count before de-duplicating.
conn = sqlite3.connect('db/naver.db')
try:
    cur = conn.cursor()
    cur.execute("SELECT DISTINCT contents FROM naver")
    naver_distinct_contents = len(cur.fetchall())
finally:
    # Read-only check: nothing to commit, so just release the database file
    # instead of leaving the connection open for the rest of the session.
    conn.close()

naver_distinct_contents

2357

In [24]:
def delete_dupl_contents(db_name, table_name):
    """Delete rows whose ``contents`` duplicates another row's.

    Rows are grouped by ``contents``; within each group only the row with the
    smallest ``a_ids`` is kept and the others are deleted.

    Parameters
    ----------
    db_name : str
        Base name of the database file; ``db/<db_name>.db`` is opened.
    table_name : str
        Table to clean. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so pass trusted names only.

    Returns
    -------
    None

    Raises
    ------
    sqlite3.Error
        If the database cannot be opened or the statement fails; nothing is
        committed in that case.
    """
    conn = sqlite3.connect('db/' + db_name + '.db')
    try:
        # Commit on success, roll back on error.
        with conn:
            conn.execute(
                "DELETE FROM {0} WHERE a_ids NOT IN (SELECT min(a_ids) FROM {0} GROUP BY contents)"
                .format(table_name))
    finally:
        # Always release the database file; the connection is local to this call.
        conn.close()

In [25]:
# De-duplicate table `daum` (db/daum.db) by contents, keeping the smallest a_ids per body.
delete_dupl_contents('daum', 'daum')

In [26]:
# De-duplicate table `naver` (db/naver.db) by contents, keeping the smallest a_ids per body.
delete_dupl_contents('naver', 'naver')