# NEWS_csv

## module load

In [1]:
import re
import os
import kss
import json
import random
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from konlpy.tag import Mecab
m = Mecab()

## csv 파일 로드 & 나누기

In [2]:
path = "./NEWS_csv/"
files = os.listdir(path)
files.sort()
files = list(filter(lambda x: '.csv' in x, files))
files

['news_body_2018.csv',
 'news_body_2019_1.csv',
 'news_body_2019_2.csv',
 'news_body_2019_3.csv',
 'news_body_2019_4.csv',
 'news_body_2020_1.csv',
 'news_body_2020_2.csv',
 'news_body_2020_3.csv',
 'news_body_2020_4.csv',
 'news_body_2021.csv']

## 영어 기사 필터링 함수

In [3]:
def en_article_filter(df):
    ko_articles = df[df['body'].str.count('[a-z|A-Z|\s|\d|\,|\.|\'|\(|\)|+|-|\/|=|-|\[|\]|\:]{100,}') == 0]
    return ko_articles

## 기사 내용 전처리 함수

In [4]:
def content_proc(text):
    reporter_with_mail = '[가-힣]{2,4}(\s?){1,2}(기자)?(\s?){1,2}(\(?)[a-zA-Z0-9-_.]*@[a-zA-Z0-9\-_.]*(\)?)'
    hash_tags = '#[가-힣a-zA-Z]*'
    reporter_pattern = '[가-힣\s]{3,4}(인턴)?\s?기자'
    mudan_baepo = '무단(\s?)전재.*재배포(\s?)금지'
    gisa_etc = '기사.{2,3}(및?\s)보도자료|사진제공|기사\s?제보\s?및?|copyright|영상편집'
    photo_info = '\/?(\s?){,3}사진(\s?)='
    date_pattern = '[\d|\.]{7,12}'
    email_pattern = '[0-9a-zA-Z]([-_.]?[0-9a-zA-Z])*@[0-9a-zA-Z]([-_.]?[0-9a-zA-Z])*.[a-zA-Z]{2,3}'
    phone_number = '[0-9]{2,3}-[0-9]{3,4}-[0-9]{4}'
    site_addr = 'http(s?)://[a-zA-Z\.\/0-9^가-힣]*|www[a-zA-Z0-9.]*'
    press_name1 = '스포츠(\s?)한국|스포티비뉴스|뉴스1|bnt뉴스|공감언론|뉴시스|시사기획(\s?)창|TV조선|한국경제|데일리안'
    press_name2 = '연합뉴스TV|뉴스엔|마이데일리|동아닷컴|게티이미지코리아'
    bracket_pattern = '\[([^\]]+)\]|\(([^\)]+)\)|\【([^\】]+)\】|<[^>]+>'
    special_char = '[·☞→\*=<>《》\[\]ⓒ©◇◆□■△▲▶▷◀◁/~#&\+á?\xc3\xa1\-\|\:\;\!\-\,\_\~\$\'\"‘’“”\'\"]'
    multi_space = '\s{2,}'
    
    refined = re.sub(reporter_with_mail, '', text)
    refined = re.sub(hash_tags, '', refined)
    refined = re.sub(reporter_pattern, '', refined)
    refined = re.sub(mudan_baepo, '', refined)
    refined = re.sub(gisa_etc, '', refined)
    refined = re.sub(photo_info, '', refined)
    refined = re.sub(date_pattern, '', refined)
    refined = re.sub(email_pattern, '', refined)
    refined = re.sub(phone_number, '', refined)
    refined = re.sub(site_addr, '', refined)
    refined = re.sub(press_name1, '', refined)
    refined = re.sub(press_name2, '', refined)
    refined = re.sub(bracket_pattern, '', refined)
    refined = re.sub(special_char, '', refined)
    refined = re.sub(multi_space, ' ', refined)
    
    return refined

# 전체 기사별 전처리 함수

In [5]:
def en_article_filter(df):
    ko_articles = df[df['body'].str.count('[a-z|A-Z|\s|\d|\,|\.|\'|\(|\)|+|-|\/|=|-|\[|\]|\:]{100,}') == 0]
    return ko_articles

In [6]:
def content_proc(text):
    reporter_with_mail = '[가-힣]{2,4}(\s?){1,2}(기자)?(\s?){1,2}(\(?)[a-zA-Z0-9-_.]*@[a-zA-Z0-9\-_.]*(\)?)'
    hash_tags = '#[가-힣a-zA-Z]*'
    reporter_pattern = '[가-힣\s]{3,4}(인턴)?\s?기자'
    mudan_baepo = '무단(\s?)전재.*재배포(\s?)금지'
    gisa_etc = '기사.{2,3}(및?\s)보도자료|사진제공|기사\s?제보\s?및?|copyright|영상편집'
    photo_info = '\/?(\s?){,3}사진(\s?)='
    date_pattern = '[\d|\.]{7,12}'
    email_pattern = '[0-9a-zA-Z]([-_.]?[0-9a-zA-Z])*@[0-9a-zA-Z]([-_.]?[0-9a-zA-Z])*.[a-zA-Z]{2,3}'
    phone_number = '[0-9]{2,3}-[0-9]{3,4}-[0-9]{4}'
    site_addr = 'http(s?)://[a-zA-Z\.\/0-9^가-힣]*|www[a-zA-Z0-9.]*'
    press_name1 = '스포츠(\s?)한국|스포티비뉴스|뉴스1|bnt뉴스|공감언론|뉴시스|시사기획(\s?)창|TV조선|한국경제|데일리안|연합뉴스|[nN]ews1'
    press_name2 = '연합뉴스TV|뉴스엔|마이데일리|동아닷컴|게티이미지코리아|조선일보|통신사|노컷뉴스|SPOTV|NEWS|엑스포츠뉴스|코리아\s?및\s?전재\s?-?'
    press_name3 = '매경닷컴|MK스포츠|스타투데이|star.mk.co.kr|매일경제|mk.co.kr|디지털타임스|무단복제|Copyrights|YTN|저작권자'
    press_name4 = '재배포\s?금지|OSEN|서울신문사|MBC|한경닷컴|yright|리얼타임|머니투데이\All\sRights\sReserved|CBS|경향신문|경향닷컴'
    bracket_pattern = '\[([^\]]+)\]|\(([^\)]+)\)|\【([^\】]+)\】|<[^>]+>'
    special_char = '[·☞━→\*=<>《》\(\)\[\]ⓒ©○●◇◆□■△▲▶▷◀◁/~#&\+á?\xc3\xa1\-\|\:\;\!\-\,\_\~\$\'\"‘’“”\'\"]'
    multi_space = '\s{2,}'
    
    refined = re.sub(reporter_with_mail, '', text)
    refined = re.sub(hash_tags, '', refined)
    refined = re.sub(reporter_pattern, '', refined)
    refined = re.sub(mudan_baepo, '', refined)
    refined = re.sub(gisa_etc, '', refined)
    refined = re.sub(photo_info, '', refined)
    refined = re.sub(date_pattern, '', refined)
    refined = re.sub(email_pattern, '', refined)
    refined = re.sub(phone_number, '', refined)
    refined = re.sub(site_addr, '', refined)
    refined = re.sub(press_name1, '', refined)
    refined = re.sub(press_name2, '', refined)
    refined = re.sub(press_name3, '', refined)
    refined = re.sub(press_name4, '', refined)
    refined = re.sub(bracket_pattern, '', refined)
    refined = re.sub(special_char, '', refined)
    refined = re.sub(multi_space, ' ', refined)
    
    return refined

In [7]:
# def get_article(df):
#     articles = []
#     for i in tqdm(range(len(df))):
#         try:
#             original_text = df.loc[i].body
#             procd_article = content_proc(original_text)
#             sent_split = kss.split_sentences(procd_article)
            
#             sent_arr = []
#             para = []
#             paragraphs = []

#             for s in split_t:
#                 try:
#                     sent_arr.append(' '.join(m.morphs(s)))
#                 except:
#                     pass

#             # morph된 마지막 문장이 너무 짧을 때 삭제
#             if len(sent_arr[-1]) < 5:    
#                 del sent_arr[-1]

#             for sent in sent_arr:
#                 if len(para) < 4:
#                     para.append(sent)
#                 else:
#                     para.append(sent)
#                     para_concat = '\n'.join(para)
#                     paragraphs.append(para_concat + '\n\n')
#                     para = []
#                     if (len(sent_arr) - idx - 1) // 5 != 0:
#                         para.append(sent)
#                         if idx == len(sent_arr) - 1:
#                             para_concat = '\n'.join(para)
#                             paragraphs.append(para_concat + '\n\n')
#                             para = []
                            
#             article = ''.join(paragraphs)
#             print(article)
#         except:
#             pass

#     return articles

In [8]:
def get_article(df):
    articles = []
    for i in tqdm(df.index):
        original_text = df.loc[i].body
        procd_article = content_proc(original_text)
        sent_split = kss.split_sentences(procd_article)
        if len(sent_split) > 4:
            sent_join = '\n'.join(sent_split[:-2])
            sent_join = re.sub('\s{2,}\.', '', sent_join)
            articles.append(sent_join)
        elif len(sent_split) <= 4:
            pass
    return articles

# 모든 csv 파일 전처리 후 저장

In [9]:
print(path)
files

./NEWS_csv/


['news_body_2018.csv',
 'news_body_2019_1.csv',
 'news_body_2019_2.csv',
 'news_body_2019_3.csv',
 'news_body_2019_4.csv',
 'news_body_2020_1.csv',
 'news_body_2020_2.csv',
 'news_body_2020_3.csv',
 'news_body_2020_4.csv',
 'news_body_2021.csv']

In [None]:
for file in files:
    df = pd.read_csv(path + file)
    ko_articles = en_article_filter(df)
    articles = get_article(ko_articles)
    full_text = '\n'.join(articles)
    full_text_fix = re.sub('\n{3,}', '\n\n\n', full_text)
    file_name = file.replace('.csv', '.txt')
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(full_text_fix)

In [10]:
df = pd.read_csv(path + files[-1])

In [11]:
df

Unnamed: 0,body
0,epa09118476 Media members walk through the dig...
1,epa09118480 Media members look at the digital ...
2,epa09118479 Media members look at the digital ...
3,epa09118478 A media member looks at the digita...
4,[서울=뉴시스]정의용 외교부 장관과 왕이 중국 외교부장이 3일 중국 푸젠성 샤먼 하...
...,...
1395347,드림캐쳐 드림캐쳐의 새로운 세계가 눈을 뜬다. 드림캐쳐는 26일 오후 6시 여섯 번...
1395348,그룹 투모로우바이투게더가 오리콘 차트 정상을 지켰다. 빅히트엔터테인먼트 제공 그룹 ...
1395349,[스타뉴스 한해선 기자] /사진=MBC '열정 만수르' 동방신기 유노윤호가 '라디오...
1395350,[스타뉴스 김미화 기자] 배주현(아이린) / 사진='더블패티' 무대에서 독보적인 카...


In [144]:
len(df)

1395352

In [12]:
ko_articles = en_article_filter(df)

In [13]:
len(ko_articles)

1172695

In [146]:
len(articles)

308

In [153]:
articles = get_article(ko_articles)

  0%|          | 0/1172695 [00:00<?, ?it/s]

In [155]:
len(articles)

898325

In [160]:
full_text = '\n\n\n'.join(articles[600001:])
full_text_fix = re.sub('\n{4,}', '\n\n\n', full_text)
file_name = file.replace('.csv', '_03.txt')
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(full_text_fix)

# 파일 용량 맞춰 나누기

In [3]:
path = "./NEWS_txt/"
files = os.listdir(path)
files.sort()

In [4]:
files

['news_body_2018.txt',
 'news_body_2019_1.txt',
 'news_body_2019_2.txt',
 'news_body_2019_3.txt',
 'news_body_2019_4.txt',
 'news_body_2020_1.txt',
 'news_body_2020_2.txt',
 'news_body_2020_3.txt',
 'news_body_2020_4.txt',
 'news_body_2021.txt']

In [5]:
with open(path + files[8], 'r', encoding='utf-8') as f:
    news_202 = f.read()
    
article = news_202.split('\n\n\n')
print(len(article))

558795


In [6]:
len(article) - 280000

278795

In [8]:
with open(path + files[9], 'r', encoding='utf-8') as f:
    news_2020 = f.read()
    
articles = news_2020.split('\n\n\n')
print(len(articles))

719182


In [12]:
len(articles) - 340000

379182

In [14]:
file_name = files[9].replace('.txt', '_2.txt')

with open('./NEWS_splitted/' + file_name, 'w', encoding='utf-8') as f:
    f.write('\n\n\n'.join(articles[530000:]))