In [10]:
import pandas as pd
import numpy as np
import konlpy
from konlpy.tag import Kkma
import re

In [11]:
# Load the YTN article table. header=None means no row is consumed as a header,
# so the columns are addressed by integer position labels (0, 1, 2, ...).
df = pd.read_csv(
    'ytn_final.csv',
    header=None,
)

In [12]:
# Grab one sample article for experimentation: row label 56 of column 4
# (column 4 is the column later fed to text_analysis).
text = df.loc[56, 4]

In [13]:
def endRemove(og_text, tail_len=200):
    """Strip trailing boilerplate from an article while leaving the body untouched.

    Only the last ``tail_len`` characters are cleaned; everything before that
    is returned verbatim, so look-alike patterns inside the article body are
    never removed.

    Removed from the tail, in this order:
      1. e-mail addresses, optionally wrapped in parentheses
      2. square-bracketed notes (letters, digits, Hangul, spaces, parentheses)
         that are followed by a space
      3. the literal ``YTN `` plus any following run of Hangul/spaces
      4. ``※ '...'`` quoted notices
      5. dash-separated digit groups such as phone numbers (``02-398-8585``)

    Args:
        og_text: The article text (str).
        tail_len: How many characters at the end of the text are eligible for
            cleaning. Defaults to 200. Texts shorter than this are cleaned in
            full.

    Returns:
        The text with the matched tail patterns removed.

    Raises:
        ValueError: If ``tail_len`` is negative.
    """
    if tail_len < 0:
        raise ValueError("tail_len must be non-negative")

    # An explicit split index (instead of negative slicing) keeps the
    # behaviour correct for tail_len == 0, where og_text[:-0] would be empty.
    split_at = max(len(og_text) - tail_len, 0)
    front_text = og_text[:split_at]
    end_text = og_text[split_at:]

    # Raw strings: sequences like "\(" or "\[" are invalid escapes in normal
    # string literals and trigger warnings on current Python versions.
    end_text = re.sub(r'\(?[0-9A-Za-z]*\@[A-Za-z]+\.[a-z]*\.*[a-z]*\)?', '', end_text)  # email removed
    end_text = re.sub(r'\[[()0-9A-Za-zㄱ-ㅎㅏ-ㅣ가-힣 ]*\] ', '', end_text)  # last comments removed
    end_text = re.sub(r'YTN [가-힣 ]*', '', end_text)  # ytn journalist removed
    end_text = re.sub(r"※ '[A-Za-zㄱ-ㅎㅏ-ㅣ가-힣 ]*'", '', end_text)  # message removed
    end_text = re.sub(r'[0-9]+-[0-9]+-[0-9]+', '', end_text)  # phone number removed
    return front_text + end_text

In [14]:
def speakRemove(og_text):
    """Remove broadcast-script markup from an article.

    The speaker labels ``[앵커]`` (anchor) and ``[기자]`` (reporter) are deleted
    entirely. Any square brackets that remain afterwards are stripped, but the
    text they enclosed is kept.

    Args:
        og_text: The article text (str).

    Returns:
        The text without speaker labels and without ``[`` / ``]`` characters.
    """
    # Raw strings avoid the invalid "\[" / "\]" escape sequences that normal
    # string literals warn about on current Python versions.
    text = re.sub(r'\[앵커\]', '', og_text)
    text = re.sub(r'\[기자\]', '', text)
    # Whatever brackets are left belong to other labels: drop only the
    # bracket characters themselves.
    text = re.sub(r'[\[\]]', '', text)
    return text

In [15]:
kkma = Kkma()
def text_analysis(text):
    """Clean one article and run Kkma morphological analysis on it.

    The text is first passed through endRemove and speakRemove, then tagged
    with the module-level ``kkma`` analyzer.

    Args:
        text: Raw article text (str).

    Returns:
        dict with three keys:
          'text'   - the cleaned article string
          'tokens' - list of morpheme strings
          'pos'    - list of (morpheme, POS tag) pairs from kkma.pos
    """
    # This will take a slightly long time!!!
    cleaned = speakRemove(endRemove(text))
    # Tag once and derive the plain morphemes from the tagged pairs. Calling
    # both kkma.morphs() and kkma.pos() analyzes the same text twice; KoNLPy's
    # Kkma.morphs is documented as the surface forms of Kkma.pos.
    # NOTE(review): confirm against the installed KoNLPy version.
    pos_tokens = kkma.pos(cleaned)
    tokens = [morph for morph, _tag in pos_tokens]
    return {'text': cleaned, 'tokens': tokens, 'pos': pos_tokens}
    

## Dataset: dictionary

- Dataset['text'] -> list of cleaned article strings (one per article)
- Dataset['tokens'] -> list of lists of morpheme tokens (one inner list per article)
- Dataset['pos'] -> list of lists of (morpheme, POS tag) pairs (one inner list per article)


In [52]:
# Run the (slow) analysis over every article in column 4 and collect the
# per-article results into parallel lists keyed by 'text', 'tokens' and 'pos'.
Dataset = {'text': [], 'tokens': [], 'pos': []}
for art in df[4]:
    result = text_analysis(art)
    # text_analysis returns exactly the same keys as Dataset, so each field
    # is appended to its matching list.
    for key in Dataset:
        Dataset[key].append(result[key])

In [42]:
# NOTE(review): this prints the POS-tagged tokens of every article at once,
# which produces a very large output; consider previewing a slice instead,
# e.g. Dataset['pos'][0][:20].
print(Dataset['pos'])

[[('음식', 'NNG'), ('을', 'JKO'), ('씹', 'VV'), ('기', 'ETN'), ('어렵', 'VA'), ('ㄴ', 'ETD'), ('노인', 'NNG'), ('은', 'JX'), ('노쇠', 'NNG'), ('위험', 'NNG'), ('이', 'JKS'), ('2.7', 'NR'), ('배', 'NNG'), ('높', 'VA'), ('아', 'ECD'), ('노년기', 'NNG'), ('에', 'JKM'), ('급격', 'XR'), ('하', 'XSA'), ('ㄴ', 'ETD'), ('노쇠', 'NNG'), ('를', 'JKO'), ('막', 'VV'), ('으려', 'ECD'), ('면', 'NNG'), ('평소', 'NNG'), ('치아', 'NNG'), ('건강', 'NNG'), ('을', 'JKO'), ('유지', 'NNG'), ('하', 'XSV'), ('는', 'ETD'), ('것', 'NNB'), ('이', 'JKS'), ('중요', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('것', 'NNB'), ('으로', 'JKM'), ('나타나', 'VV'), ('었', 'EPT'), ('습니다', 'EFN'), ('.', 'SF'), ('서울', 'NNG'), ('아산', 'NNP'), ('병원', 'NNG'), ('노년', 'NNG'), ('내과', 'NNG'), ('정', 'NNG'), ('희원', 'NNG'), (',', 'SP'), ('빛', 'NNG'), ('고을', 'NNG'), ('전', 'NNG'), ('남대', 'NNG'), ('병원', 'NNG'), ('노년', 'NNG'), ('내과', 'NNG'), ('강', 'NNG'), ('민', 'NNG'), ('구', 'NNG'), ('교수', 'NNG'), ('공동', 'NNG'), ('연구', 'NNG'), ('팀', 'NNG'), ('은', 'JX'), ('2016', 'NR'), ('∼', 'SO'), ('2018', 'NR'), ('년'

In [53]:
import json

# Save the tokenized corpus as JSON so the slow Kkma pass does not have to be re-run.
tokenized_json_path = "C:/Users/lhi30/Desktop/Writing_Advice/Data/articles_ytn_tokenized.json"
with open(tokenized_json_path, mode="w") as sink:
    json.dump(Dataset, fp=sink)

In [3]:
import json
 
# Reload the previously saved tokenized corpus from disk into `data`.
tokenized_json_path = "C:/Users/lhi30/Desktop/Writing_Advice/Data/articles_ytn_tokenized.json"
with open(tokenized_json_path, mode="r") as source:
    data = json.load(fp=source)

In [4]:
# Spot-check the POS pairs of the article at index 4 after the JSON reload.
# The pairs are displayed as two-element lists rather than tuples, because
# JSON has no tuple type.
data['pos'][4]

[['교육', 'NNG'],
 ['단체', 'NNG'],
 ['들', 'XSN'],
 ['이', 'JKS'],
 ['IB', 'OL'],
 [',', 'SP'],
 ['국제', 'NNG'],
 ['바', 'NNG'],
 ['칼', 'NNG'],
 ['로', 'JKM'],
 ['레', 'NNG'],
 ['아', 'XSN'],
 ['도입', 'NNG'],
 ['과', 'JC'],
 ['자', 'NNG'],
 ['사고', 'NNG'],
 ['·', 'SP'],
 ['특', 'UN'],
 ['목', 'NNG'],
 ['고', 'XSN'],
 ['존치', 'NNG'],
 [',', 'SP'],
 ['대학', 'NNG'],
 ['자율권', 'NNG'],
 ['확대', 'NNG'],
 ['기조', 'NNG'],
 ['가', 'JKS'],
 ['맞물리', 'VV'],
 ['면', 'ECE'],
 [',', 'SP'],
 ['기존', 'NNG'],
 ['의', 'JKG'],
 ['학교', 'NNG'],
 ['서열', 'NNG'],
 ['체제', 'NNG'],
 ['를', 'JKO'],
 ['강화', 'NNG'],
 ['하', 'XSV'],
 ['ㄹ', 'ETD'],
 ['것', 'NNB'],
 ['이', 'VCP'],
 ['라며', 'ECE'],
 ['교육부', 'NNG'],
 ['핵심', 'NNG'],
 ['정책', 'NNG'],
 ['들', 'XSN'],
 ['을', 'JKO'],
 ['비판', 'NNG'],
 ['하', 'XSV'],
 ['었', 'EPT'],
 ['습니다', 'EFN'],
 ['.', 'SF'],
 ['사교육', 'NNG'],
 ['걱정', 'NNG'],
 ['없', 'VA'],
 ['는', 'ETD'],
 ['세상', 'NNG'],
 ['과', 'JKM'],
 ['좋', 'VA'],
 ['은', 'ETD'],
 ['교사', 'NNG'],
 ['운동', 'NNG'],
 [',', 'SP'],
 ['교육', 'NNG'],
 ['의', 'JKG'],
 ['