# Press 데이터 가져오기

In [1]:
import os
import pymysql
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

# MariaDB connection settings, read from environment variables.
# load_dotenv() is called above, so values may come from a .env file.
# os.getenv returns None for any variable that is not set.
MARIA_USER = os.getenv("DB_USER")
MARIA_PASSWORD = os.getenv("DB_PASS")
MARIA_IP = os.getenv("DB_IP")
MARIA_PORT = os.getenv("DB_PORT")
MARIA_NAME = os.getenv("DB_NAME")

def connect_maria_db():
    """Open a new PyMySQL connection to the configured MariaDB server.

    Host, user, password and database name come from the module-level
    ``MARIA_*`` settings. ``MARIA_PORT`` is honoured when it is set; otherwise
    the driver's default port is used.

    Returns:
        An open ``pymysql`` connection. The caller is responsible for closing it.

    Raises:
        ValueError: if ``MARIA_PORT`` is set but is not an integer string.
    """
    connect_kwargs = dict(
        host=MARIA_IP,
        user=MARIA_USER,
        password=MARIA_PASSWORD,
        db=MARIA_NAME,
        charset='utf8',
    )
    # The port arrives as a string from the environment; pymysql expects an int.
    if MARIA_PORT:
        connect_kwargs['port'] = int(MARIA_PORT)
    return pymysql.connect(**connect_kwargs)

def execute_query(query):
    """Run ``query`` against MariaDB and return the result as a DataFrame.

    A fresh connection is opened for the call and always closed afterwards,
    including when the query raises.

    Args:
        query: SQL text passed to ``pd.read_sql``.

    Returns:
        pandas.DataFrame holding the query result.
    """
    conn = connect_maria_db()
    try:
        return pd.read_sql(query, conn)
    finally:
        # Release the connection even if read_sql fails.
        conn.close()

# Fetch the `content` column of every row in the press_content table.
query = """SELECT content FROM press_content"""
df = execute_query(query)

  df = pd.read_sql(query, conn)


In [2]:
# Check type
import json

# Collect every distinct element "type" that appears in the content JSON.
types = set()
for raw_content in df['content']:
    try:
        # strict=False matches how extract_text parses the same column, so
        # rows containing raw control characters are inspected here too.
        ele_list = json.loads(raw_content, strict=False)
    except (TypeError, ValueError):
        # Not a string (e.g. NULL) or not valid JSON: nothing to inspect.
        continue
    for ele in ele_list:
        types.add(ele['type'])

print(types)

{'table', 'pdf', 'txt', 'image'}


# Pre-processing

- Text 추출
- Sentence 분리
- 특수 문자 제거
- 5어절 초과 문장만 사용 (공백 기준)

In [12]:
import re
import nltk
import json


# nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def extract_text(element):
    """Return the concatenated ``data`` of all text-like items in a JSON document.

    ``element`` is a JSON string encoding a list of objects, each with a
    ``type`` and a ``data`` key. Items typed "image", "pdf" or "table" are
    skipped; the ``data`` of every other item is appended in order with no
    separator. Parsing uses ``strict=False`` so raw control characters inside
    JSON strings are accepted.
    """
    skipped_types = ("image", "pdf", "table")
    items = json.loads(element, strict=False)
    return "".join(
        item['data'] for item in items if item['type'] not in skipped_types
    )

def preprocess(element):
    """Normalize whitespace in ``element``.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Special characters are deliberately left untouched: if they were removed
    here, a model trained on this text could never generate them.

    Args:
        element: text to normalize.

    Returns:
        The normalized string.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s+", " ", element).strip()

def postprocess(elements):
    """Filter out sentences that are too short or contain no real text.

    A sentence is kept only when both conditions hold:
      * it has more than 5 space-separated words (``split(" ")`` pieces), and
      * it contains at least one Latin letter, digit, Hangul, kana or CJK
        character, i.e. it is not made of special characters only.

    Args:
        elements: iterable of sentence strings.

    Returns:
        A new list with the sentences that pass both checks, in input order.
    """
    # The Hangul syllable block runs from 가 (U+AC00) to 힣 (U+D7A3); ending the
    # range earlier would miss syllables such as 히 and 힘.
    has_text_char = re.compile(
        '[a-zA-Z0-9가-힣ㄱ-ㅎㅏ-ㅣぁ-ゔァ-ヴー々〆〤一-龥]'
    ).search
    return [
        element
        for element in elements
        if len(element.split(" ")) > 5 and has_text_char(element)
    ]


# Turn each raw JSON document into a list of cleaned sentences:
# text extraction -> whitespace normalization -> sentence split -> filtering.
df['content'] = (
    df['content']
    .map(extract_text)
    .map(preprocess)
    .map(sent_tokenize)
    .map(postprocess)
)


# Largest and smallest number of sentences held by a single document.
max_seq_len = max((len(doc) for doc in df['content']), default=0)
min_seq_len = min((len(doc) for doc in df['content']), default=float("inf"))
print("Max Length of Documents :", max_seq_len)
print("Min Length of Documents :", min_seq_len)

Max Length of Documents : 3983
Min Length of Documents : 0


In [13]:
# Drop documents that have no sentences left (and any rows holding NA).
print("Before : ", len(df))
df = df[df['content'].str.len() != 0].dropna()
print("After : ", len(df))


Before :  632515
After :  629285


In [14]:
# Longest and shortest sentence, in characters, across all documents.
flag = False  # not read anywhere in the visible cells; kept unchanged
max_len = max((len(sen) for doc in df['content'] for sen in doc), default=0)
min_len = min(
    (len(sen) for doc in df['content'] for sen in doc), default=float("inf")
)

print("가장 긴 문장 : ", max_len)
print("가장 짧은 문장 : ", min_len)

가장 긴 문장 :  158323
가장 짧은 문장 :  11


# Create Train Data for Tokenizer

In [15]:
# Write every sentence of every document on its own line.
with open('../tokenizer/train.txt', 'w', encoding='utf-8') as f:
    for sentence_list in df['content']:
        f.writelines(f"{sentence.strip()}\n" for sentence in sentence_list)

# Create Pretrain-Data for BERT

### prerequisite

- Train the tokenizer: `python train_tokenizer.py` (takes about 10 minutes)

In [16]:
# Number of documents currently held in df.
print("Number of Documents : ", len(df))

Number of Documents :  629285


In [18]:
from transformers import BertTokenizer

# Load the tokenizer files stored in ../tokenizer
# (presumably produced by train_tokenizer.py; confirm).
tokenizer = BertTokenizer.from_pretrained("../tokenizer")

# Sanity check: vocabulary size and the special-token mapping.
print("Vocab size : ", tokenizer.vocab_size)
print("Special tokens", tokenizer.special_tokens_map)

Vocab size :  32000
Special tokens {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}


## Split Data

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the documents for validation; random_state is fixed so the
# split is the same on every run.
train_df, valid_df = train_test_split(df, test_size=0.1, random_state=42)

print(f"Number of train data : {len(train_df)}")
print(f"Number of valid data : {len(valid_df)}")

Number of train data : 566356
Number of valid data : 62929


In [20]:
import random
import tqdm

# Seed Python's RNG, which the instance-building functions below draw from.
random.seed(42)

# Maximum tokens per instance, including [CLS] and the two [SEP] tokens.
max_len = 512
# Fraction of tokens selected for masking. A selected token becomes
# [MASK] 80% of the time, a random vocabulary word 10%, and stays as-is 10%.
mlm_prob = 0.15 # -> 80% : [MASK], 10% : random word, 10% : keep same

def create_pretrain_instances(docs, doc_idx, doc, max_len, mlm_prob, vocab_list):
    """Build masked-LM / next-sentence instances from one tokenized document.

    Sentences of ``doc`` are accumulated into a chunk until the chunk holds at
    least ``max_len - 3`` tokens or the document ends. Each chunk is split at
    a random sentence boundary: the first part becomes segment A. Segment B is
    either the rest of the chunk (``is_next = 1``) or, with probability 0.5 and
    always for single-sentence chunks, the tail of a randomly chosen document
    (``is_next = 0``). The pair is trimmed to fit, wrapped as
    ``[CLS] A [SEP] B [SEP]`` and masked by ``create_pretrain_mask``.

    Args:
        docs: object where ``docs.iloc[k]['content']`` gives the tokenized
            sentences of document ``k`` and ``len(docs)`` the document count.
        doc_idx: position of ``doc`` inside ``docs``, used to avoid drawing
            the same document as the random one.
        doc: list of sentences, each a list of token strings.
        max_len: maximum instance length including the three special tokens.
        mlm_prob: fraction of the non-special tokens to mask.
        vocab_list: tokens that may be used as random replacements.

    Returns:
        A list of dicts with keys ``input_tokens``, ``segment``, ``is_next``,
        ``mask_idx`` and ``mask_label``.
    """
    # Reserve room for [CLS] tokens_a [SEP] tokens_b [SEP].
    max_seq = max_len - 3
    last_idx = len(doc) - 1

    instances = []
    cur_chunk = []
    cur_length = 0

    for i, sentence in enumerate(doc):
        cur_chunk.append(sentence)
        cur_length += len(sentence)

        # Keep accumulating until the chunk is full or the document ends.
        if i != last_idx and cur_length < max_seq:
            continue

        # Number of leading sentences that go into segment A (at least one).
        a_end = 1
        if 1 < len(cur_chunk):
            a_end = random.randrange(1, len(cur_chunk))

        tokens_a = []
        for j in range(a_end):
            tokens_a.extend(cur_chunk[j])

        tokens_b = []
        if len(cur_chunk) == 1 or random.random() < 0.5:
            # Negative example: segment B comes from another document.
            is_next = 0
            random_doc_idx = doc_idx
            # Bounded retry so a one-document corpus cannot spin forever;
            # after 10 collisions the current document is used.
            for _ in range(10):
                random_doc_idx = random.randrange(0, len(docs))
                if random_doc_idx != doc_idx:
                    break

            random_doc = docs.iloc[random_doc_idx]['content']

            random_start = random.randrange(0, len(random_doc))
            for j in range(random_start, len(random_doc)):
                tokens_b.extend(random_doc[j])
        else:
            # Positive example: segment B is the remainder of the chunk.
            is_next = 1
            for j in range(a_end, len(cur_chunk)):
                tokens_b.extend(cur_chunk[j])

        trim_tokens(tokens_a, tokens_b, max_seq)
        # Both segments must be non-empty after trimming.
        assert 0 < len(tokens_a)
        assert 0 < len(tokens_b)

        tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
        # Segment id 0 covers [CLS] A [SEP]; id 1 covers B [SEP].
        segment = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
        tokens, mask_idx, mask_label = create_pretrain_mask(
            tokens, int((len(tokens) - 3) * mlm_prob), vocab_list
        )
        instances.append({
            "input_tokens": tokens,
            "segment": segment,
            "is_next": is_next,
            "mask_idx": mask_idx,
            "mask_label": mask_label,
        })

        cur_chunk = []
        cur_length = 0

    return instances


def trim_tokens(tokens_a, tokens_b, max_seq):
    """Shrink two token lists in place until they hold at most ``max_seq`` tokens.

    Tokens are removed one at a time from whichever list is currently longer:
    from the FRONT of ``tokens_a`` when it is strictly longer, otherwise from
    the END of ``tokens_b``. The removal counts are worked out on integers
    first and applied with one slice deletion per list, which avoids shifting
    the whole list for every removed token.

    Args:
        tokens_a: first token list, modified in place.
        tokens_b: second token list, modified in place.
        max_seq: maximum allowed combined length; must not be negative.

    Raises:
        ValueError: if ``max_seq`` is negative (no amount of trimming can fit).
    """
    if max_seq < 0:
        raise ValueError("max_seq must be non-negative")

    len_a, len_b = len(tokens_a), len(tokens_b)
    drop_a = drop_b = 0
    while (len_a - drop_a) + (len_b - drop_b) > max_seq:
        if len_a - drop_a > len_b - drop_b:
            drop_a += 1
        else:
            drop_b += 1

    if drop_a:
        del tokens_a[:drop_a]
    if drop_b:
        del tokens_b[len_b - drop_b:]


def create_pretrain_mask(tokens, mask_cnt, vocab_list):
    """Apply whole-word masking to ``tokens`` in place.

    Tokens other than "[CLS]" and "[SEP]" are grouped into words: a token that
    starts with "##" joins the preceding group. Groups are shuffled and taken
    in that order until ``mask_cnt`` positions are selected; a group that would
    push the total past ``mask_cnt`` is skipped. Each selected position is
    replaced by "[MASK]" (80%), left unchanged (10%) or replaced by a random
    entry of ``vocab_list`` (10%).

    Returns:
        ``(tokens, mask_idx, mask_label)``: the same (mutated) token list, the
        selected positions in ascending order, and the original tokens at
        those positions.
    """
    special_tokens = ("[CLS]", "[SEP]")

    # Group token positions by word.
    word_groups = []
    for position, token in enumerate(tokens):
        if token in special_tokens:
            continue
        if word_groups and token.startswith("##"):
            word_groups[-1].append(position)
        else:
            word_groups.append([position])

    random.shuffle(word_groups)

    selected = []  # (position, original token) pairs
    for group in word_groups:
        if len(selected) >= mask_cnt:
            break
        if len(selected) + len(group) > mask_cnt:
            continue

        for position in group:
            if random.random() < 0.8:
                replacement = "[MASK]"
            elif random.random() < 0.5:
                replacement = tokens[position]
            else:
                replacement = random.choice(vocab_list)

            selected.append((position, tokens[position]))
            tokens[position] = replacement

    selected.sort(key=lambda pair: pair[0])
    mask_idx = [position for position, _ in selected]
    mask_label = [label for _, label in selected]

    return tokens, mask_idx, mask_label


def main(tokenizer, df, output_path, max_len, mlm_prob, count=10):
    """Tokenize every document and write ``count`` files of pre-training instances.

    Each pass over the corpus draws fresh random sentence pairs and masks, so
    the ``count`` output files differ from each other. One JSON object is
    written per line.

    The tokenized corpus is kept in a new frame; ``df`` itself is not
    modified. (Writing back with ``df.iloc[i]['content'] = ...`` is chained
    assignment, which pandas does not guarantee to update ``df``.)

    Args:
        tokenizer: object with a ``vocab`` iterable containing "[UNK]" and a
            ``tokenize(str)`` method returning a list of tokens.
        df: DataFrame whose ``content`` column holds a list of sentence
            strings per document.
        output_path: format string with one ``{}`` placeholder for the pass
            index, e.g. ``"pretrain_data_{}.json"``.
        max_len: maximum instance length including special tokens.
        mlm_prob: fraction of tokens to mask.
        count: number of passes, and therefore output files, to produce.
    """
    # Random replacement candidates: the whole vocabulary except [UNK].
    vocab_list = list(tokenizer.vocab)
    vocab_list.remove("[UNK]")

    tokenized_docs = [
        [tokenizer.tokenize(sentence) for sentence in sentences]
        for sentences in df['content']
    ]
    # create_pretrain_instances reads documents via docs.iloc[k]['content'].
    docs = pd.DataFrame({'content': pd.Series(tokenized_docs, dtype=object)})

    for index in range(count):
        output = output_path.format(index)
        # ensure_ascii=False emits raw non-ASCII text, so pin the encoding
        # instead of relying on the platform default.
        with open(output, 'w', encoding='utf-8') as out_file:
            with tqdm.tqdm(total=len(docs), desc="masking") as pbar:
                for i, doc in enumerate(docs['content']):
                    instances = create_pretrain_instances(
                        docs,
                        i,
                        doc,
                        max_len,
                        mlm_prob,
                        vocab_list,
                    )

                    for instance in instances:
                        out_file.write(json.dumps(instance, ensure_ascii=False))
                        out_file.write('\n')
                    pbar.update(1)

# Output file name templates; main() fills {} with the pass index.
train_output_path = "pretrain_data_{}.json"
valid_output_path = "valid_data_{}.json"
# Training data uses main()'s default count (10 passes, one file each);
# validation data is generated in a single pass.
main(tokenizer, train_df, train_output_path, max_len, mlm_prob)
main(tokenizer, valid_df, valid_output_path, max_len, mlm_prob, count=1)

masking: 100%|██████████| 566356/566356 [11:50<00:00, 797.09it/s]
masking: 100%|██████████| 566356/566356 [11:52<00:00, 794.77it/s]
masking: 100%|██████████| 566356/566356 [11:51<00:00, 795.47it/s]
masking: 100%|██████████| 566356/566356 [11:48<00:00, 799.84it/s]
masking: 100%|██████████| 566356/566356 [11:51<00:00, 796.50it/s]
masking: 100%|██████████| 566356/566356 [11:50<00:00, 797.00it/s]
masking: 100%|██████████| 566356/566356 [11:48<00:00, 799.55it/s]
masking: 100%|██████████| 566356/566356 [11:49<00:00, 798.40it/s]
masking: 100%|██████████| 566356/566356 [11:50<00:00, 797.04it/s]
masking: 100%|██████████| 566356/566356 [11:51<00:00, 796.07it/s]
masking: 100%|██████████| 62929/62929 [01:19<00:00, 793.72it/s]
