In [1]:
from sqlalchemy import Column, Integer, String, create_engine, ForeignKey
from sqlalchemy.ext.declarative import declarative_base

In [2]:
from sqlalchemy.orm import sessionmaker

In [3]:
engine = create_engine('sqlite:///corpus.db', echo=True)

In [4]:
base = declarative_base()

In [5]:
class SejongCorpus(base):
    __tablename__ = 'SEJONG_CORPUS'
    
    pk = Column('pk', Integer, primary_key=True, autoincrement=True)
    ko_text = Column('ko_text', String, nullable=True)
    thai_text = Column('thai_text', String, nullable=True)
    file_name = Column('file_name', String, nullable=True)
    sent_count = Column('sent_count', Integer, nullable=True)
    cumsum_count = Column('cumsum_count', Integer, nullable=True)
    
    def __repr__(self):
        return 'pk:{}, ko_text:{}, thai_text:{}'.format(self.pk, self.ko_text, self.thai_text)

In [6]:
Session = sessionmaker(bind=engine)
session = Session()

In [8]:
import re

In [51]:
unicode = [r[0] for r in session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all() if re.match(".*[\u2e80-\u2eff\u31c0-\u31ef\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fbf\uf900-\ufaff].*", r[0]) is not None]

2020-08-18 23:03:00,963 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-18 23:03:00,965 INFO sqlalchemy.engine.base.Engine ()


In [29]:
ko_corpus = [r[0] for r in session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all() if re.match(".*[\u2e80-\u2eff\u31c0-\u31ef\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fbf\uf900-\ufaff].*", r[0]) is None and '.' in r[0]]
thai_corpus = [r[1]+' ' for r in session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all() if re.match(".*[\u2e80-\u2eff\u31c0-\u31ef\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fbf\uf900-\ufaff].*", r[0]) is None and '.' in r[0]]

2020-08-24 19:02:10,617 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-24 19:02:10,630 INFO sqlalchemy.engine.base.Engine ()
2020-08-24 19:02:14,599 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-24 19:02:14,599 INFO sqlalchemy.engine.base.Engine ()


In [30]:
len(ko_corpus), len(thai_corpus)

(438771, 438771)

In [25]:
from sklearn.model_selection import train_test_split

In [32]:
corpus_train_ko, corpus_test_ko = train_test_split(ko_corpus, test_size=0.2) # train / test 
corpus_train_ko, corpus_valid_ko = train_test_split(corpus_train_ko, test_size=0.1) # train / validation
corpus_train_th, corpus_test_th = train_test_split(thai_corpus, test_size=0.2) # train /test 
corpus_train_th, corpus_valid_th = train_test_split(corpus_train_th, test_size=0.1) # train / validation

In [33]:
len(corpus_train_ko), len(corpus_valid_ko), len(corpus_test_ko)

(315914, 35102, 87755)

In [36]:
with open('corpus.train.ko', 'w+', encoding='utf-8') as f:
    for ko in ko_corpus:
        f.write(ko+'|')
        
with open('corpus.train.th', 'w+', encoding='utf-8') as f:
    for th in thai_corpus:
        f.write(th+'|')

In [34]:
with open('corpus.train.ko', 'w+', encoding='utf-8') as f:
    for ko in corpus_train_ko:
        f.write(ko+'|')
        
with open('corpus.valid.ko', 'w+', encoding='utf-8') as f:
    for ko in corpus_valid_ko:
        f.write(ko+'|')
        
with open('corpus.test.ko', 'w+', encoding='utf-8') as f:
    for ko in corpus_test_ko:
        f.write(ko+'|')
        
with open('corpus.train.th', 'w+', encoding='utf-8') as f:
    for th in corpus_train_th:
        f.write(th+'|')
        
with open('corpus.valid.th', 'w+', encoding='utf-8') as f:
    for th in corpus_valid_th:
        f.write(th+'|')
        
with open('corpus.test.th', 'w+', encoding='utf-8') as f:
    for th in corpus_test_th:
        f.write(th+'|')