In [24]:
from sqlalchemy import Column, Integer, String, create_engine, ForeignKey
from sqlalchemy.ext.declarative import declarative_base

In [25]:
from sqlalchemy.orm import sessionmaker

In [26]:
engine = create_engine('sqlite:///corpus.db', echo=True)

In [27]:
base = declarative_base()

In [28]:
class SejongCorpus(base):
    __tablename__ = 'SEJONG_CORPUS'
    
    pk = Column('pk', Integer, primary_key=True, autoincrement=True)
    ko_text = Column('ko_text', String, nullable=True)
    thai_text = Column('thai_text', String, nullable=True)
    file_name = Column('file_name', String, nullable=True)
    sent_count = Column('sent_count', Integer, nullable=True)
    cumsum_count = Column('cumsum_count', Integer, nullable=True)
    
    def __repr__(self):
        return 'pk:{}, ko_text:{}, thai_text:{}'.format(self.pk, self.ko_text, self.thai_text)

In [29]:
Session = sessionmaker(bind=engine)
session = Session()

In [35]:
ko_corpus = [r[0] for r in session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all()]
thai_corpus = [r[1] for r in session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all()]

2020-08-10 12:28:55,155 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-10 12:28:55,166 INFO sqlalchemy.engine.base.Engine ()
2020-08-10 12:29:13,091 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-10 12:29:13,092 INFO sqlalchemy.engine.base.Engine ()


In [36]:
len(ko_corpus), len(thai_corpus)

(497391, 497391)

In [30]:
corpus_all = session.query(SejongCorpus.ko_text, SejongCorpus.thai_text).all()

2020-08-10 12:24:09,760 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2020-08-10 12:24:09,779 INFO sqlalchemy.engine.base.Engine ()
2020-08-10 12:24:09,786 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2020-08-10 12:24:09,788 INFO sqlalchemy.engine.base.Engine ()
2020-08-10 12:24:09,792 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2020-08-10 12:24:09,806 INFO sqlalchemy.engine.base.Engine SELECT "SEJONG_CORPUS".ko_text AS "SEJONG_CORPUS_ko_text", "SEJONG_CORPUS".thai_text AS "SEJONG_CORPUS_thai_text" 
FROM "SEJONG_CORPUS"
2020-08-10 12:24:09,810 INFO sqlalchemy.engine.base.Engine ()


In [31]:
with open('sejong.txt', 'w+', encoding='utf-8') as f:
    for corpus in corpus_all:
        f.write(corpus[0]+'\t'+corpus[1]+'\n')

In [32]:
with open('sejong.txt', 'r', encoding='utf-8') as f:
    a = f.read()

In [33]:
a_list = a.split('\n')

In [34]:
a_list[0]

'아이티의 임시 정부를 이끌게 될 대통령으로 13일 취임한 에르타 트루이요 여사는 남성 우위의 나라에서 수많은 여성 최초의 기록을 세운 강인한 인상의 존경받는 판사이다.\tนางเออร์ตาทรูโยซึ่งเข้ารับตำแหน่งเมื่อวันที่ 13 ในฐานะประธานาธิบดีซึ่งจะเป็นผู้นำรัฐบาลชั่วคราวของเฮติเป็นผู้พิพากษาที่ได้รับความเคารพและมีความประทับใจอย่างมากซึ่งสร้างสถิติเป็นครั้งแรกสำหรับผู้หญิงจำนวนมากในประเทศที่มีการปกครองแบบผู้ชาย'

In [37]:
with open('corpus.ko', 'w+', encoding='utf-8') as f:
    for ko in ko_corpus:
        f.write(ko+'|')

In [38]:
with open('corpus.th', 'w+', encoding='utf-8') as f2:
    for th in thai_corpus:
        f2.write(th+'|')