In [4]:
from io import open
import unicodedata
import string
import re
import random

In [5]:
# 유니 코드 문자열을 일반 ASCII로 변환하십시오.
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# 소문자, 다듬기, 그리고 문자가 아닌 문자 제거
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [15]:
lines = open('data/data.txt', encoding='utf-8').\
    read().strip().split('\n')

# 모든 줄을 쌍으로 분리하고 정규화
pairs = [[normalizeString(s) for s in l.split('\t')][0] for l in lines]
pairs = [[s, s] for s in pairs]

In [16]:
pairs

[['go .', 'go .'],
 ['run !', 'run !'],
 ['run !', 'run !'],
 ['wow !', 'wow !'],
 ['fire !', 'fire !'],
 ['help !', 'help !'],
 ['jump .', 'jump .'],
 ['stop !', 'stop !'],
 ['stop !', 'stop !'],
 ['stop !', 'stop !'],
 ['wait !', 'wait !'],
 ['wait !', 'wait !'],
 ['i see .', 'i see .'],
 ['i try .', 'i try .'],
 ['i won !', 'i won !'],
 ['i won !', 'i won !'],
 ['oh no !', 'oh no !'],
 ['attack !', 'attack !'],
 ['attack !', 'attack !'],
 ['cheers !', 'cheers !'],
 ['cheers !', 'cheers !'],
 ['cheers !', 'cheers !'],
 ['get up .', 'get up .'],
 ['got it !', 'got it !'],
 ['got it !', 'got it !'],
 ['got it ?', 'got it ?'],
 ['got it ?', 'got it ?'],
 ['got it ?', 'got it ?'],
 ['hop in .', 'hop in .'],
 ['hop in .', 'hop in .'],
 ['hug me .', 'hug me .'],
 ['hug me .', 'hug me .'],
 ['i fell .', 'i fell .'],
 ['i fell .', 'i fell .'],
 ['i know .', 'i know .'],
 ['i left .', 'i left .'],
 ['i left .', 'i left .'],
 ['i lost .', 'i lost .'],
 ['i m .', 'i m .'],
 ['i m ok .', 'i m ok

In [3]:
from io import open
import unicodedata
import string
import re
import random

SOS_token = 0
EOS_token = 1

MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)


class Lang:
    def __init__(self):
        #self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # SOS 와 EOS 포함

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1


# 유니 코드 문자열을 일반 ASCII로 변환하십시오.
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# 소문자, 다듬기, 그리고 문자가 아닌 문자 제거
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def readLangs():
    print("Reading lines...")

    # 파일을 읽고 줄로 분리
    lines = open('data/data.txt', encoding='utf-8').\
        read().strip().split('\n')

    # 모든 줄을 쌍으로 분리하고 정규화
    pairs = [[normalizeString(s) for s in l.split('\t')][0] for l in lines]
    pairs = [[s, s] for s in pairs]

    lang = Lang()

    return lang, pairs


def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and p[0].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]


In [None]:
def prepareData():
    lang, pairs = readLangs()
    pairs = filterPairs(pairs)

    for pair in pairs:
        lang.addSentence(pair[0])
    

    half_sent = int(len(pairs) / 2)

    grouped = [[pairs[2*i][0], pairs[2*i+1][0]] for i in range((half_sent))]
    grouped = [[s, s] for s in grouped]

    return lang, pairs, grouped

In [2]:
lang, pairs, grouped = prepareData()

Reading lines...
Read 135842 sentence pairs
Trimmed to 11484 sentence pairs
Counting words...
Counted words:
2978


# dataloader coder

In [208]:
import torch


class Vocab():
    def __init__(self, embed, word2id):
        self.embed = embed
        self.word2id = word2id
        self.id2word = {v: k for k, v in word2id.items()}
        assert len(self.word2id) == len(self.id2word)
        self.PAD_IDX = 0
        self.UNK_IDX = 1
        self.PAD_TOKEN = 'PAD_TOKEN'
        self.UNK_TOKEN = 'UNK_TOKEN'

    def __len__(self):
        return len(word2id)

    def i2w(self, idx):
        return self.id2word[idx]

    def w2i(self, w):
        if w in self.word2id:
            return self.word2id[w]
        else:
            return self.UNK_IDX

    def make_features(self, batch, sent_trunc=50, doc_trunc=100, split_token='\n'):
        sents_list, targets, doc_lens = [], [], []
        # trunc document
        for doc, label in zip(batch['doc']):
            sents = doc.split(split_token)
            max_sent_num = min(doc_trunc, len(sents))
            sents = sents[:max_sent_num]
            sents_list += sents
            targets += sents
            doc_lens.append(len(sents))

            
        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = sent.split()
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)

            
        features = []
        for sent in batch_sents:
            feature = [self.w2i(w) for w in sent] + [self.PAD_IDX for _ in range(max_sent_len - len(sent))]
            features.append(feature)

        features = torch.LongTensor(features)
        targets = torch.LongTensor(features)

        return features, targets, summaries, doc_lens

    def make_predict_features(self, batch, sent_trunc=150, doc_trunc=100, split_token='. '):
        sents_list, doc_lens = [], []
        for doc in batch:
            sents = doc.split(split_token)
            max_sent_num = min(doc_trunc, len(sents))
            sents = sents[:max_sent_num]
            sents_list += sents
            doc_lens.append(len(sents))
        # trunc or pad sent
        max_sent_len = 0
        batch_sents = []
        for sent in sents_list:
            words = sent.split()
            if len(words) > sent_trunc:
                words = words[:sent_trunc]
            max_sent_len = len(words) if len(words) > max_sent_len else max_sent_len
            batch_sents.append(words)

        features = []
        for sent in batch_sents:
            feature = [self.w2i(w) for w in sent] + [self.PAD_IDX for _ in range(max_sent_len - len(sent))]
            features.append(feature)

        features = torch.LongTensor(features)

        return features, doc_lens

In [210]:
import csv
import torch
import torch.utils.data as data
from torch.autograd import Variable
import numpy as np

class Dataset(data.Dataset):
    def __init__(self, examples):
        super(Dataset,self).__init__()
        # data: {'sents':xxxx,'labels':'xxxx', 'summaries':[1,0]}
        self.examples = examples 
        self.training = False
    def train(self):
        self.training = True
        return self
    def test(self):
        self.training = False
        return self
    def shuffle(self,words):
        np.random.shuffle(words)
        return ' '.join(words)
    def dropout(self,words,p=0.3):
        l = len(words)
        drop_index = np.random.choice(l,int(l*p))
        keep_words = [words[i] for i in range(l) if i not in drop_index]
        return ' '.join(keep_words)
    def __getitem__(self, idx):
        ex = self.examples[idx]
        return ex
        
    def __len__(self):
        return len(self.examples)


In [214]:
from utils import Vocab, Dataset
import json

embed = torch.Tensor(np.load("./data/embedding.npz")['embedding'])
with open("./data/word2id.json") as f:
    word2id = json.load(f)
vocab = Vocab(embed, word2id)

with open("./data/train.json") as f:
    examples = [json.loads(line) for line in f]
train_dataset = Dataset(examples)

with open("./data/val.json") as f:
    examples = [json.loads(line) for line in f]
val_dataset = Dataset(examples)

with open("./data/test.json") as f:
    examples = [json.loads(line) for line in f]
test_dataset = Dataset(examples)

In [172]:
from torch.utils.data import DataLoader

batch_size = 32
# load dataset
train_iter = DataLoader(dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True)
val_iter = DataLoader(dataset=val_dataset,
        batch_size=batch_size,
        shuffle=False)

# train-test 분할

In [175]:
with open("./data/train.json") as f:
    examples = [json.loads(line) for line in f]

In [176]:
examples[0]

{'doc': "by daily mail reporter last updated at 11:49 am on 5th october 2011 documents obtained under a federal freedom of information act lawsuit filed by judicial watch detail costs incurred during the first lady 's trip to africa and botswana in june - with the cost of firing up ' air force 2 ' alone amounting to $ 424,142\nthe white house earlier professed the purpose of the trip was to help ' youth leadership , education , health and wellness ' in africa\n' air force 2 ' : obama waves as she boards her private plane after a week - long trip to africa in june\ncosts of the flight are estimated to tally more than $ 424,000 greetings : mrs obama was met by excited children when she arrived to botswana on june 24 warm welcome : mrs obama with daughters sasha , far right , and malia , right , were greeted by traditional dancers as they arrive in gaborone , botswana judicial watch said it based the jet costs on the u.s. department of defense 's published hourly rates for the c-32a aircr

In [204]:
import json
from collections import OrderedDict

train_json = list()

for ex in examples:
    tmp = {}
    tmp["input"] = ex['doc']
    tmp["target"] = ex['doc']
    train_json.append(str(tmp).replace("'", '"')+"\n")

In [205]:
train_json[:100]

['{"input": "bolton boss lennon described referee haywood as " nothing short of appalling " after losing young hotshot zach clough and two other players to possible long - term injuries in the first half of tuesday night "s 1 - 1 draw against reading\\nlennon recoiled in disgust as kevin mcnaughton was carried of on a stretcher after being kicked in the calf by jordan obita , david wheater limped off with a hamstring injury and then zach clough suffered a suspected dislocated shoulder after falling awkwardly on the ground following a challenge by nathaniel chalobah\\nthe diminuitive zach clough , who turns 20 on sunday , has been one of the bright spots for bolton this season and had alread notched six goals in 2015\\nzach clough suffered a suspected dislocated shoulder after falling awkwardly on the ground after a challenge david wheater limped off with a hamstring injury during the clash between bolton and reading kevin mcnaughton was carried of on a stretcher after being kicked in t

In [206]:
with open('data/ae_train.json', 'w') as f:
    f.writelines(train_json)

In [200]:
import json
from collections import OrderedDict

with open("./data/val.json") as f:
    examples = [json.loads(line) for line in f]
    
val_json = list()

for ex in examples:
    tmp = OrderedDict()
    tmp['input'] = ex['doc']
    tmp['target'] = ex['doc']
    val_json.append(str(tmp)+"\n")
    
with open('data/ae_val.json', 'w') as f:
    f.writelines(val_json)

In [155]:
lines_train = [i+"\t"+i+"\n" for i in lines_train]
lines_test = [i+"\t"+i+"\n" for i in lines_test]

In [156]:
with open('data/train.txt', 'w') as f:
    f.writelines(lines_train)
with open('data/test.txt', 'w') as f:
    f.writelines(lines_test)
 
