## 빅데이터 활용 AI 설계
# Seq2seq word level

In [1]:
import pandas as pd
import numpy as np
import string
from string import digits
import matplotlib.pyplot as plt
import re
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
lines= pd.read_table('mar.txt', names=['eng', 'mar'])

In [3]:
lines.head(2)

Unnamed: 0,eng,mar
0,Go.,जा.
1,Run!,पळ!


In [4]:
lines.shape

(35832, 2)

### 데이터 전처리

In [5]:
# Lowercase all characters
lines.eng=lines.eng.apply(lambda x: x.lower())
lines.mar=lines.mar.apply(lambda x: x.lower())

# Remove quotes
lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x))
lines.mar=lines.mar.apply(lambda x: re.sub("'", '', x))

exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
lines.eng=lines.eng.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
lines.mar=lines.mar.apply(lambda x: ''.join(ch for ch in x if ch not in exclude))

# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
lines.eng=lines.eng.apply(lambda x: x.translate(remove_digits))
lines.mar = lines.mar.apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
lines.eng=lines.eng.apply(lambda x: x.strip())
lines.mar=lines.mar.apply(lambda x: x.strip())
lines.eng=lines.eng.apply(lambda x: re.sub(" +", " ", x))
lines.mar=lines.mar.apply(lambda x: re.sub(" +", " ", x))

In [6]:
help(str.maketrans)

Help on built-in function maketrans:

maketrans(x, y=None, z=None, /)
    Return a translation table usable for str.translate().
    
    If there is only one argument, it must be a dictionary mapping Unicode
    ordinals (integers) or characters to Unicode ordinals, strings or None.
    Character keys will be then converted to ordinals.
    If there are two arguments, they must be strings of equal length, and
    in the resulting dictionary, each character in x will be mapped to the
    character at the same position in y. If there is a third argument, it
    must be a string, whose characters will be mapped to None in the result.



In [7]:
# Add start and end tokens to target sequences
lines.mar = lines.mar.apply(lambda x : 'START_ '+ x + ' _END')

In [8]:
lines.head()

Unnamed: 0,eng,mar
0,go,START_ जा _END
1,run,START_ पळ _END
2,run,START_ धाव _END
3,run,START_ पळा _END
4,run,START_ धावा _END


### 단어 사전

In [9]:
# Vocabulary of English
eng_words=set()
for eng in lines.eng:
    for word in eng.split():
        if word not in eng_words:
            eng_words.add(word)

# Vocabulary of Marathi 
mar_words=set() 
for mar in lines.mar:
    for word in mar.split():
        if word not in mar_words:
            mar_words.add(word)

In [10]:
eng_words = sorted(list(eng_words))
mar_words = sorted(list(mar_words))

len(eng_words), len(mar_words)

(5471, 13009)

In [11]:
eng_words[:10]

['a',
 'abbreviation',
 'ability',
 'able',
 'about',
 'above',
 'abroad',
 'absent',
 'absolute',
 'absolutely']

In [12]:
mar_words[:10]

['10', '100', '300', '3ds', 'START_', '_END', 'a', 'b', 'h₂o', 'ntt']

### 문장 길이

In [13]:
eng_seq_max = max([len(sen.split()) for sen in lines.eng])
mar_seq_max = max([len(sen.split()) for sen in lines.mar])

eng_seq_max, mar_seq_max

(34, 37)

### 단어 사전 만들기

In [14]:
eng_w2i = dict([(word, i+1) for i, word in enumerate(eng_words)]) # 1번 부터 매김 (0 은 뒷부분의 패팅)
mar_w2i = dict([(word, i+1) for i, word in enumerate(mar_words)]) # 1번 부터 매김 (0 은 뒷부분의 패팅)

eng_i2w = dict((i, word) for word, i in eng_w2i.items())
mar_i2w = dict((i, word) for word, i in mar_w2i.items())

### 데이터 섞기

In [15]:
lines = shuffle(lines)
lines.head()

Unnamed: 0,eng,mar
19353,i make too many mistakes,START_ मी खूपच चुका करतो _END
15429,where are my slippers,START_ माझे स्लिपर कुठे आहेत _END
4553,tom is an actor,START_ टॉम एक अभिनेता आहे _END
8241,dont do that here,START_ तसं इथे करू नकोस _END
29989,what are you going to do tomorrow,START_ तुम्ही उद्या काय करणार आहात _END


### 훈련/테스트 데이터 분리

In [16]:
X, y = lines.eng, lines.mar # 35832
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((32248,), (32248,), (3584,), (3584,))

### 배치 샘플 생성자
- 학습데이터(X_train, y_train) 에서 128 개씩 데이터를 가져온다

In [17]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    ''' Generate a batch of data '''
    while True:
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, eng_seq_max),dtype='float32') # (128,34)
            decoder_input_data = np.zeros((batch_size, mar_seq_max),dtype='float32') # (128,37)
            decoder_target_data = np.zeros((batch_size, mar_seq_max, len(mar_words)+1),dtype='float32')
                # (128,37,13010), 패딩값 0 이 들어가기 때문에 len(mar_words)+1 이다.
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = eng_w2i[word] # encoder input seq
                for t, word in enumerate(target_text.split()):
                    if t<len(target_text.split())-1: # 끝에 '_END' 를 제외
                        decoder_input_data[i, t] = mar_w2i[word] # decoder input seq
                    if t>0: # 앞부분 'START_' 제외
                        # decoder target sequence (one hot encoded)len(mar_words)+1
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, mar_w2i[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

### Encoder

In [18]:
word_vector_dim = 50

In [19]:
encoder_input = Input(shape=(None,))
x1 =  Embedding(len(eng_words)+1, word_vector_dim, mask_zero=True)(encoder_input)
    # mask_zero=True: padding 인 0 을 처리하지 않음
encoder_outputs, state_h, state_c = LSTM(word_vector_dim, return_state=True)(x1)
encoder_states = [state_h, state_c]

### Decoder

In [20]:
decoder_input = Input(shape=(None,))
x2 = Embedding(len(mar_words)+1, word_vector_dim, mask_zero=True)(decoder_input)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_outputs, _, _ = LSTM(word_vector_dim, return_sequences=True, return_state=True)(x2, initial_state=encoder_states)
decoder_outputs = Dense(len(mar_words)+1, activation='softmax')(decoder_outputs)

In [21]:
model = Model([encoder_input, decoder_input], decoder_outputs)

### 학습

In [22]:
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [45]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = 128),
                    steps_per_epoch = len(X_train)//128,
                    epochs=5,
                    validation_data = generate_batch(X_test, y_test, batch_size = 128),
                    validation_steps = len(X_test)//128)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x265805fd198>

In [24]:
#model.save_weights('eng_mar_weights.h5')
model.load_weights('eng_mar_weights.h5')

In [57]:
help(model.load_weights)

Help on method load_weights in module keras.engine.network:

load_weights(filepath, by_name=False, skip_mismatch=False, reshape=False) method of keras.engine.training.Model instance
    Loads all layer weights from a HDF5 save file.
    
    If `by_name` is False (default) weights are loaded
    based on the network's topology, meaning the architecture
    should be the same as when the weights were saved.
    Note that layers that don't have weights are not taken
    into account in the topological ordering, so adding or
    removing layers is fine as long as they don't have weights.
    
    If `by_name` is True, weights are loaded into layers
    only if they share the same name. This is useful
    for fine-tuning or transfer-learning models where
    some of the layers have changed.
    
    # Arguments
        filepath: String, path to the weights file to load.
        by_name: Boolean, whether to load weights by name
            or by topological order.
        skip_mismatch: Boo

In [28]:
eng_words[:10]

['a',
 'abbreviation',
 'ability',
 'able',
 'about',
 'above',
 'abroad',
 'absent',
 'absolute',
 'absolutely']

In [31]:
mar_words[:20]

['10',
 '100',
 '300',
 '3ds',
 'START_',
 '_END',
 'a',
 'b',
 'h₂o',
 'ntt',
 'tatoebaorg',
 'uk',
 'अ',
 'अँकरेजमार्गे',
 'अँजिलीस',
 'अँड',
 'अँडी',
 'अँडीजमधल्या',
 'अँब्युलन्स',
 'अंक']

In [44]:
s = 'i am tom'
l = [eng_w2i[w] for w in s.split()]
l = l + [0]*(34-len(l))

In [46]:
a = np.array([l])
a.shape

(1, 34)

In [51]:
b = np.zeros([1,37])
b[0,0] = mar_w2i['START_']
b

array([[5., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]])

In [61]:
r = mar_i2w[np.argmax(model.predict([a,b])[0,0])]
r

'मी'

In [62]:
b = np.zeros([1,37])
b[0,0] = mar_w2i['START_']
b[0,1] = mar_w2i['मी']
b

array([[5.000e+00, 9.187e+03, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00]])

In [63]:
r = mar_i2w[np.argmax(model.predict([a,b])[0,0])]
r

'मी'

In [64]:
b = np.zeros([1,37])
b[0,0] = mar_w2i['START_']
b[0,1] = mar_w2i['मी']
b[0,2] = mar_w2i['मी']

r = mar_i2w[np.argmax(model.predict([a,b])[0,0])]
r

'मी'