In [2]:
import pandas as pd

df = pd.read_csv("spanish_emojis.csv")

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,emojis,observations
0,300,😭,? en serio han cancelado tambien quantico
1,301,😂,si on mo
2,302,😝,se duerme relooo
3,303,🤗,ahi te podes dar cuenta que diferentes somos y...
4,304,😪,625 pero la tipi esta jugando desde la 5


In [8]:
observations = df['observations'].values
len(observations)

151743

In [9]:
observations[0]

'? en serio han cancelado tambien quantico'

In [18]:
sentences = []
for sentence in observations:
    sentences.append(sentence.split())

In [68]:
flatten_word_simbols = [word_simbol for sublist in sentences for word_simbol in sublist]

len(flatten_word_simbols)

1427429

In [121]:
import collections

cnt = collections.Counter(flatten_word_simbols)
print("Vocab size = ", len(cnt))

Vocab size =  66967


In [122]:
vocab_keys = cnt.most_common()

In [123]:
vocab_keys[-1]

('sisieeeeeerto', 1)

In [124]:
stoi = {}

i = 0
for word_simbol, count in vocab_keys:
    stoi[word_simbol] = i
    i+=1
    
stoi['_end_'  ] = i
stoi['_blank_'] = i + 1

In [127]:
stoi['que'],stoi['ja'], stoi['.'],stoi['_end_'], stoi['_blank_']

(0, 1025, 22484, 66967, 66968)

In [128]:
print("Max key-value \t= ", len(stoi.keys())-1)

VOCAB_SIZE = len(stoi.keys())
print("Vocab Size \t= ", VOCAB_SIZE)

Max key-value 	=  66968
Vocab Size 	=  66969


In [129]:
itos = {}

for s, i in stoi.items():
    itos[i] = s

In [131]:
assert ('que'     == itos[stoi['que'    ]])
assert ('ja'      == itos[stoi['ja'     ]])
assert ('.'       == itos[stoi['.'      ]])
assert ('_end_'   == itos[stoi['_end_'  ]])
assert ('_blank_' == itos[stoi['_blank_']])

In [132]:
x_input = []
ls      = []

for sentence in sentences:
    i_sentence = []
    for word_simbol in sentence:
        i_sentence.append(stoi[word_simbol])
    i_sentence.append(stoi['_end_'])
    x_input.append(i_sentence)
    ls.append(len(i_sentence))

In [133]:
MAXLEN = max(ls)
print('MAXLEN :',MAXLEN)

MAXLEN : 50


In [136]:
blank_value = stoi['_blank_']

print('Blank Value = ', blank_value)

Blank Value =  66968


In [137]:
from keras.preprocessing.sequence import pad_sequences

x_train = pad_sequences(x_input, maxlen = MAXLEN, value=blank_value)

print('Shape of data train tensor:', x_train.shape)

Shape of data train tensor: (151743, 50)


In [172]:
x_train_seq  = []
y_train_next = []

SEQ_LEN      = 3

for x_input_obs in x_train:
    for i in range(MAXLEN - SEQ_LEN):
        if x_input_obs[i] != blank_value:
            x_train_seq.append(x_input_obs[i:i+SEQ_LEN])
            y_train_next.append(x_input_obs[i+SEQ_LEN])

In [209]:
import tensorflow as tf
import keras 

from keras import backend as K

print(K.tensorflow_backend._get_available_gpus())


config = tf.ConfigProto(intra_op_parallelism_threads = 4,
                        allow_soft_placement = True, 
                        device_count = {'CPU' : 1, 'GPU' : 0})

sess = tf.Session(config=config)

K.set_session(sess)

['/job:localhost/replica:0/task:0/device:GPU:0']


In [210]:
from keras.utils import to_categorical

from keras.layers import LSTM, Input, Bidirectional
from keras.models import Model

In [211]:
seq_input = Input(shape=(VOCAB_SIZE, SEQ_LEN))

#rnn       = Bidirectional(LSTM(SEQ_LEN, activation="relu"))(seq_input)
rnn       = LSTM(SEQ_LEN, activation="relu")(seq_input)
rnn       = Dropout(0.6)(rnn)
rnn       = Dense(VOCAB_SIZE)(rnn)
output    = Activation('softmax')(rnn)

model     = Model(inputs=[seq_input], outputs=[output])

model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 66969, 3)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 3)                 84        
_________________________________________________________________
dropout_6 (Dropout)          (None, 3)                 0         
_________________________________________________________________
dense_6 (Dense)              (None, 66969)             267876    
_________________________________________________________________
activation_5 (Activation)    (None, 66969)             0         
Total params: 267,960
Trainable params: 267,960
Non-trainable params: 0
_________________________________________________________________


In [None]:
from keras.callbacks import ModelCheckpoint

import numpy as np

BATCH_SIZE = 100

filepath   ="model/textG_best_weights_SIF.{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')

for ibatch in range(0, len(x_train_seq), BATCH_SIZE):
    
    i_begin = ibatch
    i_end   = min(ibatch + BATCH_SIZE, len(x_train_seq))
    
    print('BATCH', i_begin, i_end)
    x_train_one_hot = to_categorical( x_train_seq[i_begin:i_end], num_classes = VOCAB_SIZE)
    y_train_one_hot = to_categorical(y_train_next[i_begin:i_end], num_classes = VOCAB_SIZE)
    x_train_one_hot = np.reshape(x_train_one_hot, (x_train_one_hot.shape[0],VOCAB_SIZE, SEQ_LEN))
    #model.fit(x_train_one_hot, y_train_one_hot, batch_size=64, epochs=1, callbacks = [checkpoint])   
    model.fit(x_train_one_hot, y_train_one_hot, batch_size=64, epochs=1)    

BATCH 0 100
Epoch 1/1
BATCH 100 200
Epoch 1/1
BATCH 200 300
Epoch 1/1
BATCH 300 400
Epoch 1/1
BATCH 400 500
Epoch 1/1
BATCH 500 600
Epoch 1/1
BATCH 600 700
Epoch 1/1
BATCH 700 800
Epoch 1/1
BATCH 800 900
Epoch 1/1
BATCH 900 1000
Epoch 1/1
BATCH 1000 1100
Epoch 1/1
