In [1]:
from flask import abort, Flask, jsonify, request
from symspellpy.symspellpy import SymSpell, Verbosity
from time import time
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import spacy
import pickle
import re

Using TensorFlow backend.


In [2]:
# Chat-style abbreviations expanded to full Spanish words (whole-word matches,
# applied in this order).
_ABBREVIATIONS = (
    (re.compile(r'\bq\b|\bk\b'), 'que'),   # q / k -> que
    (re.compile(r'\bd\b'), 'de'),          # d -> de
    (re.compile(r'\bx\b'), 'por'),         # x -> por
    (re.compile(r'\btmb\b'), 'también'),   # tmb -> también
)
# A run of the same character, for every character except c, l, n and r
# (those may legitimately appear doubled in Spanish: cc, ll, nn, rr).
_REPEATED_CHAR = re.compile(r'([^clnr])\1+')
# Any character repeated three or more times (after the step above only
# c, l, n and r can still form such runs).
_TRIPLED_CHAR = re.compile(r'(.)\1{2,}')
# Long laughter sequences such as "jajajaja", "jejeje...", "hahaha...".
_LAUGHTER = re.compile(r'([ja]{5,}|[je]{5,}|[ji]{5,}|[ha]{5,}|[he]{5,})')
# Symbols that may glue two words together; they become spaces, not nothing.
_WORD_SEPARATORS = re.compile(r'(\.|,|:|;|!|\?|\[|\]|\(|\))')
_DIGITS = re.compile(r'\d+')
# Raw string so that the backslash in "[\]" is not an invalid escape sequence.
_PUNCTUATION = re.compile(
    r'[%s]' % re.escape(r"""¿¡!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~…""")
)
# A lone character between word boundaries, except a, e, y, o, u
# (the single-letter Spanish words).
_SINGLE_CHAR = re.compile(r'\b[^aeyou]\b')
_WHITESPACE = re.compile(r'\s+')


def clean_text(text):
    """Normalize a raw Spanish message into the lemmatized form fed to the tokenizer.

    Steps, in order: lowercase, expand chat abbreviations, collapse repeated
    characters, normalize laughter to "jaja", strip symbols/digits/punctuation
    and stray single characters, collapse whitespace, drop characters that are
    not representable in Latin-1, spell-correct with the module-level
    ``sym_spell`` instance and lemmatize with the module-level spaCy
    pipeline ``nlp``.

    Args:
        text: the raw input string.

    Returns:
        A one-element list holding the cleaned, lemmatized text (tokens joined
        by single spaces), i.e. the shape ``texts_to_sequences`` is called with.

    Note:
        Relies on the globals ``sym_spell`` and ``nlp`` being initialised
        before the first call.
    """
    text = text.lower()

    for pattern, replacement in _ABBREVIATIONS:
        text = pattern.sub(replacement, text)

    # Collapse every repeated run to one character in a single pass
    # (c, l, n, r are exempt), then cap the exempt letters at two repeats.
    text = _REPEATED_CHAR.sub(r'\1', text)
    text = _TRIPLED_CHAR.sub(r'\1\1', text)

    text = _LAUGHTER.sub('jaja', text)      # normalize laughter

    text = _WORD_SEPARATORS.sub(' ', text)  # symbols between words -> spaces
    text = _DIGITS.sub('', text)            # remove numbers

    text = _PUNCTUATION.sub('', text)       # remove remaining punctuation
    text = _SINGLE_CHAR.sub(' ', text)      # remove stray single characters
    text = _WHITESPACE.sub(' ', text)       # collapse whitespace

    # Drop anything that cannot be encoded as Latin-1 (e.g. emoji).
    text = text.encode('latin', 'ignore').decode('latin')

    # Same value as max_dictionary_edit_distance used to build sym_spell.
    max_edit_distance_lookup = 3
    suggestions = sym_spell.lookup_compound(text, max_edit_distance_lookup)
    if suggestions:
        # Keep the uncorrected text if the spell checker returns nothing.
        text = suggestions[0].term

    tokens = nlp(text)
    new_text = ' '.join([t.lemma_ for t in tokens])

    return [new_text]

In [5]:
# Spanish spaCy pipeline; clean_text() calls it to lemmatize tokens.
nlp = spacy.load("es_core_news_md")

# Spell-correction index used by clean_text(). The edit distance here must be
# at least the max_edit_distance_lookup (3) that clean_text() passes to
# lookup_compound.
sym_spell = SymSpell(
    max_dictionary_edit_distance=3,
    prefix_length=7,
    count_threshold=1,
    compact_level=5,
)

# Frequency dictionary: term in column 0, count in column 1.
dictionary_path = 'resources/es_real_freq_full.txt'
dictionary_loaded = sym_spell.load_dictionary(
    corpus=dictionary_path,
    term_index=0,
    count_index=1,
    encoding='utf-8',
)
# load_dictionary reports failure through its return value instead of raising;
# fail fast so a missing file cannot silently disable spell correction.
if not dictionary_loaded:
    raise FileNotFoundError(
        f"SymSpell dictionary could not be loaded from {dictionary_path!r}"
    )
dictionary_loaded

True

In [3]:
# Trained Keras network that scores the preprocessed text.
model = load_model('NN_models/Embedding_LSTM-GRU_lemma.h5')

# Tokenizer pickled alongside the model; the `with` block closes the file
# handle once it is read.
# NOTE(review): pickle.load executes arbitrary code from the file — only load
# tokenizer files that come from a trusted source.
with open('models/tokenizer_embedding_lstm-gru.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

W0717 11:38:37.048037  6532 deprecation_wrapper.py:119] From C:\Users\PREVENCION-1\Anaconda3\envs\ml_service\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0717 11:38:37.592608  6532 deprecation_wrapper.py:119] From C:\Users\PREVENCION-1\Anaconda3\envs\ml_service\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0717 11:38:37.691098  6532 deprecation_wrapper.py:119] From C:\Users\PREVENCION-1\Anaconda3\envs\ml_service\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0717 11:38:38.456313  6532 deprecation_wrapper.py:119] From C:\Users\PREVENCION-1\Anaconda3\envs\ml_service\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf

In [4]:
# Print the layer-by-layer architecture and parameter counts of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 20, 100)           1200000   
_________________________________________________________________
lstm_10 (LSTM)               (None, 20, 192)           225024    
_________________________________________________________________
bidirectional_4 (Bidirection (None, 20, 32)            26752     
_________________________________________________________________
gru_5 (GRU)                  (None, 64)                18624     
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 65        
Total params: 1,470,465
Trainable params: 1,470,465
Non-trainable params: 0
_________________________________________________________________


In [114]:
# Sequence length the inputs are padded/truncated to (the model summary shows
# a 20-step input).
max_len = 20

text = """Que no debo imaginarme cosas negativas. Debo pensar positivo y creer que mi esposo me ama
"""

# Lower score bound -> event label, checked from the highest band down.
# Anything below the last bound falls through to "Sentiment_N+".
score_bands = (
    (0.8, "Sentiment_P+"),
    (0.60, "Sentiment_P"),
    (0.40, "Sentiment_NEU"),
    (0.20, "Sentiment_N"),
)

start = time()
print(f"\nInput text: {text}")

# Preprocess; clean_text returns a one-element list.
clean = clean_text(text)
print(f"Text after preprocessing: {clean[0]}\n")

# Encode the cleaned text and pad it to the fixed length.
text_tokens = tokenizer.texts_to_sequences(clean)
text_pad = pad_sequences(text_tokens, maxlen=max_len)

# Single sample, single output unit -> take the scalar score.
score = model.predict(text_pad)[0][0]
print(f"Score {score}")

# First band whose lower bound the score reaches.
event = next(
    (label for lower_bound, label in score_bands if score >= lower_bound),
    "Sentiment_N+",
)
print(f"Event: {event}")
print(f"\nProcess took {time()-start} seconds to finish\n")


Input text: Que no debo imaginarme cosas negativas. Debo pensar positivo y creer que mi esposo me ama

Text after preprocessing: que no deber imaginarme coser negativo deber pensar positivo y creer que mi esposar me amar

Score 0.2734161913394928
Event: Sentiment_N

Process took 6.668175935745239 seconds to finish

