In [None]:
from flask import abort, Flask, jsonify, request
from symspellpy.symspellpy import SymSpell, Verbosity
from time import time
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import spacy
import pickle
import re

Using TensorFlow backend.


In [None]:
def clean_text(text):
    """Normalize a raw Spanish text before it is tokenized for the model.

    Steps, in order: lowercase; expand chat abbreviations (q/k, d, x, tmb);
    collapse repeated characters; normalize long laughs to 'jaja'; strip
    punctuation, digits and stray single characters; drop characters that
    cannot be encoded as Latin-1; spell-correct with the module-level
    ``sym_spell``; lemmatize with the module-level spaCy pipeline ``nlp``.

    Args:
        text (str): raw input text.

    Returns:
        list[str]: a one-element list holding the cleaned, lemmatized text.
    """
    text = text.lower()
    text = re.sub(r'\bq\b|\bk\b', 'que', text)  # replace standalone q or k with que
    text = re.sub(r'\bd\b', 'de', text)  # replace standalone d with de
    text = re.sub(r'\bx\b', 'por', text)  # replace standalone x with por
    text = re.sub(r'\btmb\b', 'también', text)  # replace tmb with también

    # The negated class excludes c, l, n, r (and, as written, the literal
    # characters '(', ',', ')', 'L' and '0'); any other character that is
    # repeated consecutively is collapsed to a single occurrence.
    duplicates = re.compile(r'([^(c,l,n,r)L0])\1{1,}')
    double_clnr = re.compile(r"(.)\1{2,}")
    # One substitution pass collapses every run at once; it yields the same
    # text as repeatedly searching for a run and replacing it until none is left.
    text = duplicates.sub(r'\1', text)
    text = double_clnr.sub(r"\1\1", text)  # runs of 3+ that survived (e.g. c, l, n, r) are capped at two

    text = re.sub(r'([ja]{5,}|[je]{5,}|[ji]{5,}|[ha]{5,}|[he]{5,})', 'jaja', text)  # normalize long laughs

    text = re.sub(r'(\.|,|:|;|!|\?|\[|\]|\(|\))', ' ', text)  # replace symbols between words with spaces
    text = re.sub(r'\d+', '', text)  # remove numbers

    # Raw string: the literal contains a backslash, which is an invalid escape otherwise.
    text = re.sub(r'[%s]' % re.escape(r"""¿¡!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~…"""), '', text)  # remove punctuation
    text = re.sub(r'\b[^aeyou]\b', ' ', text)  # blank out standalone single chars other than a, e, y, o, u
    text = re.sub(r'\s+', ' ', text)  # collapse extra whitespace

    # Drop any character that cannot be represented in Latin-1.
    text = text.encode('latin', 'ignore').decode('latin')

    max_edit_distance_lookup = 3
    text = sym_spell.lookup_compound(text, max_edit_distance_lookup)[0].term  # keep the top suggestion

    tokens = nlp(text)
    new_text = ' '.join(t.lemma_ for t in tokens)

    return [new_text]

In [None]:
# spaCy pipeline that clean_text uses to lemmatize tokens.
nlp = spacy.load("es_core_news_md")

# Spell checker that clean_text uses; the dictionary edit distance (3) matches
# the edit distance clean_text passes to lookup_compound.
sym_spell = SymSpell(
    max_dictionary_edit_distance=3,
    prefix_length=7,
    count_threshold=1,
    compact_level=5,
)

SYMSPELL_DICTIONARY_PATH = 'resources/es_real_freq_full.txt'

# load_dictionary reports a missing file by returning False rather than
# raising, which would silently leave the spell checker without a dictionary.
if not sym_spell.load_dictionary(corpus=SYMSPELL_DICTIONARY_PATH,term_index=0,count_index=1,encoding='utf-8'):
    raise FileNotFoundError(f"SymSpell frequency dictionary not found: {SYMSPELL_DICTIONARY_PATH}")

In [None]:
# Trained Keras model used for every prediction in this notebook.
model = load_model('NN_models/FastText_1CNN-BGRU_lemma.h5')

# NOTE(security): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
# The context manager closes the file handle once the tokenizer is loaded.
with open('models/tokenizer_24_embedding_fasttext_1cnn-bgru.pkl','rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# Show the summary of the loaded model as the cell output.
model.summary()

In [None]:
# Evaluate the loaded model on the labeled validation set and on the test set
# (the test set is restricted to rows labeled 'P' or 'N').
df_validation = pd.read_csv('test_sets/validation_labeled.csv',encoding='utf-8-sig')
df_test= pd.read_csv('test_sets/intertass_all_clean.csv',encoding='utf-8-sig')
df_test = df_test[df_test['sentiments'].isin(['P','N'])]

word_normalization = 'lemma_content_no'  # name of the text column fed to the tokenizer
max_len=24  # length every token sequence is padded/truncated to


def _binary_labels(sentiments):
    """Map sentiment labels to 1 for 'P' and 0 for anything else, keeping the index."""
    return pd.Series([1 if label=='P' else 0 for label in sentiments],index = sentiments.index)


def _encode_texts(texts):
    """Return (token sequences, padded sequences) for a Series of texts."""
    tokens = tokenizer.texts_to_sequences(texts.values)
    return tokens, pad_sequences(tokens,maxlen=max_len)


def _predict_labels(padded):
    """Return hard class predictions for a padded batch.

    Stand-in for the deprecated ``model.predict_classes``: argmax over the
    last axis for multi-unit outputs, otherwise a 0.5 threshold on the single
    output unit.
    """
    proba = model.predict(padded)
    if proba.shape[-1] > 1:
        return proba.argmax(axis=-1)
    return (proba > 0.5).astype('int32')


def _print_evaluation(title, y_true, y_pred, accuracy_label):
    """Print the confusion matrix, classification report and accuracy for one data set."""
    print(title)
    print(f"Confusion Matrix:\n\n{confusion_matrix(y_true,y_pred)}\n")
    print(classification_report(y_true,y_pred,labels=[0,1],target_names=['N','P'], digits = 3))
    print(f"\n{accuracy_label}: {accuracy_score(y_true, y_pred)}\n")


X_validation,y_validation = df_validation[word_normalization],_binary_labels(df_validation['sentiment'])
X_validation_tokens,X_validation_pad = _encode_texts(X_validation)

X_test,y_test = df_test[word_normalization],_binary_labels(df_test['sentiments'])
X_test_tokens,X_test_pad = _encode_texts(X_test)

pred1 = _predict_labels(X_validation_pad)
pred2 = _predict_labels(X_test_pad)

_print_evaluation("VALIDATION SET", y_validation, pred1, "Model Validation set Accuracy")

print("-"*80)
_print_evaluation("TEST VALIDATION SET", y_test, pred2, "Test set Accuracy")



In [None]:
def _score_to_label(score):
    """Bucket a model score into 'P+', 'P', 'NEU', 'N' or 'N+'.

    Lower bounds are inclusive so that every score lands in a bucket. With
    strict comparisons on both sides, a score exactly equal to 0.80, 0.60 or
    0.40 matched no range and fell through to 'N+'.
    """
    if score >= 0.80:
        return 'P+'
    if score >= 0.60:
        return 'P'
    if score >= 0.40:
        return 'NEU'
    if score >= 0.20:
        return 'N'
    return 'N+'


# Raw model output for each validation row, plus its five-level label.
df_validation['Model_score'] = model.predict(X_validation_pad)
df_validation['Model_value'] = [_score_to_label(t) for t in df_validation['Model_score']]

df_validation.head()

In [None]:
# Single-text demo: clean, encode, score and bucket one input sentence,
# timing the whole round trip.
text = """ Soy una persona valiosa y especial. Debo aprender a quererme más
"""

start = time()
print(f"\nInput text: {text}")

clean = clean_text(text)
print(f"Text after preprocessing: {clean[0]}\n")

text_tokens = tokenizer.texts_to_sequences(clean)
text_pad = pad_sequences(text_tokens, maxlen=max_len)
score = model.predict(text_pad)[0][0]
print(f"Score {score}")

# Lower bounds are inclusive and checked from highest to lowest; a score
# below every bound is labeled "N+".
event = next(
    (
        label
        for lower_bound, label in ((0.8, "P+"), (0.60, "P"), (0.40, "NEU"), (0.20, "N"))
        if score >= lower_bound
    ),
    "N+",
)
print(f"Event: {event}")

elapsed = time() - start
print(f"\nProcess took {elapsed} seconds to finish\n")