In [1]:
from flask import abort, Flask, jsonify, request
from symspellpy.symspellpy import SymSpell, Verbosity
from time import time
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import spacy
import pickle
import re

Using TensorFlow backend.


In [2]:
def clean_text(text, spell_checker=None, lemmatizer=None, max_edit_distance_lookup=3):
    """Normalize an informal Spanish text before it is fed to the tokenizer.

    Steps, in order: lowercase, expand chat abbreviations, collapse repeated
    characters, normalize laughter, strip punctuation/digits/isolated single
    characters, squeeze whitespace, drop characters that are not encodable as
    Latin-1, spell-correct the whole phrase and finally lemmatize it.

    Args:
        text: Raw input string.
        spell_checker: Object exposing ``lookup_compound(text, max_edit_distance)``
            whose first result has a ``.term`` attribute. Defaults to the
            module-level ``sym_spell``.
        lemmatizer: Callable that takes a string and returns an iterable of
            tokens carrying a ``.lemma_`` attribute. Defaults to the
            module-level ``nlp``.
        max_edit_distance_lookup: Edit distance handed to ``lookup_compound``.

    Returns:
        A one-element list containing the cleaned text, so the result can be
        passed directly to ``tokenizer.texts_to_sequences``.
    """
    # Resolve the defaults at call time: the globals are created in a later cell.
    if spell_checker is None:
        spell_checker = sym_spell
    if lemmatizer is None:
        lemmatizer = nlp

    text = text.lower()

    # Expand common chat abbreviations (only when they stand alone as a word).
    text = re.sub(r'\bq\b|\bk\b', 'que', text)    # "q" or "k" -> "que"
    text = re.sub(r'\bd\b', 'de', text)           # "d" -> "de"
    text = re.sub(r'\bx\b', 'por', text)          # "x" -> "por"
    text = re.sub(r'\btmb\b', 'también', text)    # "tmb" -> "también"

    # Collapse any run of a repeated character to a single occurrence, except
    # for c, l, n and r, which may legitimately be doubled.
    # NOTE(review): inside the character class the parentheses, commas, "L"
    # and "0" are literal characters and are therefore excluded as well; this
    # looks unintended but is kept as-is to preserve behavior.
    text = re.sub(r'([^(c,l,n,r)L0])\1{1,}', r'\1', text)
    # Whatever is still repeated three or more times is capped at two.
    text = re.sub(r"(.)\1{2,}", r"\1\1", text)

    # Normalize long laughter ("jajajaja", "jejeje", "hahaha", ...) to "jaja".
    text = re.sub(r'([ja]{5,}|[je]{5,}|[ji]{5,}|[ha]{5,}|[he]{5,})', 'jaja', text)

    # Punctuation that may glue two words together becomes a space first ...
    text = re.sub(r'(\.|,|:|;|!|\?|\[|\]|\(|\))', ' ', text)
    text = re.sub(r'\d+', '', text)  # remove numbers

    # ... and every remaining punctuation symbol is simply dropped.
    text = re.sub(r'[%s]' % re.escape("""¿¡!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~…"""), '', text)
    # Replace isolated single characters other than a, e, y, o, u with a space.
    text = re.sub(r'\b[^aeyou]\b', ' ', text)
    text = re.sub(r'\s+', ' ', text)  # squeeze whitespace

    # Silently drop anything that cannot be represented in Latin-1 (emoji etc.).
    text = text.encode('latin', 'ignore').decode('latin')

    # Phrase-level spelling correction; keep the best suggestion.
    text = spell_checker.lookup_compound(text, max_edit_distance_lookup)[0].term

    lemmas = ' '.join(token.lemma_ for token in lemmatizer(text))

    return [lemmas]

In [3]:
# spaCy pipeline used by clean_text to lemmatize the corrected text.
nlp = spacy.load("es_core_news_md")

# Spell checker used by clean_text; the dictionary edit distance must be at
# least as large as the lookup distance clean_text asks for (3).
sym_spell = SymSpell(
    max_dictionary_edit_distance=3,
    prefix_length=7,
    count_threshold=1,
    compact_level=5,
)

DICTIONARY_PATH = 'resources/es_real_freq_full.txt'

# load_dictionary signals a missing file by returning False rather than
# raising, so check it explicitly instead of continuing with an empty
# dictionary.
if not sym_spell.load_dictionary(corpus=DICTIONARY_PATH, term_index=0, count_index=1, encoding='utf-8'):
    raise FileNotFoundError(f"Frequency dictionary not found: {DICTIONARY_PATH}")

True

In [4]:
# Trained sentiment classifier.
model = load_model('NN_models/FastText_1CNN-BGRU_lemma.h5')

# Tokenizer saved alongside the model. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
with open('models/tokenizer_24_embedding_fasttext_1cnn-bgru.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

W0829 12:12:47.375678  1580 deprecation_wrapper.py:119] From C:\anaconda3\envs\benja\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0829 12:12:47.624739  1580 deprecation_wrapper.py:119] From C:\anaconda3\envs\benja\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0829 12:12:47.750817  1580 deprecation_wrapper.py:119] From C:\anaconda3\envs\benja\lib\site-packages\keras\backend\tensorflow_backend.py:3976: The name tf.nn.max_pool is deprecated. Please use tf.nn.max_pool2d instead.

W0829 12:12:47.770796  1580 deprecation_wrapper.py:119] From C:\anaconda3\envs\benja\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0829 12:12:47.989521  1580 deprecation_wrapper.py:119] From C:\anaconda3\envs\benj

In [5]:
# Print the layer-by-layer architecture of the loaded model (layer types,
# output shapes and parameter counts) as a sanity check after loading.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 24, 300)           18405300  
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 22, 128)           115328    
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 11, 128)           0         
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 32)                13920     
_________________________________________________________________
dropout_7 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_16 (Dense)             (None, 8)                 264       
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 9         
Total para

In [6]:
word_normalization = 'lemma_content_no'  # text column fed to the model
max_len=24                               # padded sequence length


def _binary_labels(df, label_column):
    """Return a 0/1 Series aligned with ``df``: 1 where the label is 'P', else 0."""
    return pd.Series([1 if label == 'P' else 0 for label in df[label_column]], index=df.index)


def _to_padded_sequences(texts):
    """Tokenize a Series of texts and pad the sequences to ``max_len``.

    Returns the raw token sequences and the padded array.
    """
    tokens = tokenizer.texts_to_sequences(texts.values)
    return tokens, pad_sequences(tokens, maxlen=max_len)


def _predict_binary(padded):
    """Threshold the model's single output at 0.5 and return int32 class ids.

    Equivalent to the deprecated ``model.predict_classes`` for a model with
    one output unit, but also works on Keras versions that removed it.
    """
    return (model.predict(padded) > 0.5).astype('int32')


def _print_report(title, accuracy_label, y_true, y_pred):
    """Print confusion matrix, per-class report and accuracy for one data set."""
    print(title)
    # labels=[0, 1] keeps the matrix 2x2 (and consistent with the report
    # below) even if one class is absent from y_true / y_pred.
    print(f"Confusion Matrix:\n\n{confusion_matrix(y_true, y_pred, labels=[0, 1])}\n")
    print(classification_report(y_true, y_pred, labels=[0, 1], target_names=['N', 'P'], digits=3))
    print(f"\n{accuracy_label}: {accuracy_score(y_true, y_pred)}\n")


df_validation = pd.read_csv('validation_sets/validation_labeled.csv',encoding='utf-8-sig')
df_test= pd.read_csv('test_sets/intertass_all_clean.csv',encoding='utf-8-sig')
# Keep only the positive / negative rows of the test set.
df_test = df_test[df_test['sentiments'].isin(['P','N'])]

# NOTE(review): unlike the test set, the validation frame is not filtered to
# P/N, so any other label value is encoded as 0 (negative) — confirm the file
# only contains 'P' and 'N'.
X_validation, y_validation = df_validation[word_normalization], _binary_labels(df_validation, 'sentiment')
X_validation_tokens, X_validation_pad = _to_padded_sequences(X_validation)

X_test, y_test = df_test[word_normalization], _binary_labels(df_test, 'sentiments')
X_test_tokens, X_test_pad = _to_padded_sequences(X_test)

pred1 = _predict_binary(X_validation_pad)
pred2 = _predict_binary(X_test_pad)

_print_report("VALIDATION SET", "Model Validation set Accuracy", y_validation, pred1)

print("-"*80)
_print_report("TEST VALIDATION SET", "Test set Accuracy", y_test, pred2)



VALIDATION SET
Confusion Matrix:

[[48  4]
 [16 31]]

              precision    recall  f1-score   support

           N      0.750     0.923     0.828        52
           P      0.886     0.660     0.756        47

   micro avg      0.798     0.798     0.798        99
   macro avg      0.818     0.791     0.792        99
weighted avg      0.814     0.798     0.794        99


Model Validation set Accuracy: 0.797979797979798

--------------------------------------------------------------------------------
TEST VALIDATION SET
Confusion Matrix:

[[1006  400]
 [ 180  943]]

              precision    recall  f1-score   support

           N      0.848     0.716     0.776      1406
           P      0.702     0.840     0.765      1123

   micro avg      0.771     0.771     0.771      2529
   macro avg      0.775     0.778     0.771      2529
weighted avg      0.783     0.771     0.771      2529


Test set Accuracy: 0.7706603400553579



In [7]:
def score_to_label(score):
    """Map a model score to one of the labels 'P+', 'P', 'NEU', 'N', 'N+'.

    Each band includes its lower bound, so a score sitting exactly on a
    threshold (0.80, 0.60, 0.40, 0.20) lands in the upper band instead of
    falling through every branch to 'N+'.
    """
    if score >= 0.80:
        return 'P+'
    if score >= 0.60:
        return 'P'
    if score >= 0.40:
        return 'NEU'
    if score >= 0.20:
        return 'N'
    return 'N+'


# model.predict returns one column per output unit; flatten it to a 1-D vector
# before storing it as a DataFrame column.
df_validation['Model_score'] = model.predict(X_validation_pad).ravel()
df_validation['Model_value'] = [score_to_label(t) for t in df_validation['Model_score']]

df_validation.head()

Unnamed: 0,id,sesion id,actividad,TEXTO,clean_content_no,clean_content,lemma_content_no,lemma_content,stemm_content_no,stemm_content,Etiquetador 1 (AMG),Etiquetador 2 (ICM),Etiquetador 3 (JGR),Etiquetador 4 (RECA),Etiquetador 5 (JH),sentiment,Model_score,Model_value
0,1239,210,Problematica,"nada, simplemente desepcionado de como.se.dan ...",nada simplemente decepcionado de como se dan l...,nada simplemente decepcionado,nadar simplemente decepcionar de comer se dar ...,nadar simplemente decepcionar,nad simplement decepcion de com se dan las cos,nad simplement decepcion,N,N,N,N,N,N,0.398659,N
1,1245,210,Explicacion,pues ver lo positivo de ello,pues ver lo positivo de ello,positivo,pues ver el positivo de él,positivo,pues ver lo posit de ello,posit,N,P,P,P,P,P,0.826883,P+
2,1248,210,Aprendizaje,darme un tiempo para refelxionar y no nada mas...,darme un tiempo para reflexionar y no nada mas...,darme tiempo reflexionar no nada irme precio p...,darme uno tiempo parir reflexionar y no nadar ...,darme tiempo reflexionar no nadar irme preciar...,darm un tiemp par reflexion y no nad mas irme ...,darm tiemp reflexion no nad irme preci pensamient,P,P,P,P,P,P,0.246394,N
3,1329,226,Obstaculo_Actividad,planeo realizarla por la tarde del día,planeo realizara por la tarde del día,planeo realizara,planear realizar por lo tardar del día,planear realizar,plane realiz por la tard del dia,plane realiz,P,P,N,N,P,P,0.651749,P
4,1345,255,Problematica,las personas a quienes atiendo me hacen enojar...,las personas a quienes atiendo me hacen enojar...,personas atiendo enojar no entienden instrucci...,los personar a quien atender me hacer enojar p...,personar atender enojar no entender instrucció...,las person a quien atiend me hac enoj porqu no...,person atiend enoj no entiend instruccion quier,N,N,N,N,N,N,0.243402,N


In [10]:
# End-to-end demo on a single sentence: preprocess, encode, score, label.
text = """no me gusto el cuarto
"""

start = time()
print(f"\nInput text: {text}")

# clean_text returns a one-element list, which is exactly what the tokenizer expects.
clean = clean_text(text)
print(f"Text after preprocessing: {clean[0]}\n")

text_tokens = tokenizer.texts_to_sequences(clean)
text_pad = pad_sequences(text_tokens, maxlen=max_len)

# One sample, one output unit -> pick the scalar out of the (1, 1) result.
score = model.predict(text_pad)[0][0]
print(f"Score {score}")

# Walk the inclusive lower bounds from the most positive band downwards;
# anything below the last bound stays at the most negative label.
event = "N+"
for lower_bound, label in ((0.8, "P+"), (0.60, "P"), (0.40, "NEU"), (0.20, "N")):
    if score >= lower_bound:
        event = label
        break
print(f"Event: {event}")

print(f"\nProcess took {time()-start} seconds to finish\n")


Input text: no me gusto el cuarto

Text after preprocessing: no me gustar el cuartar

Score 0.20955899357795715
Event: N

Process took 0.3620619773864746 seconds to finish

