## Carga de paquetes

In [122]:
import pandas as pd
import numpy as np
import re
import unidecode
import string
import nltk
import emoji
import csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

In [2]:
pd.set_option('max_colwidth', 500)

## Importar datos dentro de un dataframe de Pandas

In [3]:
df = pd.read_csv('data/simonve_data.csv', sep=';', usecols=['comment_id', 'text', 'polarity'])

In [4]:
df.head()

Unnamed: 0,comment_id,text,polarity
0,38768,donde lo compra a ese precio???? ..... le estan viendo la cara.,10
1,41962,"David Foronda (Podemos): “A favor de regular los transgénicos, hay que tener en cuenta la soberanía alimentar... https://t.co/9dhQHZNfXn",-1
2,46156,"RT @segbruce: 21% la aprueba, 73% la rechaza. Así como va, Michelle Bachelet desaparecerá de las encuestas, ya no la apoyan ni sus votante…",-1
3,52072,RT @rpl2010: #CUIDATUDINEROMV #LANUEVAMAYORÍA O #LAVIEJAPILLERÍA #AFP #INJUSTAS EDUARDO #FREI RICARDO #LAGOS Y MICHELLE #BACHELET https://t…,-1
4,40533,"RT @Chevige: Ricardo Lagos el mismo que utilizó la LEY ANTITERRORISTA DE PINOCHET para reprimir, aplaude a Almagro #NuevaMayoria https://t.…",-1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43527 entries, 0 to 43526
Data columns (total 3 columns):
comment_id    43527 non-null int64
text          43522 non-null object
polarity      43527 non-null int64
dtypes: int64(2), object(1)
memory usage: 1020.2+ KB


## Conteo del número de clasificaciones por comentario

In [6]:
classifications_by_comment = df.groupby(['comment_id'])['comment_id'].agg(['count'])
classifications_by_comment.head()

Unnamed: 0_level_0,count
comment_id,Unnamed: 1_level_1
34624,4
34693,3
34772,3
34863,3
35030,3


## Remoción de comentarios con solo una clasificación

In [7]:
df = df[df.apply(lambda r: classifications_by_comment.loc[r['comment_id']]['count'] > 1, axis=1)]

## Cálculo del valor de clasificación más frecuente

In [8]:
df = df.groupby(['comment_id', 'text'])[['polarity']].agg(pd.Series.mode)

In [9]:
df = df.reset_index()

In [10]:
df.drop(columns=['comment_id'], inplace=True)

In [11]:
df.head()

Unnamed: 0,text,polarity
0,Abogado de Michelle Bachelet otorgó asesoría jurídica a mujer que realizó la denuncia. https://t.co/lV5gnWcfmm,1
1,RT @Alitop_: Faltan 635 dias para que se acabe esta pesadilla llamada Michelle Bachelet #CuentaRegresiva #ChaoBachelet,-1
2,Michelle Bachelet está trotando para estar en forma. Michelle Bachelet está tratando de aprobar sus reformas chavo!! https://t.co/0QAX2Hu2Gh,0
3,"RT @ElLibido: 2/15 Hace pocos días, los “amigos” de @derechatuitera masificaron imagen, sobre supuesto vino de Michelle Bachelet. https://t…",-1
4,"Alcalde de Pozo Almonte, José Fernando Muñoz junto a la Presidenta, Michelle Bachelet e Intendenta de Tarapacá. https://t.co/v1IxZ4D3aG",0


In [12]:
df.count()

text        13711
polarity    13711
dtype: int64

## Remoción de comentarios con más de una clasificación más frecuente (moda)

In [13]:
df = df[df.apply(lambda r: type(r['polarity']) != pd.np.ndarray, axis=1)]

In [14]:
df.count()

text        12685
polarity    12685
dtype: int64

In [15]:
df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5485
0,4647
1,2497
10,56


## Remoción de los indefinidos

In [16]:
df = df[~(df.polarity == 10)]

In [17]:
df.count()

text        12629
polarity    12629
dtype: int64

In [18]:
df.head()

Unnamed: 0,text,polarity
0,Abogado de Michelle Bachelet otorgó asesoría jurídica a mujer que realizó la denuncia. https://t.co/lV5gnWcfmm,1
1,RT @Alitop_: Faltan 635 dias para que se acabe esta pesadilla llamada Michelle Bachelet #CuentaRegresiva #ChaoBachelet,-1
2,Michelle Bachelet está trotando para estar en forma. Michelle Bachelet está tratando de aprobar sus reformas chavo!! https://t.co/0QAX2Hu2Gh,0
3,"RT @ElLibido: 2/15 Hace pocos días, los “amigos” de @derechatuitera masificaron imagen, sobre supuesto vino de Michelle Bachelet. https://t…",-1
4,"Alcalde de Pozo Almonte, José Fernando Muñoz junto a la Presidenta, Michelle Bachelet e Intendenta de Tarapacá. https://t.co/v1IxZ4D3aG",0


## Funciones de limpieza de texto

In [112]:
def remove_tweet_user_mentions(s):
    """Remove twitter users from text."""
    return re.sub(r'@\S+', ' ', s, re.UNICODE)

def remove_tweet_hashtags(s):
    """Remove tweet hashtags."""
    return re.sub(r'#\S+', ' ', s, re.UNICODE)

def remove_links(s):
    """Remove links, urls from text."""
    return re.sub(r'(http|https)\S+', ' ', s, re.UNICODE)

def normalize_string(s):
    """# To lower case and remove accents marks"""
    return unidecode.unidecode(s)

def to_lower(s):
    return s.lower()

def remove_emojis(s):
    """Remove emojis (emoticons) from text."""
    return emoji.get_emoji_regexp().sub(u' ', s)

def remove_punctuation(s):
    punctuation_symbols = string.punctuation
    punctuation_symbols += '¿¡'
    s = s.replace('\u2026', ' ')  # triple dots
    s = s.replace('\u25ba', ' ')  # BLACK RIGHT-POINTING POINTER
    s = s.replace('\u201c', ' ')  # left double quote
    s = s.replace('\u201d', ' ')  # right double quote
    return s.translate(str.maketrans(' ', ' ', punctuation_symbols))

def remove_numbers(s):
    return re.sub(r'\d+', ' ', s)

def remove_stopwords(s):
    stoplist = nltk.corpus.stopwords.words('spanish')
    stoplist.remove('no')
    stoplist.extend([
        'rt',
        'q',
        'd',
        'x'
    ])
    return u' '.join([w for w in s.split() if w not in stoplist])

def remove_extra_whites(s):
    """Remove white characters repetitions from text."""
    return ' '.join(s.split())

def normalize(s):
    s = remove_tweet_user_mentions(s)
    s = remove_tweet_hashtags(s)
    s = remove_links(s)
    s = to_lower(s)
    s = remove_punctuation(s)
    s = remove_stopwords(s)
    s = remove_emojis(s)
    s = remove_numbers(s)
    s = remove_extra_whites(s)
    return s

## Limpieza de texto

In [20]:
normalized_df = df.copy()

In [21]:
normalized_df['text'] = normalized_df['text'].map(normalize)

In [22]:
normalized_df.head()

Unnamed: 0,text,polarity
0,abogado michelle bachelet otorgó asesoría jurídica mujer realizó denuncia,1
1,faltan dias acabe pesadilla llamada michelle bachelet,-1
2,michelle bachelet trotando forma michelle bachelet tratando aprobar reformas chavo,0
3,hace pocos días amigos masificaron imagen supuesto vino michelle bachelet,-1
4,alcalde pozo almonte josé fernando muñoz junto presidenta michelle bachelet intendenta tarapacá,0


## Remoción de comentarios con solo una palabra

In [23]:
normalized_df = normalized_df[normalized_df.apply(lambda r: len(r.text.split()) > 1, axis=1)]

In [24]:
normalized_df.count()

text        12436
polarity    12436
dtype: int64

## Datasets de entrenamiento y prueba

In [25]:
X = normalized_df.text.tolist()
y = normalized_df.polarity.astype(int).tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Naive Bayes

In [26]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [27]:
y_pred = nb.predict(X_test)

In [28]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.610128617363344


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.60      0.87      0.71      1112
           0       0.59      0.53      0.56       881
           1       0.85      0.17      0.29       495

    accuracy                           0.61      2488
   macro avg       0.68      0.52      0.52      2488
weighted avg       0.65      0.61      0.57      2488



## Support Vector Machine

In [30]:
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
sgd.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [31]:
y_pred = sgd.predict(X_test)

In [32]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.6089228295819936


In [33]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.59      0.89      0.71      1112
           0       0.63      0.41      0.50       881
           1       0.67      0.34      0.45       495

    accuracy                           0.61      2488
   macro avg       0.63      0.55      0.55      2488
weighted avg       0.62      0.61      0.58      2488



## Regresión Logística

In [34]:
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
logreg.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=100000.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=

In [35]:
y_pred = logreg.predict(X_test)

In [36]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5727491961414791


In [37]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.66      0.66      0.66      1112
           0       0.50      0.54      0.52       881
           1       0.51      0.43      0.47       495

    accuracy                           0.57      2488
   macro avg       0.56      0.54      0.55      2488
weighted avg       0.57      0.57      0.57      2488



## Naive Bayes - solo positivos y negativos

In [38]:
no_neutral_df = normalized_df[~(normalized_df.polarity == 0)].copy()

In [39]:
no_neutral_df.groupby(['polarity']).count()

Unnamed: 0_level_0,text
polarity,Unnamed: 1_level_1
-1,5433
1,2446


In [40]:
X_no_neutral = no_neutral_df.text
y_no_neutral = no_neutral_df.polarity.astype(int)
X_train_no_neutral, X_test_no_neutral, y_train_no_neutral, y_test_no_neutral = train_test_split(X_no_neutral, y_no_neutral, test_size=0.2, random_state=42)

In [41]:
nb_no_neutral = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb_no_neutral.fit(X_train_no_neutral, y_train_no_neutral)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [42]:
y_pred_no_neutral = nb_no_neutral.predict(X_test_no_neutral)

In [43]:
print('accuracy %s' % accuracy_score(y_pred_no_neutral, y_test_no_neutral))

accuracy 0.7848984771573604


In [44]:
print(classification_report(y_test_no_neutral, y_pred_no_neutral))

              precision    recall  f1-score   support

          -1       0.77      0.99      0.87      1106
           1       0.93      0.30      0.45       470

    accuracy                           0.78      1576
   macro avg       0.85      0.65      0.66      1576
weighted avg       0.82      0.78      0.74      1576



## Naive Bayes - Ngrams

In [45]:
nb = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [46]:
y_pred = nb.predict(X_test)

In [47]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5928456591639871


## Random Forest

In [48]:
rf = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', RandomForestClassifier(n_estimators=100)),
])
rf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0

In [49]:
y_pred = nb.predict(X_test)

In [50]:
print('accuracy %s' % accuracy_score(y_pred, y_test))

accuracy 0.5928456591639871


## Deep Learning

In [144]:
X = normalized_df.text.values
y = pd.get_dummies(normalized_df.polarity).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Encoding

In [149]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [150]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)

In [151]:
X_train[2]

'final nefasto gobierno bachelet vamos terminar comiéndonos mocos orrego'

In [152]:
X_train_sequences[2]

[262, 826, 15, 3, 107, 473, 9910, 6360, 6361]

In [153]:
X_test_sequences = tokenizer.texts_to_sequences(X_test)

In [154]:
for word in ['bachelet', 'final', 'orrego']:
    print('{}: {}'.format(word, tokenizer.word_index[word]))

bachelet: 3
final: 262
orrego: 6361


In [155]:
vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

Dado que las secuencias generadas con texts_to_sequences no poseen un largo uniforme, se utiliza pad_sequence para remediar dicho resultado mediante la adición de ceros a las secuencias hasta homogeneizar el largo de estas últimas.

In [156]:
max_sequence_length = max(len(t) for t in X_train_sequences)
max_sequence_length

578

In [157]:
X_train_padded_sequences = pad_sequences(X_train_sequences, padding='post', maxlen=max_sequence_length)
X_test_padded_sequences = pad_sequences(X_test_sequences, padding='post', maxlen=max_sequence_length)

In [158]:
X_train_padded_sequences.shape

(9948, 578)

In [159]:
X_train_padded_sequences[1,:]

array([   3,  530,   57,  128,   50,    1,   77, 9909,   23,  222,  195,
         94,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

### Embeding Layer

+ [Artículo relevante](https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer)
+ [Documentación embedding layer](https://keras.io/layers/embeddings/)

In [84]:
from keras.models import Sequential
from keras import layers

In [160]:
embedding_dim = 50

In [161]:
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=max_sequence_length))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 578, 50)           1286300   
_________________________________________________________________
flatten_11 (Flatten)         (None, 28900)             0         
_________________________________________________________________
dense_21 (Dense)             (None, 10)                289010    
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 33        
Total params: 1,575,343
Trainable params: 1,575,343
Non-trainable params: 0
_________________________________________________________________


In [162]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=10)

Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [163]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9941
Testing Accuracy:  0.5796


## Embeddings Preentrenados

### Word2Vec

In [164]:
from gensim.models.keyedvectors import KeyedVectors

Características del embedding: 
+ #dimensions = 300
+ #vectors = 1000653

In [68]:
wordvectors_file_vec = 'embeddings/SBW-vectors-300-min5.txt'
wordvectors = KeyedVectors.load_word2vec_format(wordvectors_file_vec)

### Palabra dentro del vocabulario

In [165]:
wordvectors['de']

array([-2.96480e-02,  1.13360e-02,  1.99490e-02, -8.88320e-02,
       -2.52250e-02,  5.68440e-02,  2.54730e-02,  1.40680e-02,
        1.63694e-01, -6.71540e-02,  1.47380e-02,  2.71340e-02,
        6.64430e-02, -4.48460e-02, -4.49870e-02, -4.08980e-02,
        3.03110e-02,  3.41960e-02, -4.92400e-02,  8.53700e-03,
       -6.80910e-02, -8.79380e-02,  3.53000e-02,  1.49385e-01,
       -1.23500e-02,  1.26130e-02,  2.93500e-02,  6.95960e-02,
        3.91110e-02,  5.76520e-02,  6.99540e-02, -6.62170e-02,
       -4.17840e-02,  2.86230e-02,  2.67720e-02, -6.63920e-02,
        2.95300e-03, -1.21880e-02, -3.03630e-02,  4.02220e-02,
        3.48580e-02,  2.74690e-02, -2.90340e-02, -4.87480e-02,
       -3.85820e-02, -5.15530e-02, -3.35010e-02, -1.90080e-02,
        3.04300e-03,  1.10712e-01, -2.50960e-02,  1.11082e-01,
        3.52440e-02,  1.14207e-01,  1.01950e-02,  5.15110e-02,
       -4.06490e-02, -1.13944e-01,  4.48730e-02,  5.20110e-02,
        6.73600e-02,  4.90540e-02, -1.27085e-01, -3.184

### palabra fuera del vocabulario

In [77]:
wordvectors['bachelet']

KeyError: "word 'bachelet' not in vocabulary"

In [190]:
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
embedding_matrix.shape

(25726, 300)

Obtención de los vectores para el vocabulario del corpus de entrenamiento, desde el modelo word2vect preentrenado. Si no se encuentra el vector para alguna palabra (Out of Vocabulary Word), se genera uno aleatorio. 

In [191]:
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    try:
        embedding_vector = wordvectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [192]:
embedding_matrix.shape

(25726, 300)

In [193]:
model = Sequential()
model.add(layers.Embedding(
                    input_dim=vocab_size, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=max_sequence_length,
                    weights=[embedding_matrix],
                    trainable=True
                )
)
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 578, 300)          7717800   
_________________________________________________________________
flatten_17 (Flatten)         (None, 173400)            0         
_________________________________________________________________
dense_33 (Dense)             (None, 10)                1734010   
_________________________________________________________________
dense_34 (Dense)             (None, 3)                 33        
Total params: 9,451,843
Trainable params: 9,451,843
Non-trainable params: 0
_________________________________________________________________


In [194]:
history = model.fit(X_train_padded_sequences, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test_padded_sequences, y_test),
                    batch_size=10)

Train on 9948 samples, validate on 2488 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [195]:
loss, accuracy = model.evaluate(X_train_padded_sequences, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test_padded_sequences, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9943
Testing Accuracy:  0.5744


### Con stopwords

In [173]:
normalized_sw_df = df.copy()
def normalize_with_sw(s):
    s = remove_tweet_user_mentions(s)
    s = remove_tweet_hashtags(s)
    s = remove_links(s)
    s = to_lower(s)
    s = remove_punctuation(s)
    s = remove_emojis(s)
    s = remove_numbers(s)
    s = remove_extra_whites(s)
    return s
    
normalized_sw_df['text'] = normalized_sw_df['text'].map(normalize_with_sw)
normalized_sw_df.head()

Unnamed: 0,text,polarity
0,abogado de michelle bachelet otorgó asesoría jurídica a mujer que realizó la denuncia,1
1,rt faltan dias para que se acabe esta pesadilla llamada michelle bachelet,-1
2,michelle bachelet está trotando para estar en forma michelle bachelet está tratando de aprobar sus reformas chavo,0
3,rt hace pocos días los amigos de masificaron imagen sobre supuesto vino de michelle bachelet,-1
4,alcalde de pozo almonte josé fernando muñoz junto a la presidenta michelle bachelet e intendenta de tarapacá,0


In [180]:
X_sw = normalized_sw_df.text.values
y_sw = pd.get_dummies(normalized_sw_df.polarity).values
X_sw_train, X_sw_test, y_sw_train, y_sw_test = train_test_split(X_sw, y_sw, test_size=0.2, random_state=42)

In [181]:
tokenizer_sw = Tokenizer()
tokenizer_sw.fit_on_texts(X_sw_train)
X_sw_train_sequences = tokenizer_sw.texts_to_sequences(X_sw_train)
X_sw_test_sequences = tokenizer_sw.texts_to_sequences(X_sw_test)
vocab_size_sw = len(tokenizer_sw.word_index) + 1  # Adding 1 because of reserved 0 index
max_sequence_length_sw = max(len(t) for t in X_sw_train_sequences)
X_sw_train_padded_sequences = pad_sequences(X_sw_train_sequences, padding='post', maxlen=max_sequence_length_sw)
X_sw_test_padded_sequences = pad_sequences(X_sw_test_sequences, padding='post', maxlen=max_sequence_length_sw)

In [185]:
EMBEDDING_DIM = 300
embedding_matrix_sw = np.zeros((vocab_size_sw, EMBEDDING_DIM))
embedding_matrix_sw.shape

for word, i in tokenizer.word_index.items():
    if i >= vocab_size_sw:
        continue
    try:
        embedding_vector = wordvectors[word]
        embedding_matrix_sw[i] = embedding_vector
    except KeyError:
        embedding_matrix_sw[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

In [188]:
model = Sequential()
model.add(layers.Embedding(
                    input_dim=vocab_size_sw, 
                    output_dim=EMBEDDING_DIM, 
                    input_length=max_sequence_length_sw,
                    weights=[embedding_matrix_sw],
                    trainable=False
                )
)
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(3, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 400, 300)          7784400   
_________________________________________________________________
flatten_16 (Flatten)         (None, 120000)            0         
_________________________________________________________________
dense_31 (Dense)             (None, 10)                1200010   
_________________________________________________________________
dense_32 (Dense)             (None, 3)                 33        
Total params: 8,984,443
Trainable params: 1,200,043
Non-trainable params: 7,784,400
_________________________________________________________________


In [189]:
history = model.fit(X_sw_train_padded_sequences, y_sw_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_sw_test_padded_sequences, y_sw_test),
                    batch_size=10)

Train on 10103 samples, validate on 2526 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### Convolutional Neural Networks

[model reference](https://arxiv.org/abs/1408.5882)

In [110]:
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers