# Word Embeeding e LSTM

Quando os dados são textos, devemos de alguma forma transformar estes em vetores numéricos para que a maioria dos modelos possam ser treinados. Veremos algumas formas de vetorizar texto.

Utilizaremos dados do Twitter que podem ser baixados em:

- https://www.kaggle.com/datasets/kazanova/sentiment140
- target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

In [16]:
import pandas as pd
import numpy as np

# texto
import re
import string
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import word_tokenize # Slow!
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# modelo
from sklearn.naive_bayes import BernoulliNB

# métricas
from sklearn.metrics import classification_report

# utils
from collections import Counter
from sklearn.model_selection import train_test_split

# keras
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Input, Flatten, GRU, Dense, LSTM, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model, Sequential

In [17]:
df = pd.read_csv(
    "../datasets/Tweets/tweets.csv", encoding='latin-1',
    names=["Sentiment", "ID", "Date", "Query", "User", "Tweet"]
)
df.head(3)

## Limpeza de dados textuais

In [None]:
# removendo colunas desnecessárias
df.drop(['ID', 'Date', 'Query', 'User'], axis=1, inplace=True)

In [None]:
# deixando todos os caracteres minúsculos
df['Tweet'] = df['Tweet'].map(str.lower)

In [None]:
# removendo pontuações e marcações de usuários
def RemovePunctuation(x):
    x = ' '.join(re.sub("(@[A-Za-z0-9]+)"," ",x).split())
    return re.sub("["+string.punctuation+"]", r" ", x)
df['Tweet'] = df['Tweet'].map(RemovePunctuation)

In [None]:
# removendo números
def RemoveNumbers(x):
    res = re.sub(r'[0-9]+',r' ',x)
    return res
df['Tweet'] = df['Tweet'].map(RemoveNumbers)

In [None]:
# removendo stopwords
df['Tweet'] = df['Tweet'].map(remove_stopwords)

In [None]:
# removendo caracteres isolados
def removeSingleChars(text):
    words = text.split()
    return " ".join([w for w in words if len(w) > 1])
df['Tweet'] = df['Tweet'].map(removeSingleChars)

In [None]:
### PULAR ###

# extraindo a raiz da palavra
# para português: # st = nltk.SnowballStemmer('portuguese')
# Lemmatization vs Stemmering (Lemma is a word)

st = PorterStemmer()
def stemming_on_text(data):
    data_split = data.split()
    text = [st.stem(word) for word in data_split]
    return text
df['Tweet']= df['Tweet'].map(stemming_on_text)

In [None]:
lm = WordNetLemmatizer()
def lemmatizer_on_text(data):
    data_split = data.split()
    text = [lm.lemmatize(word) for word in data_split]
    return text
df['Tweet'].map(lemmatizer_on_text)

0          [http, twitpic, com, zl, awww, bummer, shoulda...
1          [upset, update, facebook, texting, result, sch...
2            [dived, time, ball, managed, save, rest, bound]
3                                  [body, feel, itchy, like]
4                                            [behaving, mad]
                                 ...                        
1599995                [woke, having, school, best, feeling]
1599996    [thewdb, com, cool, hear, old, walt, interview...
1599997                 [ready, mojo, makeover, ask, detail]
1599998    [happy, th, birthday, boo, alll, time, tupac, ...
1599999                              [happy, charitytuesday]
Name: Tweet, Length: 1600000, dtype: object

In [None]:
df.to_csv('../datasets/Tweets/clean_tweets.csv', index=None, encoding="utf-8")

## Limpando a memória

In [None]:
import sys
def sizeof_fmt(num, suffix='B'):
    ''' by Fred Cirera,  https://stackoverflow.com/a/1094933/1870254, modified'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f %s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f %s%s" % (num, 'Yi', suffix)

for name, size in sorted(((name, sys.getsizeof(value)) for name, value in list(
                          locals().items())), key= lambda x: -x[1])[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))

                             X: 691.3 MiB
                       X_train: 656.7 MiB
                        X_test: 34.6 MiB
                             y: 24.3 MiB
                       y_train: 23.0 MiB
                     all_words:  8.0 MiB
                        y_test:  1.2 MiB
                        y_pred: 621.0 KiB
                           _14:  2.4 KiB
                           _15:  2.4 KiB


In [None]:
del X
del X_train
del X_test

## Continuando ...

# Vetorização com TF-IDF

In [None]:
df = pd.read_csv('../datasets/Tweets/clean_tweets.csv')
df.dropna(inplace=True)
X = df['Tweet']
y = df['Sentiment']
del df
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.05, random_state =42)

https://towardsdatascience.com/tf-term-frequency-idf-inverse-document-frequency-from-scratch-in-python-6c2b61b78558

In [None]:
%% time
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000)
vectoriser.fit(X_train)

In [None]:
%% time
X_train = vectoriser.transform(X_train)
X_test  = vectoriser.transform(X_test)

In [None]:
BNBmodel = BernoulliNB()
BNBmodel.fit(X_train, y_train)

In [None]:
y_pred = BNBmodel.predict(X_test)
print(classification_report(y_pred, y_test)) # 77%

              precision    recall  f1-score   support

           0       0.77      0.78      0.77     39447
           4       0.78      0.77      0.77     40033

    accuracy                           0.77     79480
   macro avg       0.77      0.77      0.77     79480
weighted avg       0.77      0.77      0.77     79480



## Vetorização com OneHot

In [17]:
df = pd.read_csv('../datasets/Tweets/clean_tweets.csv')
df.dropna(inplace=True)
X = df['Tweet']
y = df['Sentiment']

In [17]:
all_words = set()
for row in df['Tweet'].values:
    for word in row.split():
        all_words.add(word)
len(all_words)

In [None]:
%%time
# não haverá unicidade com 500
# mesma palavra mesmo encoding (não considera contexto)
# vetor muito grande se unicidade
df["Tweet"] = df["Tweet"].map(lambda x: one_hot(x, 500))

CPU times: total: 8.36 s
Wall time: 11.5 s


In [None]:
df["Tweet"].head()

0    [19, 151, 310, 370, 355, 200, 49, 304, 373, 31...
1             [271, 111, 282, 136, 318, 401, 346, 311]
2                    [379, 362, 265, 401, 499, 34, 47]
3                                  [347, 155, 192, 28]
4                                            [312, 62]
Name: Tweet, dtype: object

In [None]:
df["Tweet_count_words"] = df["Tweet"].map(len)
print(df['Tweet_count_words'].max(), df['Tweet_count_words'].min())

114 1


In [None]:
X = pad_sequences(
    df["Tweet"],
    maxlen=114,
    padding='post',
    truncating='post',
    value=0.0)
y = df['Sentiment']

In [None]:
X[:2]

array([[ 19, 151, 310, 370, 355, 200,  49, 304, 373, 310,  46,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [271, 111, 282, 136, 318, 401, 346, 311,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   

In [None]:
del df
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.05, random_state =42)

In [None]:
BNBmodel = BernoulliNB()
BNBmodel.fit(X_train, y_train)
y_pred = BNBmodel.predict(X_test)
print(classification_report(y_pred, y_test)) # 77%

              precision    recall  f1-score   support

           0       0.59      0.50      0.55     46864
           4       0.41      0.50      0.46     32616

    accuracy                           0.50     79480
   macro avg       0.50      0.50      0.50     79480
weighted avg       0.52      0.50      0.51     79480



## Exercícios

Resolva o mesmo problema transformando textos com as duas técnicas abaixo:

Count vectorizing (1,2 Ngrams), Bag of words e One Hot Encoding N-Grams
- https://medium.com/analytics-vidhya/fundamentals-of-bag-of-words-and-tf-idf-9846d301ff22

## Vetorização com Embeedings

https://www.kaggle.com/code/rajmehra03/a-detailed-explanation-of-keras-embedding-layer

In [6]:
df = pd.read_csv('clean_tweets.csv')
df.dropna(inplace=True)
X = df['Tweet']
y = df['Sentiment'].map({0:0, 4:1})

In [5]:
all_words = set()
for row in df['Tweet'].values:
    for word in row.split():
        all_words.add(word)
len(all_words)

314082

In [7]:
df["Tweet_count_words"] = df["Tweet"].map(len)
print(df['Tweet_count_words'].max(), df['Tweet_count_words'].min())

362 2


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.05, random_state =42)

In [9]:
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [10]:
word_index=tokenizer.word_index
vocab_size = len(word_index)+1

In [11]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [12]:
maxlen = 100
X_train_pad = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test_pad = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [13]:
model = Sequential([
    Embedding(vocab_size, 20, input_length=maxlen),
    Bidirectional(LSTM(64)),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

# compiles model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [14]:
%%time
model.fit(X_train_pad, y_train, batch_size=512, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2
CPU times: user 2min 37s, sys: 5.75 s, total: 2min 42s
Wall time: 2min 56s


<keras.callbacks.History at 0x7c7a63f8dcc0>

In [15]:
y_pred = model.predict(X_test_pad)
print(classification_report(y_pred.ravel().round(0), y_test)) # 77%

              precision    recall  f1-score   support

         0.0       0.78      0.79      0.78     39349
         1.0       0.79      0.78      0.78     40131

    accuracy                           0.78     79480
   macro avg       0.78      0.78      0.78     79480
weighted avg       0.78      0.78      0.78     79480



- https://www.analyticsvidhya.com/blog/2022/01/sentiment-analysis-with-lstm/

# Atividade avaliativa

Faça uma submissão no desafio https://www.kaggle.com/competitions/dogs-vs-cats-redux-kernels-edition/submissions

- **Meta 10**: 0.4
- **Meta 07**: 1.2
- **Meta 04**: 7
- **Meta 00**: >= 18