In [None]:
'''
🔎 O que são Word Embeddings?
Word Embeddings transformam palavras em vetores densos e contínuos, onde o significado semântico é preservado pela distância entre os vetores.

✅ Exemplo:
king - man + woman ≈ queen

✅ Técnicas famosas:
Word2Vec
GloVe
FastText
Embedding Layer (Keras / Hugging Face)

📚 Como funciona o Word Embedding em uma Rede Neural
📌 Pipeline Geral:
Tokenização → números
Embedding Layer → vetor denso
Rede Neural (MLP, CNN, RNN, LSTM) → aprende padrões
Classificação / Regressão
'''

In [None]:
!pip install keras
!pip install tensorflow

In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Dataset de exemplo
texts = [
    "Eu amo esse filme",
    "Esse filme é péssimo",
    "Que filme maravilhoso",
    "Horrível, não gostei",
    "Gostei muito",
    "Péssimo, muito ruim"
]
labels = [1, 0, 1, 0, 1, 0]  # 1 = Positivo, 0 = Negativo

# 1. Tokenização
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(texts)
X = tokenizer.texts_to_sequences(texts)

# 2. Padding
X = pad_sequences(X, maxlen=10)

# 3. Construindo a Rede com Embedding
model = Sequential()
model.add(Embedding(input_dim=1000, output_dim=64, input_length=10))  # Word Embedding
model.add(LSTM(64))                                                  # Captura dependências
model.add(Dense(1, activation='sigmoid'))                            # Saída binária

# 4. Compilação
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 5. Treinamento
model.fit(X, np.array(labels), epochs=10, verbose=1)




Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step - accuracy: 0.8333 - loss: 0.6906
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - accuracy: 0.8333 - loss: 0.6883
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.8333 - loss: 0.6860
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 1.0000 - loss: 0.6837
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.6812
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 1.0000 - loss: 0.6785
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - accuracy: 1.0000 - loss: 0.6757
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 1.0000 - loss: 0.6726
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x7a3a763a7910>

In [2]:
# Teste com frase nova
new_text = ["O filme é maravilhoso"]
new_seq = tokenizer.texts_to_sequences(new_text)
new_seq = pad_sequences(new_seq, maxlen=10)

# Predição de sentimento
pred = model.predict(new_seq)
print(pred)  # Resultado: valor próximo de 1 (positivo)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 312ms/step
[[0.5119744]]


In [3]:
# Teste com frase nova
new_text = ["O filme é ruim"]
new_seq = tokenizer.texts_to_sequences(new_text)
new_seq = pad_sequences(new_seq, maxlen=10)

# Predição de sentimento
pred = model.predict(new_seq)
print(pred)  # Resultado: valor próximo de 1 (positivo)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
[[0.4881724]]


In [None]:
'''
💾 Resumo das Camadas

Camada - Função
Embedding - Transforma cada palavra num vetor de 64 dimensões (pré-treinado ou aprendido do zero)
LSTM - Captura o significado considerando a sequência (memória de longo prazo)
Dense - Classifica como positivo (1) ou negativo (0)


🎯 Vantagens de usar Embedding + Rede Neural
✅ O modelo aprende representações semânticas das palavras;
✅ Não é necessário manualmente calcular TF-IDF ou BoW;
✅ Captura contexto e ordem das palavras (coisa que BoW não faz).
'''

In [4]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [5]:
!pip install requests



In [6]:
# ✅ Como plugar FastText em uma rede neural (Keras)

import requests

x = requests.get('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.pt.300.vec.gz')
open('cc.pt.300.vec.gz', 'wb').write(x.content)

1271093660

In [8]:
!pip install --upgrade numpy
!pip install --upgrade gensim

Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incom

In [1]:
!pip install --upgrade numpy gensim numba tensorflow

Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [2]:
# Passo 3 - Carregar o FastText no Python

from gensim.models import KeyedVectors

# Carrega o modelo FastText pré-treinado (em .vec ou .bin)
fasttext_model = KeyedVectors.load_word2vec_format('cc.pt.300.vec.gz', binary=False)


In [3]:
# Passo 4 - Criar a Embedding Matrix

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# Seu dataset
texts = ["Eu amo esse filme", "Esse filme é péssimo", "Que filme maravilhoso", "Horrível, não gostei"]
labels = [1, 0, 1, 0]

# Tokeniza o texto
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

# Cria a matriz de embeddings
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for word, i in word_index.items():
    if word in fasttext_model:
        embedding_matrix[i] = fasttext_model[word]
    else:
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))  # Caso não encontre


In [4]:
# Passo 5 - Construir o modelo usando essa embedding

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Padroniza o input
sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=10)

# Monta a rede neural com a Embedding pré-treinada
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],  # Pluga o FastText aqui
                    input_length=10,
                    trainable=False))  # Pode deixar False para não "estragar" o FastText
model.add(LSTM(64))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Treina o modelo
model.fit(X, np.array(labels), epochs=10, verbose=1)


Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.5000 - loss: 0.7032
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5000 - loss: 0.6888
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 1.0000 - loss: 0.6748
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 1.0000 - loss: 0.6610
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 1.0000 - loss: 0.6471
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6331
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 1.0000 - loss: 0.6187
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.6040
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms

<keras.src.callbacks.history.History at 0x7e92d6a0f2d0>

In [5]:
# Teste com frase nova
new_text = ["O filme é maravilhoso"]
new_seq = tokenizer.texts_to_sequences(new_text)
new_seq = pad_sequences(new_seq, maxlen=10)

# Predição de sentimento
pred = model.predict(new_seq)
print(pred)  # Resultado: valor próximo de 1 (positivo)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step
[[0.44475886]]


In [6]:
# Teste com frase nova
new_text = ["O filme é péssimo"]
new_seq = tokenizer.texts_to_sequences(new_text)
new_seq = pad_sequences(new_seq, maxlen=10)

# Predição de sentimento
pred = model.predict(new_seq)
print(pred)  # Resultado: valor próximo de 1 (positivo)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[[0.4218196]]


In [None]:
'''
📌 Resumo - O que você tem agora

✅ FastText plugado
✅ Embedding com 300 dimensões
✅ Rede LSTM aprendendo sobre a sequência
✅ Output de classificação (positivo ou negativo)
'''