**Álgebra Linear**

In [1]:
import numpy as np

vector1 = np.array([1., 2., 4., 3.])
vector2 = np.ones(4)
vector3 = np.zeros(4)

print(vector1)
print(vector2)
print(vector3)

[1. 2. 4. 3.]
[1. 1. 1. 1.]
[0. 0. 0. 0.]


**Soma**

In [2]:
vector1 + vector2

array([2., 3., 5., 4.])

**Subtração**

In [3]:
vector1 - vector2

array([0., 1., 3., 2.])

**Multiplicação**

In [4]:
vector1 * vector3

array([0., 0., 0., 0.])

**Divisão**

In [5]:
vector1 / vector2

array([1., 2., 4., 3.])

**Multiplicação de Matrizes**

In [6]:
np.dot(vector1, vector2)

10.0

**Similaridade por Cossenos**

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

po = np.array([5, 10])
mestre_tigresa = np.array([7.5, 2.5])

cosine_similarity([po], [mestre_tigresa])[0][0]

0.7071067811865475

**Representação One-Hot**

In [8]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

X = [["no"], ["meio"], ["do"], ["caminho"], ["tinha"], ["uma"], ["pedra"]]

enc.fit(X)
vocab = enc.categories_[0]
vetores = enc.transform(X).toarray()

vetores

array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0., 0.]])

**Vetor One-Hot de pedra**

In [9]:
vetores[np.where(vocab == 'pedra')[0][0]]

array([0., 0., 0., 0., 0., 1., 0.])

**Matriz de Frequência Termo-Documento**

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['no meio do caminho tinha uma pedra',
'tinha uma pedra no meio do caminho',
'tinha uma pedra',
'no meio do caminho tinha uma pedra']

vectorizer = CountVectorizer()

vetores = vectorizer.fit_transform(corpus)

vocab = vectorizer.get_feature_names_out()

print(vetores.toarray())
print(vocab)

[[1 1 1 1 1 1 1]
 [1 1 1 1 1 1 1]
 [0 0 0 0 1 1 1]
 [1 1 1 1 1 1 1]]
['caminho' 'do' 'meio' 'no' 'pedra' 'tinha' 'uma']


**Acessar o vetor da palavra meio**

In [11]:
vetores[:, np.where(vocab == 'meio')[0][0]].transpose().toarray()

array([[1, 1, 0, 1]])

**Acessando o vetor do verso 3: tinha uma pedra**

In [12]:
vetores[2, :].toarray()

array([[0, 0, 0, 0, 1, 1, 1]])

**Customizando o contador com um tokenizador próprio**

In [13]:
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer

def tokenize(texto):
  return nltk.word_tokenize(texto)

corpus = ['no meio do caminho tinha uma pedra',
'tinha uma pedra no meio do caminho',
'tinha uma pedra',
'no meio do caminho tinha uma pedra']

vectorizer = CountVectorizer(tokenizer=tokenize)

vetores = vectorizer.fit_transform(corpus)

vocab = vectorizer.get_feature_names_out()
vetores.toarray(), vocab

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


(array([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]]),
 array(['caminho', 'do', 'meio', 'no', 'pedra', 'tinha', 'uma'],
       dtype=object))

**Matriz de frequência termo-termo**

In [14]:
corpus = ['no meio do caminho tinha uma pedra',
'tinha uma pedra no meio do caminho',
'tinha uma pedra',
'no meio do caminho tinha uma pedra']

corpus_tok = [verso.split() for verso in corpus]

vocab = ["no", "meio", "do", "caminho", "tinha", "uma", "pedra"]
vetores = np.zeros((len(vocab), len(vocab)))

for verso in corpus_tok:
  for i, w1 in enumerate(vocab):
    for j, w2 in enumerate(vocab):
      if i != j:
        if w1 in verso and w2 in verso:
          vetores[i, j] += 1

print('Vocabulário: ')
print(vocab)
print()
print('Matriz: ')
print(vetores)

Vocabulário: 
['no', 'meio', 'do', 'caminho', 'tinha', 'uma', 'pedra']

Matriz: 
[[0. 3. 3. 3. 3. 3. 3.]
 [3. 0. 3. 3. 3. 3. 3.]
 [3. 3. 0. 3. 3. 3. 3.]
 [3. 3. 3. 0. 3. 3. 3.]
 [3. 3. 3. 3. 0. 4. 4.]
 [3. 3. 3. 3. 4. 0. 4.]
 [3. 3. 3. 3. 4. 4. 0.]]


**Remoção de palavras vazias (stopwords)**

In [15]:
import nltk
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')
from sklearn.feature_extraction.text import CountVectorizer 

def tokenize(texto):
  return nltk.word_tokenize(texto)

corpus = ['no meio do caminho tinha uma pedra',
'tinha uma pedra no meio do caminho',
'tinha uma pedra',
'no meio do caminho tinha uma pedra']

vectorizer = CountVectorizer(tokenizer=tokenize, stop_words=stopwords)

vetores = vectorizer.fit_transform(corpus)

vocab = vectorizer.get_feature_names_out()
vetores.toarray(), vocab

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


(array([[1, 1, 1],
        [1, 1, 1],
        [0, 0, 1],
        [1, 1, 1]]),
 array(['caminho', 'meio', 'pedra'], dtype=object))

**TF-IDF**

In [16]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

corpus = ['ainda que mal pergunte',
'ainda que mal respondas',
'ainda que mal te entenda',
'ainda que mal repitas']

pipe = Pipeline([
    ('count', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])

vetores = pipe.fit_transform(corpus)
vocab = pipe['count'].get_feature_names_out()

print(vocab)
np.round(vetores.toarray(), 2)

['ainda' 'entenda' 'mal' 'pergunte' 'que' 'repitas' 'respondas' 'te']


array([[0.39, 0.  , 0.39, 0.74, 0.39, 0.  , 0.  , 0.  ],
       [0.39, 0.  , 0.39, 0.  , 0.39, 0.  , 0.74, 0.  ],
       [0.31, 0.6 , 0.31, 0.  , 0.31, 0.  , 0.  , 0.6 ],
       [0.39, 0.  , 0.39, 0.  , 0.39, 0.74, 0.  , 0.  ]])

In [17]:
verso1 = vetores[0, :]
verso2 = vetores[1, :]

cosine_similarity(verso1, verso2)[0][0]

0.4496288200064899

**Word Embeddings**

In [19]:
!wget http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s50.zip
!unzip download.php?file=embeddings%2Fword2vec%2Fcbow_s50.zip

--2023-04-20 16:13:15--  http://143.107.183.175:22980/download.php?file=embeddings/word2vec/cbow_s50.zip
Connecting to 143.107.183.175:22980... connected.
HTTP request sent, awaiting response... 200 OK
Length: 170360268 (162M) [application/octet-stream]
Saving to: ‘download.php?file=embeddings%2Fword2vec%2Fcbow_s50.zip’


2023-04-20 16:13:30 (10.7 MB/s) - ‘download.php?file=embeddings%2Fword2vec%2Fcbow_s50.zip’ saved [170360268/170360268]

Archive:  download.php?file=embeddings%2Fword2vec%2Fcbow_s50.zip
  inflating: cbow_s50.txt            


**Inicializando os word embeddings**

In [20]:
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load_word2vec_format('cbow_s50.txt')

**Acessando o word embedding da palavra menino**

In [21]:
word2vec["menino"]

array([ 0.047754, -0.190243,  0.290581,  0.035822,  0.2301  , -0.139099,
       -0.232351, -0.119084,  0.327645,  0.160017, -0.5318  ,  0.093309,
       -0.545777, -0.166715,  0.044872, -0.094386, -0.017529, -0.053898,
        0.189092, -0.233779, -0.302459,  0.707696, -0.146762,  0.258651,
        0.25436 , -0.071892,  0.132296, -0.072721,  0.162642,  0.348834,
        0.129191, -0.030967,  0.048024,  0.26683 , -0.076066,  0.352168,
        0.629779, -0.403468, -0.473612,  0.456509,  0.008285,  0.066872,
        0.082632, -0.128989,  0.107645,  0.119981,  0.219388, -0.141599,
       -0.20074 , -0.30657 ], dtype=float32)

**Palavras mais semelhantes ao verbo estudar**

In [22]:
word2vec.most_similar("estudar")

[('pesquisar', 0.8674625158309937),
 ('ensinar', 0.8485606908798218),
 ('leccionar', 0.8399008512496948),
 ('moldar', 0.8285380601882935),
 ('desenvolver', 0.8203483819961548),
 ('focalizar', 0.8188527226448059),
 ('cursar', 0.8175848126411438),
 ('projectar', 0.8173723816871643),
 ('desenhar', 0.8155598044395447),
 ('enriquecer', 0.8142802715301514)]

**Similaridade por cosseno entre os word embeddings das palavras menino e cachorro**

In [23]:
word2vec.similarity("menino", "cachorro")

0.8441181

**Inferência lógica para: odiar está para odiando, assim como amar está para...**

In [24]:
word2vec.most_similar(positive=["amar", "odiando"], negative=["odiar"])

[('amando', 0.7472065687179565),
 ('desperto', 0.7231095433235168),
 ('quieto', 0.6835169196128845),
 ('tranqüilo', 0.6812532544136047),
 ('surdo', 0.6798273921012878),
 ('louco', 0.6784767508506775),
 ('quieta', 0.6757060289382935),
 ('sã³brio', 0.6748781204223633),
 ('rouco', 0.6719405651092529),
 ('sossegado', 0.6716687679290771)]

**BERTimbau**

In [25]:
!pip3 install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [26]:
import torch
from transformers import AutoTokenizer #or BertTokenizer
from transformers import AutoModelForPreTraining #or BertForPreTraining for loading pretraining heads
from transformers import AutoModel #or BertModel, for BERT without pretraining heads

In [27]:
device = 'cpu' #torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-large-portuguese-cased', do_lower_case=False)
bert = AutoModel.from_pretrained('neuralmind/bert-large-portuguese-cased')
bert = bert.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at neuralmind/bert-large-portuguese-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
texto = 'Eu vou ao banco pagar a conta hoje.'

#tokenizando o texto
input_ids = tokenizer.encode(texto, return_tensors='pt')
wordpieces = tokenizer.convert_ids_to_tokens(input_ids[0])

#salvando ponteiros para palavras
subwords_idx = [] #primeira subpalavra de cada palavra
for i, wordpiece in enumerate(wordpieces):
  if '##' not in wordpiece and i not in [0, len(wordpieces)-1]:
    subwords_idx.append(i)

#obtendo os vetores para as palavras
inputs_ids = input_ids.to(device)
with torch.no_grad():
  outs = bert(input_ids)
  vetores = outs[0][0, :]

vetores[subwords_idx], vetores[subwords_idx].size()

(tensor([[ 0.7381,  0.6351,  0.4160,  ..., -0.5735, -0.9812, -0.6793],
         [ 0.6583, -0.0975,  0.2579,  ..., -0.7367, -0.8734,  0.1407],
         [ 0.7682,  0.2151,  0.1769,  ...,  1.0626, -0.2526, -0.3107],
         ...,
         [-0.0765,  0.4402, -0.5236,  ...,  0.8389, -0.3389, -0.8167],
         [ 0.3460,  1.8568, -0.0262,  ..., -0.0021, -0.0928,  0.0079],
         [ 0.8516,  1.1038, -1.1464,  ...,  0.4630, -0.4967,  0.2980]]),
 torch.Size([9, 1024]))