In [1]:
# Toy corpus shared by the encoding examples in this notebook: five short,
# lower-case documents whose words are separated by single spaces.
document_corpus = [
    "this is good phone phone",
    "this is bad mobile mobile",
    "she is good good cat",
    "he has bad temper temper",
    "this mobile phone phone is not good good",
]

In [2]:
# Vocabulary of the corpus: every distinct space-separated word, sorted
# alphabetically so that each word has a stable position.
# A set comprehension de-duplicates on its own (no explicit membership check
# is needed), and `data_corpus` is bound exactly once, to the sorted list,
# instead of being a set first and a list afterwards.
data_corpus = sorted({word for row in document_corpus for word in row.split(" ")})

print(data_corpus)

['bad', 'cat', 'good', 'has', 'he', 'is', 'mobile', 'not', 'phone', 'she', 'temper', 'this']


# Index Based Encoding

In [3]:
res = len(max(document_corpus, key = len).split(" "))
print(res)

8


In [4]:
index_based_encoding=[]
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split)-1:
            row_encoding.append(data_corpus.index(split[i])+1)
        else:
            row_encoding.append(0)
    index_based_encoding.append(row_encoding)

print(index_based_encoding)

[[12, 6, 3, 9, 9, 0, 0, 0], [12, 6, 1, 7, 7, 0, 0, 0], [10, 6, 3, 3, 2, 0, 0, 0], [5, 4, 1, 11, 11, 0, 0, 0], [12, 7, 9, 9, 6, 8, 3, 3]]


# Bag Of Words (BoW)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag of Words with scikit-learn: learn the vocabulary of the corpus and build
# the document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document_corpus)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` is used to print a plain Python list of strings as before.
print(vectorizer.get_feature_names_out().tolist())

['bad', 'cat', 'good', 'has', 'he', 'is', 'mobile', 'not', 'phone', 'she', 'temper', 'this']




In [6]:
# Convert the matrix produced by `fit_transform` into a dense array so the
# whole table can be printed at once.
bow_counts = X.toarray()
print(bow_counts)

[[0 0 1 0 0 1 0 0 2 0 0 1]
 [1 0 0 0 0 1 2 0 0 0 0 1]
 [0 1 2 0 0 1 0 0 0 1 0 0]
 [1 0 0 1 1 0 0 0 0 0 2 0]
 [0 0 2 0 0 1 1 1 2 0 0 1]]


### Binary BoW

In [7]:
# Binary bag of words: for every document, one 0/1 flag per vocabulary word
# (in `data_corpus` order) saying whether the word occurs in the document.
one_hot_encoding = []
for document in document_corpus:
    tokens = document.split(" ")
    presence_flags = [int(term in tokens) for term in data_corpus]
    one_hot_encoding.append(presence_flags)

print(one_hot_encoding)

[[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1]]


### BoW

In [8]:
# Count-based bag of words: for every document, the number of occurrences of
# each vocabulary word (in `data_corpus` order).
# NOTE: the result is stored under the existing name `one_hot_encoding`, which
# this cell has always reused; the values are counts, not 0/1 flags.
one_hot_encoding = []
for row in document_corpus:
    tokens = row.split(" ")
    # `list.count` already yields 0 for absent words, so no separate
    # "word present / word absent" branch is needed.
    one_hot_encoding.append([tokens.count(word) for word in data_corpus])

print(one_hot_encoding)

[[0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1], [1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1], [0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0], [0, 0, 2, 0, 0, 1, 1, 1, 2, 0, 0, 1]]


# TF-IDF Encoding

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF with scikit-learn: learn the vocabulary and compute the TF-IDF
# weighted document-term matrix in one step.
# Note that `vectorizer` and `X` are rebound here, replacing the
# CountVectorizer objects created earlier in the notebook.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(document_corpus)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` is used to print a plain Python list of strings as before.
print(vectorizer.get_feature_names_out().tolist())

['bad', 'cat', 'good', 'has', 'he', 'is', 'mobile', 'not', 'phone', 'she', 'temper', 'this']


In [10]:
# Convert the TF-IDF matrix produced by `fit_transform` into a dense array so
# the whole table can be printed at once.
tfidf_weights = X.toarray()
print(tfidf_weights)

[[0.         0.         0.34273991 0.         0.         0.28832362
  0.         0.         0.82578944 0.         0.         0.34273991]
 [0.4023674  0.         0.         0.         0.         0.28097242
  0.80473481 0.         0.         0.         0.         0.33400129]
 [0.         0.49317635 0.6605719  0.         0.         0.27784695
  0.         0.         0.         0.49317635 0.         0.        ]
 [0.31283963 0.         0.         0.38775666 0.38775666 0.
  0.         0.         0.         0.         0.77551332 0.        ]
 [0.         0.         0.51309679 0.         0.         0.2158166
  0.30906082 0.38307292 0.61812163 0.         0.         0.2565484 ]]


# Word2Vec

In [11]:
import gensim.downloader as api
# Identifier of the pretrained vectors requested from gensim's downloader.
WORD2VEC_MODEL_NAME = 'word2vec-google-news-300'
# Loading may take a while; every Word2Vec cell below uses `wv`.
wv = api.load(WORD2VEC_MODEL_NAME)



In [12]:
# Show the first ten vocabulary entries together with the vocabulary size.
vocab_size = len(wv.index_to_key)
for position, token in enumerate(wv.index_to_key[:10]):
    print(f"word #{position}/{vocab_size} is {token}")

word #0/3000000 is </s>
word #1/3000000 is in
word #2/3000000 is for
word #3/3000000 is that
word #4/3000000 is is
word #5/3000000 is on
word #6/3000000 is ##
word #7/3000000 is The
word #8/3000000 is with
word #9/3000000 is said


In [18]:
# Look up the embedding stored for the word 'king'.
vec_king = wv['king']
# Bare expression: the notebook displays the vector as the cell's output.
vec_king

array([ 1.25976562e-01,  2.97851562e-02,  8.60595703e-03,  1.39648438e-01,
       -2.56347656e-02, -3.61328125e-02,  1.11816406e-01, -1.98242188e-01,
        5.12695312e-02,  3.63281250e-01, -2.42187500e-01, -3.02734375e-01,
       -1.77734375e-01, -2.49023438e-02, -1.67968750e-01, -1.69921875e-01,
        3.46679688e-02,  5.21850586e-03,  4.63867188e-02,  1.28906250e-01,
        1.36718750e-01,  1.12792969e-01,  5.95703125e-02,  1.36718750e-01,
        1.01074219e-01, -1.76757812e-01, -2.51953125e-01,  5.98144531e-02,
        3.41796875e-01, -3.11279297e-02,  1.04492188e-01,  6.17675781e-02,
        1.24511719e-01,  4.00390625e-01, -3.22265625e-01,  8.39843750e-02,
        3.90625000e-02,  5.85937500e-03,  7.03125000e-02,  1.72851562e-01,
        1.38671875e-01, -2.31445312e-01,  2.83203125e-01,  1.42578125e-01,
        3.41796875e-01, -2.39257812e-02, -1.09863281e-01,  3.32031250e-02,
       -5.46875000e-02,  1.53198242e-02, -1.62109375e-01,  1.58203125e-01,
       -2.59765625e-01,  

Unfortunately, the model is unable to infer vectors for unfamiliar words. This is one limitation of Word2Vec: if this limitation matters to you, check out the FastText model.

In [14]:
# Demonstrate the out-of-vocabulary case: indexing `wv` with a word it does
# not contain raises KeyError, which is caught and reported here rather than
# aborting the notebook. `vec_cameroon` is only bound if the lookup succeeds.
try:
    vec_cameroon = wv['cameroon']
except KeyError:
    print("The word 'cameroon' does not appear in this model")

The word 'cameroon' does not appear in this model


Word2Vec supports several word similarity tasks out of the box. You can see how the similarity intuitively decreases as the words get less and less similar.

In [15]:
# Word pairs ordered from closely related to unrelated; the similarity of each
# pair is printed with two decimals.
pairs = [
    ('car', 'minivan'),   # a minivan is a kind of car
    ('car', 'bicycle'),   # still a wheeled vehicle
    ('car', 'airplane'),  # ok, no wheels, but still a vehicle
    ('car', 'cereal'),    # ... and so on
    ('car', 'communism'),
]
for first_word, second_word in pairs:
    score = wv.similarity(first_word, second_word)
    print('%r\t%r\t%.2f' % (first_word, second_word, score))

'car'	'minivan'	0.69
'car'	'bicycle'	0.54
'car'	'airplane'	0.42
'car'	'cereal'	0.14
'car'	'communism'	0.06


Print the 5 words most similar to “car” and “minivan” taken together (both are passed as positive examples).

In [16]:
# Both words are passed as positive examples, so the five neighbours returned
# are ranked against 'car' and 'minivan' together.
nearest_neighbours = wv.most_similar(positive=['car', 'minivan'], topn=5)
print(nearest_neighbours)

[('SUV', 0.8532192707061768), ('vehicle', 0.8175783753395081), ('pickup_truck', 0.7763688564300537), ('Jeep', 0.7567334175109863), ('Ford_Explorer', 0.7565720081329346)]


In [17]:
# Which of the below does not belong in the sequence?
candidate_words = ['fire', 'water', 'land', 'sea', 'air', 'car']
odd_one_out = wv.doesnt_match(candidate_words)
print(odd_one_out)

car


# BERT Encoding

In [19]:
from transformers import BertTokenizer, BertModel
import torch

In [20]:
# Load pre-trained BERT model and tokenizer
# Both are looked up by the same checkpoint name so the tokenizer's vocabulary
# matches the model's; `tokenizer` and `model` are used by the encoding cell
# further down.
model_name = "bert-base-uncased"  # You can choose other variations of BERT as well
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Example sentences to embed with BERT; edit this list to encode your own text.
sentences = [
    "This is an example sentence.",
    "BERT is a powerful NLP model.",
    "Hugging Face Transformers makes BERT easy to use.",
]

In [22]:
# Tokenize and encode the sentences: each sentence is turned into one
# fixed-size vector by averaging BERT's last-layer token embeddings.
encoded_sentences = []
for sentence in sentences:
    # Calling the tokenizer adds the [CLS] and [SEP] tokens and returns
    # PyTorch tensors that already carry the batch dimension (batch size 1),
    # together with the matching attention mask. No padding is requested:
    # with a single sentence per call there is nothing to pad against, so the
    # former `padding=True` had no effect.
    inputs = tokenizer(sentence, add_special_tokens=True, truncation=True, return_tensors="pt")

    # Inference only: disable gradient tracking for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average pooling over the token axis (special tokens included), then drop
    # the batch dimension to get a 1-D vector for the sentence.
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0)

    # Store the embedding as a NumPy array.
    encoded_sentences.append(embeddings.numpy())

Now, `encoded_sentences` contains the BERT embedding for each sentence.
Each element of `encoded_sentences` is the embedding vector of the sentence at the same position in `sentences`.

In [23]:
# Print every sentence followed by its BERT embedding, with blank lines
# between consecutive sentences.
for number, sentence in enumerate(sentences, start=1):
    print(f"Sentence {number}:", sentence, "BERT Encoding:", sep="\n")
    print(encoded_sentences[number - 1])
    print("\n")


Sentence 1:
This is an example sentence.
BERT Encoding:
[-0.16241552 -0.4156867  -0.20488483 -0.58326024 -0.099721   -0.07208912
  0.7507061   0.87691665 -0.2966925   0.16727442 -0.00564526 -0.392152
 -0.12128234  0.32278216  0.0618793   0.32433608  0.13928136 -0.04975371
 -0.11712323 -0.05851278  0.44472548  0.49561632 -0.48746273  0.21142277
  1.0534678  -0.3005585  -0.065255   -0.4304681  -0.72952765 -0.26992595
  0.40153262  0.06429507 -0.32018387 -0.19431618  0.05775268 -0.60935026
  0.04376978 -0.338396   -0.05984513  0.44613218 -0.82230616 -0.04870924
  0.4103936   0.22968586  0.20509961 -0.11186334 -0.05072625 -0.18931001
 -0.08071747  0.10156359 -1.2761916   0.24014696  0.6649862   0.4168171
 -0.5053531   0.54594386 -0.13017234 -0.8435298  -0.3072373  -0.06074417
  0.5697198   0.2381053   0.31882563 -0.7964434   0.3805688   0.3616553
  0.24195379  0.4273143  -1.2312185   0.22904296 -0.5101602  -0.74025613
  0.07674894  0.23053274 -0.32410032  0.4414815  -0.30515683  0.43619856