<a href="https://colab.research.google.com/github/iamatul1214/NLP/blob/main/NLP_techniques_for_word_embedding_encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
## Sentences
D1 = "Dog bites man"
D2 = "Man bites dog"
D3 = "Dog eats meat"
D4 = "Man eats food"

In [3]:
preprocessed_docs = []
preprocessed_docs.append(D1)
preprocessed_docs.append(D2)
preprocessed_docs.append(D3)
preprocessed_docs.append(D4)                                                 
preprocessed_docs

['Dog bites man', 'Man bites dog', 'Dog eats meat', 'Man eats food']

In [13]:
## making all the words in lowercase
for i in range(len(preprocessed_docs)):
  preprocessed_docs[i] = preprocessed_docs[i].lower()
preprocessed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [6]:
for words in D1.split():
  print(words)

Dog
bites
man


In [7]:
words = [word for sentence in preprocessed_docs for word in sentence.split()]
vocab = list(set(words))
vocab

['man', 'Dog', 'Man', 'dog', 'eats', 'food', 'meat', 'bites']

In [8]:
## First Let's use one hot encoding for this
def get_onehot_vector(somestring):
  onehot_encoded = []
  for word in somestring.split():
    temp = [0]*len(vocab)
    if word in vocab:

      temp[vocab[word]-1] = 1
      onehot_encoded.append(temp)

  return onehot_encoded
get_onehot_vector(preprocessed_docs[1])

TypeError: ignored

## Bag of words (BoW)

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect

CountVectorizer()

In [15]:
bow_rep = count_vect.fit_transform(preprocessed_docs)
bow_rep

<4x6 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [16]:
print("Our vocabulary: ", count_vect.vocabulary_)

Our vocabulary:  {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}


In [21]:
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog: ",bow_rep[1].toarray())

BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog:  [[1 1 0 0 1 0]]


In [25]:
#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':",temp.toarray())  ## Here dog is present twice that's why the count has increased to 2 for dog

Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [24]:
temp = count_vect.transform(["I love dog"])
print("Bow representation for 'dog and dog are friends':",temp.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


In [26]:
def check_representation_from_above_vocab(list_of_text):
  temp = count_vect.transform(list_of_text)
  print(f"Representation of the {list_of_text} is as --> {temp.toarray()}")
check_representation_from_above_vocab(['A dog can undestand a man and another dog who loves to eat meat and food and meat'])

Representation of the ['A dog can undestand a man and another dog who loves to eat meat and food and meat'] is as --> [[0 2 0 1 1 2]]


In [27]:
check_representation_from_above_vocab(['Man is enemny','Dog is friend','I love animals'])

Representation of the ['Man is enemny', 'Dog is friend', 'I love animals'] is as --> [[0 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 0]]


### Bag of n-grams

In [29]:
#n-gram vectorization example with count vectorizer and uni, bi,trigrams
count_vect = CountVectorizer(ngram_range=(1,3))
#Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(preprocessed_docs)  ## fit transform will create the vocabulary and transform will just convert 
#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)
#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':",
temp.toarray())

Our vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}
Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


## Term frequency Inverse document frequency (TF-IDF)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer()
tfid

TfidfVectorizer()

In [39]:
bow_tfid = tfid.fit_transform(preprocessed_docs)
print(f"inverse document frequencies of our vocab = {tfid.idf_}")  ## printing inverse document frequency for all the words
print(f"words of our vocab = {tfid.get_feature_names_out()}")

inverse document frequencies of our vocab = [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
words of our vocab = ['bites' 'dog' 'eats' 'food' 'man' 'meat']


In [40]:
temp = tfid.transform(['Dog and man are friends'])
print(temp.toarray())


[[0.         0.70710678 0.         0.         0.70710678 0.        ]]


In [43]:
def word_embedding_using_tfidf(list_of_texts):
  temp = tfid.transform(list_of_texts)
  print(f"Representation of the {list_of_texts} --> {temp.toarray()}")
word_embedding_using_tfidf(['Hello my sweet dog','How are you','Man and dog are friends'])

Representation of the ['Hello my sweet dog', 'How are you', 'Man and dog are friends'] --> [[0.         1.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.        ]
 [0.         0.70710678 0.         0.         0.70710678 0.        ]]
