# One-Hot encoding

In [46]:
import numpy as np
# example1 = 'I  will eat the Pizza' # Sample set for our example
example1 = 'I ate an apple and played the piano'
vocab = []
counter = 0
                           
for considered_word in example1.lower().split():
    if considered_word not in vocab:
        vocab.append((considered_word, counter+1))
        counter = counter + 1

In [47]:
print(sorted(vocab))

[('an', 3), ('and', 5), ('apple', 4), ('ate', 2), ('i', 1), ('piano', 8), ('played', 6), ('the', 7)]


In [48]:
vocab = sorted(vocab)
list = []
for i in range(len(vocab)):
    list.append(vocab[i][1])
list

[3, 5, 4, 2, 1, 8, 6, 7]

In [49]:
indices_array = np.array(list)
indices_array

array([3, 5, 4, 2, 1, 8, 6, 7])

In [50]:
result = np.zeros((indices_array.size, indices_array.max() + 1))
result[np.arange(indices_array.size), indices_array] = 1
result

array([[0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0.]])

## Bag Of Words

In [1]:
####### Using CountVectorizer() #######

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

sentence1 = "Welcome to Great Learning , Now start learning"
sentence2 = "Learning is a good practice"

CountVec = CountVectorizer(stop_words='english')

BagOfWords = CountVec.fit_transform([sentence1,sentence2])
 
#create dataframe
BagOfWords_DF = pd.DataFrame(BagOfWords.toarray(),columns=CountVec.get_feature_names_out())
print(BagOfWords_DF)

   good  great  learning  practice  start  welcome
0     0      1         2         0      1        1
1     1      0         1         1      0        0


In [14]:
####### Using CountVectorizer() and gensim library #######
    ###    gensim is used to remove_stopwords    ###

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from gensim.parsing.preprocessing import remove_stopwords


text = ["Welcome to Great Learning, Now start learning","Learning is good practice"]

# Step - 1: Convert to lower case
text = [i.lower() for i in text]


# Step - 2: Stopword removal
filtered_sentence = [remove_stopwords(t) for t in text]
print(filtered_sentence)


# Step - 3: Sentence Scoring
vectorizer = CountVectorizer()

# Create sample set of documents
docs = np.array(filtered_sentence)

# Fit the bag-of-words model
bag = vectorizer.fit_transform(docs)

# Get unique words / tokens found in all the documents. The unique words / tokens represents the features
print(vectorizer.get_feature_names_out())

# Associate the indices with each unique word
print(vectorizer.vocabulary_)

# Print the numerical feature vector
print(bag.toarray())

['welcome great learning, start learning', 'learning good practice']
['good' 'great' 'learning' 'practice' 'start' 'welcome']
{'welcome': 5, 'great': 1, 'learning': 2, 'start': 4, 'good': 0, 'practice': 3}
[[0 1 2 0 1 1]
 [1 0 1 1 0 0]]


In [24]:
####### From Scratch (or) Without using CountVectorizer() #######

def vectorize(tokens):
    vector=[]
    for w in filtered_vocab:
        vector.append(tokens.count(w))
    return vector
def unique(sequence):
    seen = set()
    unique = []
    for x in sequence:
        if not x in seen:
            seen.add(x)
            unique.append(x)
    return unique

stopwords=["to","is","a"] # Stop-words given manually
special_char=[",",":"," ",";",".","?"] # list of special characters

sentence1 = "Welcome to Great Learning , Now start learning"
sentence2 = "Learning is a good practice"

#convert them to lower case
sentence1 = sentence1.lower()
sentence2 = sentence2.lower()

#split the sentences into tokens
tokens1 = sentence1.split()
tokens2 = sentence2.split()

print("Tokens for Sentence-1: ",tokens1)
print("Tokens for Sentence-2: ",tokens2)

#create a vocabulary list
vocab=unique(tokens1+tokens2)
print('Vocabulary(Unique Words): ',vocab)

#filter the vocabulary list
filtered_vocab=[]
for w in vocab: 
    if w not in stopwords and w not in special_char: 
        filtered_vocab.append(w)
        
print('After Stop-word removal: ', filtered_vocab)
#convert sentences into vectords
vector1=vectorize(tokens1)
vector2=vectorize(tokens2)
print("\nBag-of-Words: ")
print(vector1)
print(vector2)

Tokens for Sentence-1:  ['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'learning']
Tokens for Sentence-2:  ['learning', 'is', 'a', 'good', 'practice']
Vocabulary(Unique Words):  ['welcome', 'to', 'great', 'learning', ',', 'now', 'start', 'is', 'a', 'good', 'practice']
After Stop-word removal:  ['welcome', 'great', 'learning', 'now', 'start', 'good', 'practice']

Bag-of-Words: 
[1, 1, 2, 1, 1, 0, 0]
[0, 0, 1, 0, 0, 1, 1]


# TF-IDF

In [45]:
##### TF-IDF using TfidfVectorizer #####

from sklearn.feature_extraction.text import  CountVectorizer, TfidfVectorizer

docs = ['data science is one of the most important fields of science',
        'this is one of the best data science courses',
        'data scientists analyze data' ]

cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)
tf = pd.DataFrame(word_count_vector.toarray(), columns=cv.get_feature_names_out())
print('TF Scores:\n',tf)

vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(docs)
vocab = vectorizer.vocabulary_ # Obtain the vocabulary and feature names
idf = vectorizer.idf_ # Obtain the IDF scores

tfidf_scores = tfidf.toarray()
tfidf_scores = pd.DataFrame(tfidf_scores,columns=vectorizer.get_feature_names_out()) # Obtain the TF-IDF scores

print('\nVocabulary:\n', vocab)
print('\nIDF Scores: ', idf)
print('\nTF-IDF Scores: ', tfidf_scores)

TF Scores:
    analyze  best  courses  data  fields  important  is  most  of  one  \
0        0     0        0     1       1          1   1     1   2    1   
1        0     1        1     1       0          0   1     0   1    1   
2        1     0        0     2       0          0   0     0   0    0   

   science  scientists  the  this  
0        2           0    1     0  
1        1           0    1     1  
2        0           1    0     0  

Vocabulary:
 {'data': 3, 'science': 10, 'is': 6, 'one': 9, 'of': 8, 'the': 12, 'most': 7, 'important': 5, 'fields': 4, 'this': 13, 'best': 1, 'courses': 2, 'scientists': 11, 'analyze': 0}

IDF Scores:  [1.69314718 1.69314718 1.69314718 1.         1.69314718 1.69314718
 1.28768207 1.69314718 1.28768207 1.28768207 1.28768207 1.69314718
 1.28768207 1.69314718]

TF-IDF Scores:      analyze      best   courses      data    fields  important        is  \
0  0.000000  0.000000  0.000000  0.189526  0.320895   0.320895  0.244049   
1  0.000000  0.400294