## **One Hot Encoding**

In [5]:
def get_one_hot_encoding(text):
    words = text.split()
    vocab = set(words)
    encodings = []
    vocab_to_index = {word: i for i, word in enumerate(vocab)}
    for word in words:
        one_hot_encoding = [0] * len(vocab)
        one_hot_encoding[vocab_to_index[word]] = 1
        encodings.append(one_hot_encoding)
    return encodings

In [7]:
example_text = "cat in the hat dog on the mat bird in the tree"
one_hot_encodings = get_one_hot_encoding(example_text)

for word, encoding in zip(example_text.split(), one_hot_encodings):
    print(f"{word}: {encoding}")

cat: [0, 0, 1, 0, 0, 0, 0, 0, 0]
in: [0, 0, 0, 0, 1, 0, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
hat: [0, 0, 0, 0, 0, 0, 0, 1, 0]
dog: [0, 1, 0, 0, 0, 0, 0, 0, 0]
on: [0, 0, 0, 1, 0, 0, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
mat: [0, 0, 0, 0, 0, 1, 0, 0, 0]
bird: [0, 0, 0, 0, 0, 0, 0, 0, 1]
in: [0, 0, 0, 0, 1, 0, 0, 0, 0]
the: [1, 0, 0, 0, 0, 0, 0, 0, 0]
tree: [0, 0, 0, 0, 0, 0, 1, 0, 0]


## **Bag of Word (Bow)**

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

documents = ["This is the first document.",
              "This document is the second document.",
              "And this is the third one.",
              "Is this the first document?"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

print("Bag-of-Words Matrix:")
print(X.toarray())
print("Vocabulary (Feature Names):", feature_names)

Bag-of-Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]
Vocabulary (Feature Names): ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


## **TF-IDF**

In [1]:
   
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "The quick brown fox jumps over the lazy dog.",
    "A journey of a thousand miles begins with a single step.",
]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

tfidf_values = {}

# for doc_index, doc in enumerate(documents):
    

In [2]:
print(tfidf_matrix)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16 stored elements and shape (2, 16)>
  Coords	Values
  (0, 13)	0.6030226891555273
  (0, 10)	0.30151134457776363
  (0, 1)	0.30151134457776363
  (0, 3)	0.30151134457776363
  (0, 5)	0.30151134457776363
  (0, 9)	0.30151134457776363
  (0, 6)	0.30151134457776363
  (0, 2)	0.30151134457776363
  (1, 4)	0.3535533905932738
  (1, 8)	0.3535533905932738
  (1, 14)	0.3535533905932738
  (1, 7)	0.3535533905932738
  (1, 0)	0.3535533905932738
  (1, 15)	0.3535533905932738
  (1, 11)	0.3535533905932738
  (1, 12)	0.3535533905932738


In [5]:
for doc_index, doc in enumerate(documents):
    feature_index = tfidf_matrix[doc_index, :].nonzero()[1]
    tfidf_doc_values = zip(feature_index, [tfidf_matrix[doc_index, x] for x in feature_index])
    tfidf_values[doc_index] = {feature_names[i]: value for i, value in tfidf_doc_values}


for doc_index, values in tfidf_values.items():
    print(f"Document {doc_index + 1}:")
    for word, tfidf_value in values.items():
        print(f"{word}: {tfidf_value}")
    print("\n")

Document 1:
the: 0.6030226891555273
quick: 0.30151134457776363
brown: 0.30151134457776363
fox: 0.30151134457776363
jumps: 0.30151134457776363
over: 0.30151134457776363
lazy: 0.30151134457776363
dog: 0.30151134457776363


Document 2:
journey: 0.3535533905932738
of: 0.3535533905932738
thousand: 0.3535533905932738
miles: 0.3535533905932738
begins: 0.3535533905932738
with: 0.3535533905932738
single: 0.3535533905932738
step: 0.3535533905932738


