In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy

text_data = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

def one_hot_encoding(text_data):
    unique_words = set(" ".join(text_data).split())
    encoded_data = []
    for text in text_data:
        encoded_text = [1 if word in text else 0 for word in unique_words]
        encoded_data.append(encoded_text)
    return np.array(encoded_data)

one_hot_encoded = one_hot_encoding(text_data)
print("One Hot Encoding:")
print(one_hot_encoded)

vectorizer = CountVectorizer()
bow_features = vectorizer.fit_transform(text_data)
print("\nBag of Words (BOW):")
print(bow_features.toarray())

ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_features = ngram_vectorizer.fit_transform(text_data)
print("\nn-grams:")
print(ngram_features.toarray())

tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(text_data)
print("\nTf-Idf:")
print(tfidf_features.toarray())

custom_features = np.array([[len(doc)] for doc in text_data])
print("\nCustom Features:")
print(custom_features)

# Load spaCy small English model
import spacy
nlp = spacy.load('en_core_web_sm')

# Use SpaCy word vectors (averaged)
word2vec_features = []
for doc in text_data:
    doc_vector = nlp(doc).vector
    word2vec_features.append(doc_vector)

word2vec_features = np.array(word2vec_features)
print("\nWord Embedding Features (using spaCy):")
print(word2vec_features)


One Hot Encoding:
[[1 0 0 0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 1 1 1 1 0 0 1 0]
 [0 1 1 0 0 0 0 0 1 1 0 1 1]
 [1 0 0 1 1 0 0 0 1 0 1 1 1]]

Bag of Words (BOW):
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

n-grams:
[[0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]
 [0 0 2 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]
 [0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]

Tf-Idf:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Custom Features:
[[27]
 [37]
 [26]
 [27]]

Word Embedding Features (using spaCy):
[[ 4.11296897e-02 -1.50990739e-01  3.46765928e-02  7.88882375e-02
  -2.0491