In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
documents = ["Dog bites man.","Man bites dog.","Dog eats meat.","Man eats food"]

processed_docs = [doc.lower().replace(".","") for doc in documents]

processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [3]:
vocab = {}
count = 0
for doc in processed_docs:
    for word in doc.split():
        if word not in vocab:
            count = count + 1
            vocab[word] = count

vocab

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}

In [4]:
def get_onehot_vector(somestring):
    onehot_encoded = []
    for word in somestring.split():
        temp = [0]*len(vocab)
        if word in vocab:
            temp[vocab[word] - 1] = 1
        onehot_encoded.append(temp)
    return onehot_encoded

In [5]:
print(processed_docs[1])

get_onehot_vector(processed_docs[1])

man bites dog


[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]

In [6]:
get_onehot_vector("man and dog are good")

[[0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [7]:
s1 = "dog bites man"
s2 = "man bites dog"
s3 = "dog eats meat"
s4 = "man eats food"

from sklearn.preprocessing import LabelEncoder,OneHotEncoder

data = [s1.split(),s2.split(),s3.split(),s4.split()]
values = data[0] + data[1] + data[2] + data[3]
print("Data : ",values)

# Label Encoding
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print('\nLabel Encoded : ',integer_encoded)

# Onehot Encoding
onehot_encoder = OneHotEncoder()
onehot_encoder = onehot_encoder.fit_transform(data).toarray()
print('\nOne Hot Encoder Matrix : \n',onehot_encoder)

Data :  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']

Label Encoded :  [1 0 4 4 0 1 1 2 5 4 2 3]

One Hot Encoder Matrix : 
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


In [8]:
# Get the vocabulary from the LabelEncoder
vocabulary = label_encoder.classes_

# Define the unknown and absent columns
unknown_column = onehot_encoder[:, -2]
absent_column = onehot_encoder[:, -1]

# Display one-hot encoded values for each word in the vocabulary
for word, one_hot_encoded_value, unknown, absent in zip(vocabulary, onehot_encoder.T[:-2], unknown_column, absent_column):
    print(f"Word: {word}, One-Hot Encoded Value: {one_hot_encoded_value}, Unknown: {unknown}, Absent: {absent}")


Word: bites, One-Hot Encoded Value: [1. 0. 1. 0.], Unknown: 1.0, Absent: 0.0
Word: dog, One-Hot Encoded Value: [0. 1. 0. 1.], Unknown: 0.0, Absent: 0.0
Word: eats, One-Hot Encoded Value: [1. 1. 0. 0.], Unknown: 0.0, Absent: 1.0
Word: food, One-Hot Encoded Value: [0. 0. 1. 1.], Unknown: 0.0, Absent: 0.0
