# Feature Transformation

In [2]:
# Toy corpus used by every section of this notebook.
documents = [
    "Dog bites man.",
    "Man bites dog",
    "Dog eats meat.",
    "Man eats food.",
]
# Normalise each sentence: lowercase it and drop the full stops.
processed_docs = [
    sentence.lower().replace(".", "")
    for sentence in documents
]
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [3]:
# Assign each distinct word a 1-based id, in order of first appearance.
vocab = {}
count = 0
for doc in processed_docs:
    for word in doc.split():
        if word in vocab:
            continue  # already indexed, keep its first id
        count += 1
        vocab[word] = count

print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


## One Hot Encoding

In [4]:
def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of ``somestring``.

    Parameters
    ----------
    somestring : str
        Text to encode; it is split on whitespace.
    vocabulary : dict, optional
        Mapping of word -> 1-based index. When omitted, the notebook-level
        ``vocab`` dictionary is used, which keeps existing calls working.

    Returns
    -------
    list of list of int
        One vector of length ``len(vocabulary)`` per word. A word that is
        not in the vocabulary yields an all-zero vector. An empty string
        yields an empty list.
    """
    if vocabulary is None:
        vocabulary = vocab  # fall back to the vocabulary built earlier in the notebook
    size = len(vocabulary)  # hoisted: identical for every word
    onehot_encoded = []
    for word in somestring.split():
        vector = [0] * size
        index = vocabulary.get(word)
        if index is not None:
            # Vocabulary ids start at 1 while list positions start at 0.
            vector[index - 1] = 1
        onehot_encoded.append(vector)
    return onehot_encoded

In [5]:
# Show the second processed sentence, then its one-hot encoding
# (one vector per word, in sentence order).
print(processed_docs[1])
get_onehot_vector(processed_docs[1])

man bites dog


[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]

In [6]:
# "and", "are" and "good" are not in vocab, so their vectors are all zeros.
get_onehot_vector("man and dog are good")

[[0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [1, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0],
 [0, 0, 0, 0, 0, 0]]

In [10]:
# Tokenised corpus: one list of words per sentence.
data = [sentence.split() for sentence in processed_docs]
print(data)

# The same tokens flattened into a single list, in corpus order.
values = [token for tokens in data for token in tokens]
print(values)

[['dog', 'bites', 'man'], ['man', 'bites', 'dog'], ['dog', 'eats', 'meat'], ['man', 'eats', 'food']]
['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']


In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# LabelEncoder maps every token of the flat `values` list to an integer id;
# in the recorded output the ids follow the alphabetical order of the
# distinct words (bites=0, dog=1, eats=2, food=3, man=4, meat=5).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded : ", integer_encoded)

# NOTE(review): `data` is a 4x3 table of tokens (one row per sentence) and the
# recorded result has 8 columns (2 + 2 + 4 distinct words per word position),
# so each word position appears to be encoded as its own categorical feature.
# This is not a per-word one-hot over the 6-word vocabulary - confirm that
# this is the intended demonstration.
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n", one_hot_encoded)

Label Encoded :  [1 0 4 4 0 1 1 2 5 4 2 3]
Onehot Encoded Matrix:
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


## Bag of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

print("Data :", processed_docs)

# Fit a bag-of-words model on the corpus and vectorise it in one step.
count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(processed_docs)

# vocabulary_ maps each word to its column index in the BoW matrix.
print("Vocabulary :", count_vect.vocabulary_)

# The first two sentences contain the same words in a different order,
# so their count vectors come out identical.
print("BoW for : " + processed_docs[0] + " : ", bow_rep[0].toarray())
print("BoW for : " + processed_docs[1] + " : ", bow_rep[1].toarray())

# Represent a new text with the already-fitted vocabulary. In the recorded
# output only "dog" (seen twice) is counted; the words that were not in the
# training corpus contribute nothing.
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Data : ['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']
Vocabulary : {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW for : dog bites man :  [[1 1 0 0 1 0]]
BoW for : man bites dog :  [[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


## Bag of N-Grams

In [27]:
# ngram_range=(2,2) restricts the features to bigrams (two-word sequences).
# Note: this rebinds count_vect and bow_rep, replacing the unigram model
# fitted in the Bag of Words section.
count_vect = CountVectorizer(ngram_range=(2,2))

bow_rep = count_vect.fit_transform(processed_docs)

# vocabulary_ now maps each bigram to its column index.
print("Vocabulary :", count_vect.vocabulary_)

Vocabulary : {'dog bites': 2, 'bites man': 1, 'man bites': 6, 'bites dog': 0, 'dog eats': 3, 'eats meat': 5, 'man eats': 7, 'eats food': 4}


In [28]:
# With bigram features the first two sentences no longer share a vector:
# they contain the same words but different word pairs.
print('BoW representation for : ' + processed_docs[0] + " : ", bow_rep[0].toarray())
print('BoW representation for : ' + processed_docs[1] + " : ", bow_rep[1].toarray())

BoW representation for : dog bites man :  [[0 1 1 0 0 0 0 0]]
BoW representation for : man bites dog :  [[1 0 0 0 0 0 1 0]]


In [29]:
# Display the bigram -> column index mapping of the fitted vectorizer.
count_vect.vocabulary_

{'dog bites': 2,
 'bites man': 1,
 'man bites': 6,
 'bites dog': 0,
 'dog eats': 3,
 'eats meat': 5,
 'man eats': 7,
 'eats food': 4}

## TF-IDF

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the corpus; bow_rep_tfidf holds one weighted row per document.
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# idf_ holds one weight per vocabulary word. In the recorded output, words
# that occur in fewer documents (e.g. "food", "meat") get the largest values.
print("IDF for all words in the vocabulary", tfidf.idf_)

IDF for all words in the vocabulary [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]


In [34]:
# List the vocabulary words of the fitted TF-IDF model.
print("All words in the vocabulary", tfidf.get_feature_names_out())

All words in the vocabulary ['bites' 'dog' 'eats' 'food' 'man' 'meat']


In [35]:
# Dense view of the TF-IDF matrix: one row per document, one column per word.
print("TFIDF representation for all documents in the corpus\n", bow_rep_tfidf.toarray())

TFIDF representation for all documents in the corpus
 [[0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.         0.44809973 0.55349232 0.         0.         0.70203482]
 [0.         0.         0.55349232 0.70203482 0.44809973 0.        ]]


In [36]:
# Vectorise a new text with the fitted model. In the recorded output only
# "dog" and "man" get a weight; the words that were not in the training
# corpus are zero. The two equal weights of ~0.707 are consistent with
# unit-length (L2) row normalisation - see the TfidfVectorizer docs.
temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]
