# Feature Transformation

In [None]:
def _normalize(text):
    """Lower-case *text* and remove every period from it."""
    return text.lower().replace(".", "")


# Toy corpus: four short sentences.
documents = [
    "Dog bites man.",
    "Man bites dog",
    "Dog eats meat.",
    "Man eats food.",
]
processed_docs = [_normalize(raw_doc) for raw_doc in documents]
processed_docs

In [None]:
# Give each distinct word a 1-based index, in order of first appearance.
vocab = {}
for doc in processed_docs:
    for word in doc.split():
        # setdefault leaves words that already have an index untouched
        vocab.setdefault(word, len(vocab) + 1)
count = len(vocab)  # number of distinct words indexed

print(vocab)

## One Hot Encoding

In [None]:
def get_onehot_vector(somestring, vocabulary=None):
    """Return one one-hot vector per whitespace-separated word of *somestring*.

    Args:
        somestring: Text to encode; it is split on whitespace.
        vocabulary: Mapping from word to its 1-based index. When omitted,
            the module-level ``vocab`` is used.

    Returns:
        A list with one entry per word. Each entry is a list of
        ``len(vocabulary)`` integers that is all zeros except for a 1 at
        position ``index - 1``. A word missing from the vocabulary maps to
        an all-zero vector.
    """
    if vocabulary is None:
        vocabulary = vocab
    size = len(vocabulary)  # invariant across words, so computed once
    onehot_encoded = []
    for word in somestring.split():
        row = [0] * size
        position = vocabulary.get(word)
        if position is not None:
            row[position - 1] = 1  # indices are 1-based, list slots 0-based
        onehot_encoded.append(row)
    return onehot_encoded

In [None]:
# Print the second document, then encode it word by word.
print(processed_docs[1])
get_onehot_vector(processed_docs[1])

In [None]:
# "and", "are" and "good" never occur in the corpus, so their vectors stay all zeros.
get_onehot_vector("man and dog are good")

In [None]:
# One token list per document.
data = [sentence.split() for sentence in processed_docs]

print(data)

# Every token of the corpus in a single flat list, in document order.
values = [token for tokens in data for token in tokens]

print(values)

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Fit a label encoder on the flat token list and get one integer per token.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)
print("Label Encoded : ", integer_encoded)

# NOTE(review): `data` is passed as a 2-D table (one row per document), so
# OneHotEncoder presumably encodes each token position as its own feature;
# confirm against the scikit-learn docs. toarray() densifies the result
# for printing.
one_hot_encoder = OneHotEncoder()
one_hot_encoded = one_hot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n", one_hot_encoded)

## Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

print("Data :", processed_docs)

# Fit the vectorizer on the corpus and build the document-term matrix.
count_vect = CountVectorizer()
bow_rep = count_vect.fit_transform(processed_docs)

# vocabulary_ is the term -> column mapping learned during fitting.
print("Vocabulary :", count_vect.vocabulary_)

# Rows of bow_rep correspond to documents; show the first two as dense arrays.
print("BoW for : " + processed_docs[0] + " : ", bow_rep[0].toarray())
print("BoW for : " + processed_docs[1] + " : ", bow_rep[1].toarray())

# Encode a new text with the already-fitted vocabulary (transform, not fit_transform).
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

## Bag of N-Grams

In [None]:
# ngram_range=(2, 2) sets both the minimum and maximum n-gram length to 2,
# i.e. bigrams only. This rebinds count_vect and bow_rep from the unigram example.
count_vect = CountVectorizer(ngram_range=(2,2))

bow_rep = count_vect.fit_transform(processed_docs)

print("Vocabulary :", count_vect.vocabulary_)

In [None]:
# Show the first two rows of the current bow_rep as dense arrays.
print('BoW representation for : ' + processed_docs[0] + " : ", bow_rep[0].toarray())
print('BoW representation for : ' + processed_docs[1] + " : ", bow_rep[1].toarray())

In [None]:
# Inspect the vocabulary mapping of the current count_vect.
count_vect.vocabulary_

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the corpus and build the weighted document-term matrix.
tfidf = TfidfVectorizer()
bow_rep_tfidf = tfidf.fit_transform(processed_docs)

# idf_ holds the learned IDF weights.
print("IDF for all words in the vocabulary", tfidf.idf_)

In [None]:
# List the terms the fitted vectorizer learned.
print("All words in the vocabulary", tfidf.get_feature_names_out())

In [None]:
# Re-display the corpus for comparison with the matrix printed next.
processed_docs

In [None]:
# Densify the TF-IDF matrix so every document row prints in full.
print("TFIDF representation for all documents in the corpus\n", bow_rep_tfidf.toarray())

In [None]:
# Encode unseen text with the already-fitted vectorizer (transform, not fit_transform).
temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())