In [None]:
# Toy corpus used throughout the notebook: five short, lower-case,
# space-separated documents.
document_corpus = [
    "this is good phone phone",
    "this is bad mobile mobile",
    "she is good good cat",
    "he has bad temper temper",
    "this mobile phone phone is not good good",
]

In [None]:
# Vocabulary: every distinct space-separated word found in the corpus,
# sorted alphabetically so each word gets a stable position in the list.
data_corpus = sorted(
    {token for document in document_corpus for token in document.split(" ")}
)

print(data_corpus)

['bad', 'cat', 'good', 'has', 'he', 'is', 'mobile', 'not', 'phone', 'she', 'temper', 'this']


## Index-Based Encoding

In [None]:
res = len(max(document_corpus, key = len).split(" "))
print(res)

8


In [None]:
index_based_encoding=[]
for row in document_corpus:
    row_encoding = []
    split = row.split(" ")
    for i in range(res):
        if i <= len(split)-1:
            row_encoding.append(data_corpus.index(split[i])+1)
        else:
            row_encoding.append(0)
    index_based_encoding.append(row_encoding)

print(index_based_encoding)

[[12, 6, 3, 9, 9, 0, 0, 0], [12, 6, 1, 7, 7, 0, 0, 0], [10, 6, 3, 3, 2, 0, 0, 0], [5, 4, 1, 11, 11, 0, 0, 0], [12, 7, 9, 9, 6, 8, 3, 3]]


## Bag of Words (BoW)

### 1. Binary BoW

In [None]:
# Binary Bag of Words: one row per document and one column per vocabulary
# word; a cell is 1 when the word occurs in the document and 0 otherwise.
one_hot_encoding = []
for document in document_corpus:
    tokens = document.split(" ")
    presence_flags = [1 if vocab_word in tokens else 0 for vocab_word in data_corpus]
    one_hot_encoding.append(presence_flags)

print(one_hot_encoding)

[[0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], [1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1], [0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0], [0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1]]


### 2. BoW

In [None]:
# Count-based Bag of Words: one row per document and one column per
# vocabulary word; each cell holds how many times the word occurs in the
# document.  list.count() already returns 0 for a word that is absent, so no
# membership test is needed before appending.
# NOTE: the result is still stored in `one_hot_encoding` (the name is kept
# unchanged), although the values are counts rather than 0/1 flags.
one_hot_encoding = []
for row in document_corpus:
    split = row.split(" ")
    one_hot_encoding.append([split.count(word) for word in data_corpus])

print(one_hot_encoding)

[[0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 0, 1], [1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 1], [0, 1, 2, 0, 0, 1, 0, 0, 0, 1, 0, 0], [1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 2, 0], [0, 0, 2, 0, 0, 1, 1, 1, 2, 0, 0, 1]]


## TF-IDF Encoding

In [None]:
# Raw term counts per document: tf_dict[doc_index] maps each word of that
# document to the number of times it occurs there.  Words are inserted in
# order of first appearance.
tf_dict = {}
for doc_index, document in enumerate(document_corpus):
    term_counts = {}
    for token in document.split(" "):
        term_counts[token] = term_counts.get(token, 0) + 1
    tf_dict[doc_index] = term_counts

print(tf_dict)

{0: {'this': 1, 'is': 1, 'good': 1, 'phone': 2}, 1: {'this': 1, 'is': 1, 'bad': 1, 'mobile': 2}, 2: {'she': 1, 'is': 1, 'good': 2, 'cat': 1}, 3: {'he': 1, 'has': 1, 'bad': 1, 'temper': 2}, 4: {'this': 1, 'mobile': 1, 'phone': 2, 'is': 1, 'not': 1, 'good': 2}}


In [None]:
import math


def calculate_tf(word, sentence_num):
    """Return the term frequency of ``word`` in one document.

    The frequency is the word's count in the document divided by the total
    number of words in that document, both read from the global ``tf_dict``.

    Args:
        word: The term to look up.
        sentence_num: Index of the document in ``tf_dict`` (anything
            accepted by ``int()``).

    Returns:
        The relative frequency as a float; ``0.0`` when the word does not
        occur in the document or the document has no words.

    Raises:
        KeyError: If ``sentence_num`` is not a key of ``tf_dict``.
    """
    row_dict = tf_dict[int(sentence_num)]
    total_terms = sum(row_dict.values())
    if total_terms == 0:
        return 0.0
    return row_dict.get(word, 0) / total_terms


def calculate_idf(word):
    """Return the smoothed inverse document frequency of ``word``.

    Computed as ``ln(1 + N / df)`` where ``N`` is the number of documents
    (entries of the global ``tf_dict``) and ``df`` is the number of documents
    that contain ``word``.  ``N`` must be the document count, not the
    vocabulary size, otherwise every weight is inflated by an unrelated
    factor.  The ``1 +`` keeps the weight positive even for a word that
    appears in every document.

    Args:
        word: The term to look up.

    Returns:
        The IDF weight as a float; ``0.0`` when the word occurs in no
        document (instead of dividing by zero).
    """
    n_docs = len(tf_dict)
    doc_num = sum(1 for term_counts in tf_dict.values() if word in term_counts)
    if doc_num == 0:
        return 0.0
    return math.log(1 + n_docs / doc_num)


def tf_idf(word, sentence_num):
    """Return the TF-IDF weight of ``word`` in a document, rounded to 5 decimals.

    Args:
        word: The term to score.
        sentence_num: Index of the document in ``tf_dict``.

    Returns:
        ``calculate_tf(word, sentence_num) * calculate_idf(word)`` rounded to
        five decimal places.
    """
    return round(calculate_tf(word, sentence_num) * calculate_idf(word), 5)

In [None]:
# Spot check: TF-IDF weight of the word 'phone' in the document at index 0.
# The bare expression is shown as the cell's output.
tf_idf('phone',0)

0.77836

In [None]:
tf_idf_encoding = []
for i in range(len(document_corpus)):
    row = document_corpus[i]
    split = row.split(" ")
    row_encoding = []
    for word in data_corpus:
        if word in split:
            row_encoding.append(tf_idf(word,i))
        else:
            row_encoding.append(0)
    tf_idf_encoding.append(row_encoding)

print(tf_idf_encoding)

[[0, 0, 0.32189, 0, 0, 0.27726, 0, 0, 0.77836, 0, 0, 0.32189], [0.38918, 0, 0, 0, 0, 0.27726, 0.77836, 0, 0, 0, 0, 0.32189], [0, 0.51299, 0.64378, 0, 0, 0.27726, 0, 0, 0, 0.51299, 0, 0], [0.38918, 0, 0, 0.51299, 0.51299, 0, 0, 0, 0, 0, 1.02598, 0], [0, 0, 0.40236, 0, 0, 0.17329, 0.24324, 0.32062, 0.48648, 0, 0, 0.20118]]


## Scikit-Learn Implementation:

### BoW Encoding

In [None]:
# Bag of Words with scikit-learn: fit a CountVectorizer on the corpus and
# transform the same documents in one step.  `X` is the resulting
# document-term matrix and is printed in the next cell.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(document_corpus)
# Feature names reported by the fitted vectorizer (the columns of X).
print(vectorizer.get_feature_names_out())

['bad' 'cat' 'good' 'has' 'he' 'is' 'mobile' 'not' 'phone' 'she' 'temper'
 'this']


In [None]:
# Convert the matrix returned by the vectorizer to a dense array so the
# whole table is printed (one row per document).
print(X.toarray())

[[0 0 1 0 0 1 0 0 2 0 0 1]
 [1 0 0 0 0 1 2 0 0 0 0 1]
 [0 1 2 0 0 1 0 0 0 1 0 0]
 [1 0 0 1 1 0 0 0 0 0 2 0]
 [0 0 2 0 0 1 1 1 2 0 0 1]]


### TF-IDF Encoding

In [None]:
# TF-IDF with scikit-learn: fit a TfidfVectorizer on the corpus and transform
# the same documents in one step.
# NOTE: `vectorizer` and `X` are rebound here and replace the objects created
# in the CountVectorizer cell.
# NOTE(review): with default settings scikit-learn uses its own smoothed IDF
# and normalises each row (see the TfidfVectorizer docs), so these values are
# not expected to match the hand-written tf_idf() numbers exactly.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(document_corpus)
# Feature names reported by the fitted vectorizer (the columns of X).
print(vectorizer.get_feature_names_out())

['bad' 'cat' 'good' 'has' 'he' 'is' 'mobile' 'not' 'phone' 'she' 'temper'
 'this']


In [None]:
# Convert the matrix returned by the vectorizer to a dense array so the
# whole table is printed (one row per document).
print(X.toarray())

[[0.         0.         0.34273991 0.         0.         0.28832362
  0.         0.         0.82578944 0.         0.         0.34273991]
 [0.4023674  0.         0.         0.         0.         0.28097242
  0.80473481 0.         0.         0.         0.         0.33400129]
 [0.         0.49317635 0.6605719  0.         0.         0.27784695
  0.         0.         0.         0.49317635 0.         0.        ]
 [0.31283963 0.         0.         0.38775666 0.38775666 0.
  0.         0.         0.         0.         0.77551332 0.        ]
 [0.         0.         0.51309679 0.         0.         0.2158166
  0.30906082 0.38307292 0.61812163 0.         0.         0.2565484 ]]
