In [None]:
# Text representation means converting text into numbers (vectors) that a model can work with.
# The common techniques are:
# 1. One hot encoding
# 2. Bag of words
# 3. TF-IDF
# 4. Word embeddings (Word2Vec, GloVe, FastText)

In [4]:
# 2. Bag of Words - the frequency of each word in the text matters, but the order of the words does not.
# Author: Muhammad Humayun Khan

import pandas as pd
import numpy as np

# Toy corpus: four short sentences stored in a single 'text' column.
# The sentences deliberately share words so the resulting counts overlap.
df = pd.DataFrame(
    data=[
        "People watch Youtube",
        "Youtube watch People",
        "People write comment",
        "Youtube write comment",
    ],
    columns=['text'],
)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,text
0,People watch Youtube
1,Youtube watch People
2,People write comment
3,Youtube write comment


In [None]:
# First, an object of the CountVectorizer class is created; it converts the text data into a matrix of token counts.
# To see the unique words found in the text data, the vocabulary_ attribute of the fitted CountVectorizer is printed.
# The vocabulary is a dict that maps each unique (lowercased) word to its column index in the count matrix; the dict itself is not printed in index order.
from sklearn.feature_extraction.text import CountVectorizer

# Build a vectorizer with default settings.
vectorizer = CountVectorizer()

# Learn the vocabulary from the 'text' column and, in the same call,
# produce the document-term count matrix (one row per sentence).
corpus = df['text']
X = vectorizer.fit_transform(corpus)

# Show the learned word -> column-index mapping.
learned_vocabulary = vectorizer.vocabulary_
print("Vocabulary:", learned_vocabulary)

# The output below shows comment at index 0, people at index 1, watch at index 2, write at index 3 and youtube at index 4.

Vocabulary: {'people': 1, 'watch': 2, 'youtube': 4, 'write': 3, 'comment': 0}


In [6]:
# Dimensions of the count matrix as (rows, columns).
matrix_shape = X.shape
print("Shape of the matrix:", matrix_shape)

Shape of the matrix: (4, 5)


In [7]:
# Convert the matrix returned by the vectorizer into a regular array
# so every count is visible when printed.
dense_counts = X.toarray()
print("Matrix in array format:\n", dense_counts)

Matrix in array format:
 [[0 1 1 0 1]
 [0 1 1 0 1]
 [1 1 0 1 0]
 [1 0 0 1 1]]


In [None]:
# Encode a new, unseen sentence. Nothing is predicted here: the sentence is
# only mapped onto the vocabulary that the vectorizer has already learned.
new_text = ["People watch Youtube and write of comment"]

# transform() (not fit_transform) reuses the existing vectorizer as-is.
new_counts = vectorizer.transform(new_text)
new_counts.toarray()

# The output below shows that the words "and" and "of" are not in the learned vocabulary, so they are not counted; each of the five known words is counted once.

array([[1, 1, 1, 1, 1]])