In [None]:
# Text representation (vectorization) means converting text into numbers so that algorithms can work with it.
# The main techniques are:
# 1. One hot encoding
# 2. Bag of words - Unigram, Bigram, Trigram, n-gram
# 3. TF-IDF - Term Frequency-Inverse Document Frequency - used to reflect the importance/weightage of a word in a document relative to a corpus.
# 4. Word embeddings (Word2Vec, GloVe, FastText)

In [None]:
# 2. Bag of Words - the frequency of each word in the text matters, but the order of the words does not.
# Unigram: each single word is one feature
# Author: Muhammad Humayun Khan

import pandas as pd
import numpy as np

# Toy corpus: four three-word sentences drawn from five distinct words,
# stored one sentence per row in a single 'text' column
df = pd.DataFrame(
    data=[
        "People watch Youtube",
        "Youtube watch People",
        "People write comment",
        "Youtube write comment",
    ],
    columns=['text'],
)

# Bare last expression: the notebook renders the DataFrame as the cell output
df

Unnamed: 0,text
0,People watch Youtube
1,Youtube watch People
2,People write comment
3,Youtube write comment


In [None]:
# First, create a CountVectorizer object; it converts the text data into a matrix of token counts.
# To see the unique words it found, print the vectorizer's vocabulary_ attribute.
# vocabulary_ is a dict mapping each unique (lowercased) word to its column index; the index is shown next to each word.
# Note: the dict prints in order of first appearance in the text, not in index order.
from sklearn.feature_extraction.text import CountVectorizer

# Create a bag-of-words vectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the 'text' column and, in the same call,
# encode every row as a vector of token counts
corpus = df['text']
X = vectorizer.fit_transform(corpus)

# Show the learned word -> column-index mapping
print("Vocabulary:", vectorizer.vocabulary_)

# the output below shows comment at index 0, people at index 1, watch at index 2, write at index 3 and youtube at index 4

Vocabulary: {'people': 1, 'watch': 2, 'youtube': 4, 'write': 3, 'comment': 0}


In [6]:
# One row per sentence, one column per vocabulary word
n_docs, n_terms = X.shape
print("Shape of the matrix:", (n_docs, n_terms))

Shape of the matrix: (4, 5)


In [7]:
# Convert the count matrix to a dense array so that every cell is visible when printed
dense_counts = X.toarray()
print("Matrix in array format:\n", dense_counts)

Matrix in array format:
 [[0 1 1 0 1]
 [0 1 1 0 1]
 [1 1 0 1 0]
 [1 0 0 1 1]]


In [None]:
# Encode an unseen sentence with the already-fitted vectorizer
# (transform only - the vocabulary is not re-learned)
new_text = ["People watch Youtube and write of comment"]
new_text_counts = vectorizer.transform(new_text)
new_text_counts.toarray()

# The output below shows that the words "and" and "of" are not in the vocabulary, so they are not counted;
# each of the five known words appears once.

array([[1, 1, 1, 1, 1]])

In [None]:
# 2. Bag of Words with n-grams
#    bigram = two consecutive words, trigram = three consecutive words, n-gram = n consecutive words

# Sample text data
text_data = [
    "People watch Youtube",
    "Youtube watch People",
    "People write comment",
    "Youtube write comment",
]

# ngram_range=(2, 2): only sequences of exactly two consecutive words become features
vectorizer_bigram = CountVectorizer(ngram_range=(2, 2))

# Learn the bigram vocabulary and build the count matrix in one call
X_bigram = vectorizer_bigram.fit_transform(text_data)

# Show the bigram -> column-index mapping
bigram_vocabulary = vectorizer_bigram.vocabulary_
print("Bigram Vocabulary:", bigram_vocabulary)

Bigram Vocabulary: {'people watch': 0, 'watch youtube': 3, 'youtube watch': 5, 'watch people': 2, 'people write': 1, 'write comment': 4, 'youtube write': 6}


In [11]:
# Trigram example: ngram_range=(3, 3) keeps only runs of exactly three consecutive words
vectorizer_trigram = CountVectorizer(ngram_range=(3, 3))

# Learn the trigram vocabulary and build the count matrix in one call
X_trigram = vectorizer_trigram.fit_transform(text_data)

# Show the trigram -> column-index mapping
trigram_vocabulary = vectorizer_trigram.vocabulary_
print("Trigram Vocabulary:", trigram_vocabulary)


Trigram Vocabulary: {'people watch youtube': 0, 'youtube watch people': 2, 'people write comment': 1, 'youtube write comment': 3}


In [12]:
# n-gram example: ngram_range=(1, 2) extracts unigrams and bigrams together,
# so the vocabulary contains both single words and two-word sequences
vectorizer_ngram = CountVectorizer(ngram_range=(1, 2))

# Learn the combined vocabulary and build the count matrix in one call
X_ngram = vectorizer_ngram.fit_transform(text_data)

# Show the n-gram -> column-index mapping
ngram_vocabulary = vectorizer_ngram.vocabulary_
print("N-gram Vocabulary:", ngram_vocabulary)

N-gram Vocabulary: {'people': 1, 'watch': 4, 'youtube': 9, 'people watch': 2, 'watch youtube': 6, 'youtube watch': 10, 'watch people': 5, 'write': 7, 'comment': 0, 'people write': 3, 'write comment': 8, 'youtube write': 11}


In [None]:
# 3. TF-IDF - Term Frequency-Inverse Document Frequency

# Sample text data
text_data = [
    "People watch Youtube",
    "Youtube watch People",
    "People write comment",
    "Youtube write comment",
]

from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights and encode the sentences in one call,
# then convert the result to a dense array for printing
tfidf_encoded = tfidf_vectorizer.fit_transform(text_data)
X_tfidf = tfidf_encoded.toarray()

# Print the TF-IDF matrix (one row per sentence, one column per vocabulary word)
print("TF-IDF Matrix:\n", X_tfidf)

TF-IDF Matrix:
 [[0.         0.53256952 0.65782931 0.         0.53256952]
 [0.         0.53256952 0.65782931 0.         0.53256952]
 [0.61366674 0.49681612 0.         0.61366674 0.        ]
 [0.61366674 0.         0.         0.61366674 0.49681612]]


In [None]:
# Inspect what the fitted TF-IDF vectorizer learned:
#   idf_                    - one IDF weight per vocabulary word
#   get_feature_names_out() - the vocabulary words
idf_values = tfidf_vectorizer.idf_
feature_names = tfidf_vectorizer.get_feature_names_out()
print("IDF values:", idf_values)
print("Feature names:", feature_names)

IDF values: [1.51082562 1.22314355 1.51082562 1.51082562 1.22314355]
Feature names: ['comment' 'people' 'watch' 'write' 'youtube']


In [None]:
# Run fit_transform on the same sentences again, this time keeping the result
# exactly as returned (no .toarray() call)
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Report the matrix dimensions
tfidf_shape = tfidf_matrix.shape
print("Shape of the TF-IDF matrix:", tfidf_shape)

