# TEXT REPRESENTATION - NLP LECTURE 4
## Bag of Words | Tf-Idf | N-grams, Bi-grams and Uni-grams | OHE

In [1]:
import pandas as pd
import numpy as np 

In [2]:
# pandas supplies the DataFrame container used for the toy corpus
import pandas as pd

# Toy labelled corpus: one (text, output) pair per row. Rows 0-1 carry
# label 1 and rows 2-3 carry label 0.
df = pd.DataFrame(
    [
        ('people watch campusx', 1),
        ('campusx watch campusx', 1),
        ('people write comment', 0),
        ('campusx write comment', 0),
    ],
    columns=['text', 'output'],
)

In [3]:
# Show the toy corpus: four short documents in 'text' with a 0/1 label in 'output'
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


## Bag of Words

### Using sklearn's `CountVectorizer`



In [4]:
# scikit-learn's bag-of-words vectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Build the vectorizer with its main knobs spelled out (these values match
# the scikit-learn defaults; they are written explicitly for illustration):
#   max_features -- None keeps every term; an integer keeps only the
#                   top `max_features` terms ordered by term frequency
#                   across the corpus
#   binary       -- False stores raw counts; set True to record only
#                   presence/absence (e.g. for sentiment analysis)
#   lowercase    -- True converts all characters to lowercase before tokenizing
cv = CountVectorizer(max_features=None, binary=False, lowercase=True)


In [5]:
# Learn the vocabulary from the 'text' column and, in the same step, encode
# every document as a row of term counts (returned as a sparse matrix).
bow = cv.fit_transform(raw_documents=df['text'])

In [6]:
# Show the learned vocabulary.
# `cv.vocabulary_` is a dict mapping each unique word found in the documents
# (key) to the column index assigned to that word (value).
print(f"{cv.vocabulary_}")

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [7]:
# Densify and print the first two documents, one per line.
# Indexing the sparse matrix (`bow[0]`, `bow[1]`) selects a single row;
# `toarray()` turns that row into a dense array for display.
print(bow[0].toarray(), bow[1].toarray(), sep='\n')

[[1 0 1 1 0]]
[[2 0 0 1 0]]


#### Out-of-vocabulary (OOV) words are handled here

**Handling the out-of-vocabulary (OOV) problem.** When using CountVectorizer, the vocabulary is built only from the words present in the training data.
Words that appear in the test data but were absent at training time (for example "and", "of", and other unseen words) are not included in the vocabulary.
At transform time such words are simply ignored: they raise no error and contribute nothing to the output vector.

Let's see how the words "other" and "book" are handled below.
Since these words were absent at training time, they are not in the vocabulary, so only "write" (index 4) is counted in the output.

In [8]:
# Encode a new sentence with the already-fitted vectorizer (transform only,
# no refit) and print its dense count vector.
print(
    cv.transform(["write other book "])
      .toarray()
)

[[0 0 0 0 1]]


## n-grams

In [9]:
# Vectorizer class from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(2, 2): lower and upper n are both 2, so only bigrams
# (pairs of adjacent words) become features.
cv = CountVectorizer(ngram_range=(2, 2))

# Fit on the 'text' column of `df` and encode it as bigram counts
bigram = cv.fit_transform(raw_documents=df['text'])

# Show the bigram -> column-index mapping
print(f"{cv.vocabulary_}")

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comment': 5, 'campusx write': 1}


### Using n-grams of size 1 to 3 (unigrams, bigrams, and trigrams) 

In [10]:
# Vectorizer class from scikit-learn
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(1, 3): every n from 1 to 3 is extracted, i.e. unigrams,
# bigrams and trigrams all become features.
cv = CountVectorizer(ngram_range=(1, 3))

# Fit on the 'text' column of `df` and encode it as n-gram counts
ngram = cv.fit_transform(raw_documents=df['text'])

# Show the n-gram -> column-index mapping, then the vocabulary size
# (one line each).
print(cv.vocabulary_, len(cv.vocabulary_), sep='\n')

{'people': 6, 'watch': 11, 'campusx': 0, 'people watch': 7, 'watch campusx': 12, 'people watch campusx': 8, 'campusx watch': 1, 'campusx watch campusx': 2, 'write': 13, 'comment': 5, 'people write': 9, 'write comment': 14, 'people write comment': 10, 'campusx write': 3, 'campusx write comment': 4}
15


## Tf-Idf

In [45]:
# TF-IDF vectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Default configuration (no arguments supplied)
tfidf = TfidfVectorizer()

# Fit on the 'text' column of `df`, encode every document as a TF-IDF
# vector, then densify the sparse result so it prints as a plain 2-D array.
tfidf_sparse = tfidf.fit_transform(raw_documents=df['text'])
tfidf_matrix = tfidf_sparse.toarray()

print(tfidf_matrix)

[[0.49681612 0.         0.61366674 0.61366674 0.        ]
 [0.8508161  0.         0.         0.52546357 0.        ]
 [0.         0.57735027 0.57735027 0.         0.57735027]
 [0.49681612 0.61366674 0.         0.         0.61366674]]


In [46]:
# Print, one per line:
#   1. the inverse document frequency (IDF) weight learned for each feature
#   2. the feature names (words), in the same column order as the weights
print(tfidf.idf_, tfidf.get_feature_names_out(), sep='\n')

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]
['campusx' 'comment' 'people' 'watch' 'write']
