In [1]:
# 10 December 2023
# CSC461 – Assignment4 – NLP
# Komal Khizar
# FA20-BSE-096
# Q1. [CLO-2]
# The task involves computing several text analysis metrics for the given sentences (S1, S2, S3): Bag of Words (BoW), Term Frequency (TF), Inverse Document Frequency (IDF), and the TF-IDF values for each term.

# BoW: Counts the occurrence of each word in the sentences.
# TF: Measures the frequency of a word in a single document.
# IDF: Evaluates how important a word is across multiple documents.
# TF-IDF: Combines TF and IDF to reflect the importance of words in a document relative to a collection of documents.
# We'll calculate these values for each term in the given sentences.



#import warnings to 'ignore' warning messages

import warnings
warnings.filterwarnings("ignore")

In [9]:
#import important libraries

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import pandas as pd
import numpy as np

In [3]:
# The three example sentences; each one is treated as a separate document.
S1, S2, S3 = (
    "data science is one of the most important courses in computer science",
    "this is one of the best data science courses",
    "the data scientists perform data analysis",
)

In [4]:
# Combine the three sentences into a corpus (a list with one document per sentence)

data = [S1, S2, S3]
data

['data science is one of the most important courses in computer science',
 'this is one of the best data science courses',
 'the data scientists perform data analysis']

In [5]:
# Create a CountVectorizer instance, then call its fit_transform() method on the
# corpus to build the Bag-of-Words (BoW) matrix; the result is a sparse matrix.

count_vectorizer = CountVectorizer()
c_vector_matrix = count_vectorizer.fit_transform(data)
c_vector_matrix

<3x16 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [6]:
# Use get_feature_names_out() to extract the vocabulary from the fitted vectorizer;
# these terms are reused below as the column labels of the BoW / TF DataFrames.

c_tokens = count_vectorizer.get_feature_names_out()
c_tokens

array(['analysis', 'best', 'computer', 'courses', 'data', 'important',
       'in', 'is', 'most', 'of', 'one', 'perform', 'science',
       'scientists', 'the', 'this'], dtype=object)

In [7]:
# Convert the sparse BoW matrix to a dense array for display
# (one row per sentence, one column per vocabulary term)

c_vector_matrix.toarray()

array([[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 2, 0, 1, 0],
       [0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1],
       [1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0]])

In [8]:
# Wrap the dense BoW counts in a pandas DataFrame whose columns are
# labelled with the vocabulary terms

df_c = pd.DataFrame(c_vector_matrix.toarray(), columns=c_tokens)
df_c

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0,0,1,1,1,1,1,1,1,1,1,0,2,0,1,0
1,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,1
2,1,0,0,0,2,0,0,0,0,0,0,1,0,1,1,0


In [10]:
# Computing Term Frequency
n_sentances = len(data)      # Number of sentences in the corpus
n_words_set = len(c_tokens)  # Number of unique words in the vocabulary

# Start from an all-zero table: one row per sentence, one column per vocabulary term
df_tf = pd.DataFrame(np.zeros((n_sentances, n_words_set)), columns=c_tokens)

# Compute Term Frequency (TF):
#   TF(word, sentence) = occurrences of word in sentence / total words in sentence
for i in range(n_sentances):
    # split() (no argument) matches the tokenisation used in the IDF cell and
    # does not produce empty strings when words are separated by several spaces
    words = data[i].split()
    for w in words:
        # Single .loc[row, column] update instead of chained indexing
        # (df_tf[w][i] = ...): chained assignment may write to a temporary copy
        # and leave df_tf unchanged, e.g. with pandas Copy-on-Write enabled.
        df_tf.loc[i, w] += 1 / len(words)

df_tf

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0.0,0.0,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,0.166667,0.0,0.083333,0.0
1,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.111111,0.0,0.111111,0.111111
2,0.166667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.166667,0.0


In [11]:
# Computing Inverse Document Frequency
print("IDF of: ")

idf = {}

for w in c_tokens:
    # Document frequency: number of sentences in the corpus that contain this
    # word as a whitespace-separated token
    doc_count = sum(
        1 for doc_idx in range(n_sentances) if w in data[doc_idx].split()
    )

    # IDF(word) = log10(total sentences / sentences containing the word)
    idf[w] = np.log10(n_sentances / doc_count)

    print(f'{w:>15}: {idf[w]:>10}' )

IDF of: 
       analysis: 0.47712125471966244
           best: 0.47712125471966244
       computer: 0.47712125471966244
        courses: 0.17609125905568124
           data:        0.0
      important: 0.47712125471966244
             in: 0.47712125471966244
             is: 0.17609125905568124
           most: 0.47712125471966244
             of: 0.17609125905568124
            one: 0.17609125905568124
        perform: 0.47712125471966244
        science: 0.17609125905568124
     scientists: 0.47712125471966244
            the:        0.0
           this: 0.47712125471966244


In [12]:
# Computing TF-IDF
#   TF-IDF(word, sentence) = TF(word, sentence) * IDF(word)
df_tf_idf = df_tf.copy()

for w in c_tokens:
    # Scale the whole TF column by the word's IDF in one column assignment.
    # This replaces the per-cell chained indexing (df_tf_idf[w][i] = ...),
    # which may write to a temporary copy and leave df_tf_idf unchanged,
    # e.g. with pandas Copy-on-Write enabled.
    df_tf_idf[w] = df_tf[w] * idf[w]

df_tf_idf

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0.0,0.0,0.03976,0.014674,0.0,0.03976,0.03976,0.014674,0.03976,0.014674,0.014674,0.0,0.029349,0.0,0.0,0.0
1,0.0,0.053013,0.0,0.019566,0.0,0.0,0.0,0.019566,0.0,0.019566,0.019566,0.0,0.019566,0.0,0.0,0.053013
2,0.07952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07952,0.0,0.07952,0.0,0.0
