### Using sklearn library

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_unigrams_tfidf(corpus):

    vectorizer = TfidfVectorizer(ngram_range=(1, 1), smooth_idf=False)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

def get_bigrams_tfidf(corpus):
    vectorizer = TfidfVectorizer(ngram_range=(2, 2), smooth_idf=False)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

def get_trigrams_tfidf(corpus):
    vectorizer = TfidfVectorizer(ngram_range=(3, 3), smooth_idf=False)
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return tfidf_matrix, vectorizer

In [122]:
# Sample corpus
corpus = [
    "Machine learning is amazing",
    "Deep learning and machine learning are subsets of AI",
    "AI is transforming the world"
]

# Generate unigrams and their TF-IDF scores
unigram_tfidf_matrix, unigram_vectorizer = get_unigrams_tfidf(corpus)
print("Unigram Features:", unigram_vectorizer.get_feature_names_out())
print("Unigram TF-IDF Matrix:\n", unigram_tfidf_matrix.toarray())

# Generate bigrams and their TF-IDF scores
bigram_tfidf_matrix, bigram_vectorizer = get_bigrams_tfidf(corpus)
print("\nBigram Features:", bigram_vectorizer.get_feature_names_out())
print("Bigram TF-IDF Matrix:\n", bigram_tfidf_matrix.toarray())

# Generate trigrams and their TF-IDF scores
trigram_tfidf_matrix, trigram_vectorizer = get_trigrams_tfidf(corpus)
print("\nTrigram Features:", trigram_vectorizer.get_feature_names_out())
print("Trigram TF-IDF Matrix:\n", trigram_tfidf_matrix.toarray())

Unigram Features: ['ai' 'amazing' 'and' 'are' 'deep' 'is' 'learning' 'machine' 'of'
 'subsets' 'the' 'transforming' 'world']
Unigram TF-IDF Matrix:
 [[0.         0.65294782 0.         0.         0.         0.43728676
  0.43728676 0.43728676 0.         0.         0.         0.
  0.        ]
 [0.24148721 0.         0.36058385 0.36058385 0.36058385 0.
  0.48297442 0.24148721 0.36058385 0.36058385 0.         0.
  0.        ]
 [0.33925099 0.         0.         0.         0.         0.33925099
  0.         0.         0.         0.         0.50656277 0.50656277
  0.50656277]]

Bigram Features: ['ai is' 'and machine' 'are subsets' 'deep learning' 'is amazing'
 'is transforming' 'learning and' 'learning are' 'learning is'
 'machine learning' 'of ai' 'subsets of' 'the world' 'transforming the']
Bigram TF-IDF Matrix:
 [[0.         0.         0.         0.         0.63907044 0.
  0.         0.         0.63907044 0.42799292 0.         0.
  0.         0.        ]
 [0.         0.3664082  0.3664082  0

### Using python(writing each function)

In [None]:
import math

In [61]:
def generate_unigrams(text):
    # Tokenize the text into words
    words = text.lower().split()

    # Return the list of unigrams
    return words

In [62]:
def generate_ngrams(text, n):
    words = text.split()
    ngrams = []

    # Loop to generate n-grams
    for i in range(len(words) - n + 1):
        ngrams.append(tuple(words[i:i + n]))

    return ngrams

In [66]:
text = "this is a sample text for generating unigrams bigrams and trigrams"

# Generate Unigrams (n=1)
unigrams = generate_ngrams(text, 1)
print("Unigrams:", unigrams)

# Generate Bigrams (n=2)
bigrams = generate_ngrams(text, 2)
print("Bigrams:", bigrams)

# Generate Trigrams (n=3)
trigrams = generate_ngrams(text, 3)
print("Trigrams:", trigrams)

Unigrams: [('this',), ('is',), ('a',), ('sample',), ('text',), ('for',), ('generating',), ('unigrams',), ('bigrams',), ('and',), ('trigrams',)]
Bigrams: [('this', 'is'), ('is', 'a'), ('a', 'sample'), ('sample', 'text'), ('text', 'for'), ('for', 'generating'), ('generating', 'unigrams'), ('unigrams', 'bigrams'), ('bigrams', 'and'), ('and', 'trigrams')]
Trigrams: [('this', 'is', 'a'), ('is', 'a', 'sample'), ('a', 'sample', 'text'), ('sample', 'text', 'for'), ('text', 'for', 'generating'), ('for', 'generating', 'unigrams'), ('generating', 'unigrams', 'bigrams'), ('unigrams', 'bigrams', 'and'), ('bigrams', 'and', 'trigrams')]


In [77]:
def calculate_tf(text, n):
    ngrams = generate_ngrams(text, n)
    ngram_count = len(ngrams)
    tf = {}

    for ngram in ngrams:
        if ngram in tf:
            tf[ngram] += 1
        else:
            tf[ngram] = 1

    # Normalize by dividing by total n-gram count
    for ngram in tf:
        tf[ngram] = tf[ngram] / ngram_count

    return tf

In [78]:
import math

def calculate_idf(documents, n):
    total_documents = len(documents)
    ngram_document_count = {}

    for doc in documents:
        ngrams = set(generate_ngrams(doc, n))  # Set to avoid counting duplicate n-grams in the same document
        for ngram in ngrams:
            if ngram in ngram_document_count:
                ngram_document_count[ngram] += 1
            else:
                ngram_document_count[ngram] = 1

    idf = {}
    for ngram, count in ngram_document_count.items():
        idf[ngram] = math.log10(total_documents / (count))  # Logarithm base 10

    return idf

In [79]:
def calculate_tfidf(documents, n):
    tfidf = []

    # Calculate TF for each document
    tf_documents = [calculate_tf(doc, n) for doc in documents]

    # Calculate IDF for all documents
    idf = calculate_idf(documents, n)

    # Calculate TF-IDF for each document
    for doc in tf_documents:
        doc_tfidf = {}
        for ngram in doc:
            tfidf_value = doc[ngram] * idf.get(ngram, 0)  # Multiply TF by IDF for each n-gram
            doc_tfidf[ngram] = tfidf_value
        tfidf.append(doc_tfidf)

    return tfidf


In [80]:
documents = [
    "Machine learning is amazing",
    "Deep learning and machine learning are subsets of AI",
    "AI is transforming the world"
]

# Calculate TF-IDF for Unigrams (n=1)
print("Unigrams:")
tfidf_unigrams = calculate_tfidf(documents, 1)
for i, doc_tfidf in enumerate(tfidf_unigrams):
    print(f"Document {i+1} TF-IDF: {doc_tfidf}")

# Calculate TF-IDF for Bigrams (n=2)
print("\nBigrams:")
tfidf_bigrams = calculate_tfidf(documents, 2)
for i, doc_tfidf in enumerate(tfidf_bigrams):
    print(f"Document {i+1} TF-IDF: {doc_tfidf}")

# Calculate TF-IDF for Trigrams (n=3)
print("\nTrigrams:")
tfidf_trigrams = calculate_tfidf(documents, 3)
for i, doc_tfidf in enumerate(tfidf_trigrams):
    print(f"Document {i+1} TF-IDF: {doc_tfidf}")

Unigrams:
Document 1 TF-IDF: {('Machine',): 0.11928031367991561, ('learning',): 0.04402281476392031, ('is',): 0.04402281476392031, ('amazing',): 0.11928031367991561}
Document 2 TF-IDF: {('Deep',): 0.05301347274662915, ('learning',): 0.0391313909012625, ('and',): 0.05301347274662915, ('machine',): 0.05301347274662915, ('are',): 0.05301347274662915, ('subsets',): 0.05301347274662915, ('of',): 0.05301347274662915, ('AI',): 0.01956569545063125}
Document 3 TF-IDF: {('AI',): 0.03521825181113625, ('is',): 0.03521825181113625, ('transforming',): 0.09542425094393249, ('the',): 0.09542425094393249, ('world',): 0.09542425094393249}

Bigrams:
Document 1 TF-IDF: {('Machine', 'learning'): 0.15904041823988746, ('learning', 'is'): 0.15904041823988746, ('is', 'amazing'): 0.15904041823988746}
Document 2 TF-IDF: {('Deep', 'learning'): 0.059640156839957804, ('learning', 'and'): 0.059640156839957804, ('and', 'machine'): 0.059640156839957804, ('machine', 'learning'): 0.059640156839957804, ('learning', 'are'

In [81]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
import pandas as pd

# Sample documents
documents = [
    "Machine learning is amazing",
    "Deep learning and machine learning are subsets of AI",
    "AI is transforming the world"
]

# Function to compute Term Frequency (TF)
def compute_tf(documents, ngram_range=(1, 3)):
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tf_matrix = X.toarray()
    return tf_matrix, feature_names

# Function to compute Inverse Document Frequency (IDF)
def compute_idf(documents, ngram_range=(1, 3)):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, use_idf=True, smooth_idf=False)
    vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    idf_values = vectorizer.idf_
    return dict(zip(feature_names, idf_values))

# Function to compute TF-IDF
def compute_tfidf(documents, ngram_range=(1, 3)):
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    X = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_matrix = X.toarray()
    return tfidf_matrix, feature_names

In [82]:
# Compute TF, IDF, and TF-IDF
tf_matrix, feature_names = compute_tf(documents)
idf_dict = compute_idf(documents)
tfidf_matrix, _ = compute_tfidf(documents)

In [83]:
# Display TF
print("\n--- Term Frequency (TF) ---")
tf_df = pd.DataFrame(tf_matrix, columns=feature_names)
print(tf_df)


--- Term Frequency (TF) ---
   ai  ai is  ai is transforming  amazing  and  and machine  \
0   0      0                   0        1    0            0   
1   1      0                   0        0    1            1   
2   1      1                   1        0    0            0   

   and machine learning  are  are subsets  are subsets of  ...  of ai  \
0                     0    0            0               0  ...      0   
1                     1    1            1               1  ...      1   
2                     0    0            0               0  ...      0   

   subsets  subsets of  subsets of ai  the  the world  transforming  \
0        0           0              0    0          0             0   
1        1           1              1    0          0             0   
2        0           0              0    1          1             1   

   transforming the  transforming the world  world  
0                 0                       0      0  
1                 0               

In [84]:
# Display IDF
print("\n--- Inverse Document Frequency (IDF) ---")
idf_df = pd.DataFrame(list(idf_dict.items()), columns=["N-Gram", "IDF Score"])
print(idf_df)


--- Inverse Document Frequency (IDF) ---
                    N-Gram  IDF Score
0                       ai   1.405465
1                    ai is   2.098612
2       ai is transforming   2.098612
3                  amazing   2.098612
4                      and   2.098612
5              and machine   2.098612
6     and machine learning   2.098612
7                      are   2.098612
8              are subsets   2.098612
9           are subsets of   2.098612
10                    deep   2.098612
11           deep learning   2.098612
12       deep learning and   2.098612
13                      is   1.405465
14              is amazing   2.098612
15         is transforming   2.098612
16     is transforming the   2.098612
17                learning   1.405465
18            learning and   2.098612
19    learning and machine   2.098612
20            learning are   2.098612
21    learning are subsets   2.098612
22             learning is   2.098612
23     learning is amazing   2.098612
24      

In [85]:
# Display TF-IDF
print("\n--- TF-IDF Scores ---")
tfidf_df = pd.DataFrame(tfidf_matrix, columns=feature_names)
print(tfidf_df)


--- TF-IDF Scores ---
         ai     ai is  ai is transforming   amazing       and  and machine  \
0  0.000000  0.000000            0.000000  0.369772  0.000000     0.000000   
1  0.158413  0.000000            0.000000  0.000000  0.208294     0.208294   
2  0.227690  0.299385            0.299385  0.000000  0.000000     0.000000   

   and machine learning       are  are subsets  are subsets of  ...     of ai  \
0              0.000000  0.000000     0.000000        0.000000  ...  0.000000   
1              0.208294  0.208294     0.208294        0.208294  ...  0.208294   
2              0.000000  0.000000     0.000000        0.000000  ...  0.000000   

    subsets  subsets of  subsets of ai       the  the world  transforming  \
0  0.000000    0.000000       0.000000  0.000000   0.000000      0.000000   
1  0.208294    0.208294       0.208294  0.000000   0.000000      0.000000   
2  0.000000    0.000000       0.000000  0.299385   0.299385      0.299385   

   transforming the  transform