In [1]:
import math

# Function to generate n-grams from a list of tokens
def generate_ngrams(tokens, n):
    """Return the list of n-grams generated from the given tokens.

    Each n-gram is ``n`` consecutive tokens joined by a single space, listed
    in order of appearance.

    Args:
        tokens: Sequence of string tokens (must support ``len`` and slicing).
        n: Number of tokens per n-gram; must be >= 1.

    Returns:
        A list of ``len(tokens) - n + 1`` strings, or an empty list when
        ``tokens`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        # Without this guard n == 0 yields len(tokens) + 1 empty strings and
        # a negative n yields entries built from misaligned slices.
        raise ValueError(f"n must be a positive integer, got {n!r}")
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

In [2]:
corpus = [
    "This is a sample document.",
    "This document is another example document.",
    "TF-IDF is a useful technique for text analysis."
]

# Build, for every document, one flat list holding its unigrams, bigrams and trigrams
docs_ngrams = []
for doc in corpus:
    # Simple tokenization: lowercase, split on whitespace, trim edge punctuation.
    # A token made up solely of punctuation (e.g. the "," in "hello , world")
    # strips down to "" and is dropped here, so it cannot leak into the
    # n-grams as an empty word surrounded by stray spaces.
    tokens = [
        cleaned
        for cleaned in (raw.strip(".,!?") for raw in doc.lower().split())
        if cleaned
    ]

    # Generate unigrams, bigrams, and trigrams
    ngrams = []
    for n in range(1, 4):  # 1, 2, and 3
        ngrams.extend(generate_ngrams(tokens, n))
    docs_ngrams.append(ngrams)

In [3]:
# Vocabulary: every distinct n-gram seen in any document, in sorted order
vocab = sorted(set().union(*docs_ngrams))
print("Vocabulary (n-grams):")
print(vocab)

Vocabulary (n-grams):
['a', 'a sample', 'a sample document', 'a useful', 'a useful technique', 'analysis', 'another', 'another example', 'another example document', 'document', 'document is', 'document is another', 'example', 'example document', 'for', 'for text', 'for text analysis', 'is', 'is a', 'is a sample', 'is a useful', 'is another', 'is another example', 'sample', 'sample document', 'technique', 'technique for', 'technique for text', 'text', 'text analysis', 'tf-idf', 'tf-idf is', 'tf-idf is a', 'this', 'this document', 'this document is', 'this is', 'this is a', 'useful', 'useful technique', 'useful technique for']


In [4]:
# Term frequency (TF): one {term: raw count} dictionary per document
doc_tf = []
for doc_terms in docs_ngrams:
    counts = {}
    for term in doc_terms:
        if term in counts:
            counts[term] += 1
        else:
            counts[term] = 1
    doc_tf.append(counts)

# Document frequency (DF): number of documents containing each term.
# Each term is a key at most once per document, so one increment per key suffices.
df = {}
for counts in doc_tf:
    for term in counts:
        if term not in df:
            df[term] = 0
        df[term] += 1

N = len(corpus)

# Smoothed inverse document frequency (IDF):
#   idf = log((N + 1) / (df + 1)) + 1
idf = {}
for term, df_count in df.items():
    idf[term] = math.log((N + 1) / (df_count + 1)) + 1

# TF-IDF: raw term count scaled by the term's IDF, one dictionary per document
doc_tfidf = [
    {term: freq * idf[term] for term, freq in counts.items()}
    for counts in doc_tf
]


In [5]:
# Print every document's TF-IDF scores, with terms in alphabetical order
print("\nTF-IDF values for each document (pure python):")
for doc_number, scores in enumerate(doc_tfidf, start=1):
    print(f"\nDocument {doc_number}:")
    for term in sorted(scores):
        print(f"  {term}: {scores[term]:.4f}")



TF-IDF values for each document (pure python):

Document 1:
  a: 1.2877
  a sample: 1.6931
  a sample document: 1.6931
  document: 1.2877
  is: 1.0000
  is a: 1.2877
  is a sample: 1.6931
  sample: 1.6931
  sample document: 1.6931
  this: 1.2877
  this is: 1.6931
  this is a: 1.6931

Document 2:
  another: 1.6931
  another example: 1.6931
  another example document: 1.6931
  document: 2.5754
  document is: 1.6931
  document is another: 1.6931
  example: 1.6931
  example document: 1.6931
  is: 1.0000
  is another: 1.6931
  is another example: 1.6931
  this: 1.2877
  this document: 1.6931
  this document is: 1.6931

Document 3:
  a: 1.2877
  a useful: 1.6931
  a useful technique: 1.6931
  analysis: 1.6931
  for: 1.6931
  for text: 1.6931
  for text analysis: 1.6931
  is: 1.0000
  is a: 1.2877
  is a useful: 1.6931
  technique: 1.6931
  technique for: 1.6931
  technique for text: 1.6931
  text: 1.6931
  text analysis: 1.6931
  tf-idf: 1.6931
  tf-idf is: 1.6931
  tf-idf is a: 1.6931
  us

USING SK-LEARN LIBRARY


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus: the same three documents used in the pure-Python section
corpus = [
    "This is a sample document.",
    "This document is another example document.",
    "TF-IDF is a useful technique for text analysis."
]

# Vectorizer covering unigrams, bigrams and trigrams.
# NOTE: the recorded feature names show that its default tokenization differs
# from the hand-rolled one: "TF-IDF" appears as "tf" and "idf", and the
# single-letter word "a" is absent, so the two vocabularies are not identical.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Fit on the corpus and obtain the TF-IDF weighted document-term matrix
tfidf_matrix = vectorizer.fit_transform(corpus)

# N-gram feature names, one per matrix column
feature_names = vectorizer.get_feature_names_out()

# Report the features, the matrix dimensions, and the dense matrix itself
print("Feature names (n-grams) using sklearn:", feature_names, sep="\n")
print("\nTF-IDF Matrix shape:", tfidf_matrix.shape)
print("\nTF-IDF Matrix (array format):", tfidf_matrix.toarray(), sep="\n")


Feature names (n-grams) using sklearn:
['analysis' 'another' 'another example' 'another example document'
 'document' 'document is' 'document is another' 'example'
 'example document' 'for' 'for text' 'for text analysis' 'idf' 'idf is'
 'idf is useful' 'is' 'is another' 'is another example' 'is sample'
 'is sample document' 'is useful' 'is useful technique' 'sample'
 'sample document' 'technique' 'technique for' 'technique for text' 'text'
 'text analysis' 'tf' 'tf idf' 'tf idf is' 'this' 'this document'
 'this document is' 'this is' 'this is sample' 'useful' 'useful technique'
 'useful technique for']

TF-IDF Matrix shape: (3, 40)

TF-IDF Matrix (array format):
[[0.         0.         0.         0.         0.27760064 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.21558166 0.         0.
  0.36501149 0.36501149 0.         0.         0.36501149 0.36501149
  0.         0.         0.         0.         0.         0.
  0.         0.      

In [8]:
# Bare expression: displays the current value of `corpus` as this cell's output
corpus

['This is a sample document.',
 'This document is another example document.',
 'TF-IDF is a useful technique for text analysis.']