<a href="https://colab.research.google.com/github/inderpreetsingh01/ml_machine_coding/blob/main/NLP_(BOW%2BTFIDF).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re
from collections import defaultdict

class BagOfWords:
    """Bag-of-words vectorizer that maps documents to token-count vectors.

    After ``fit``:
      * ``vocab`` maps each token to its column index (tokens are sorted,
        so the column order is deterministic).
      * ``inverse_vocab`` maps each column index back to its token.
    """

    def __init__(self):
        self.vocab = {}
        self.inverse_vocab = {}

    def _tokenize(self, text):
        """Return the lowercased word tokens of ``text``.

        A token is a maximal run of regex word characters (letters, digits
        and underscore); every other character acts as a separator.
        """
        return re.findall(r"\b\w+\b", text.lower())

    def fit(self, documents):
        """Build the vocabulary from an iterable of document strings.

        Returns ``self`` so calls can be chained.
        """
        vocab_set = set()
        for doc in documents:
            vocab_set.update(self._tokenize(doc))

        self.vocab = {word: idx for idx, word in enumerate(sorted(vocab_set))}
        self.inverse_vocab = {idx: word for word, idx in self.vocab.items()}
        return self

    def transform(self, documents):
        """Transform documents into a count matrix.

        Returns an integer array of shape ``(n_documents, len(self.vocab))``.
        Tokens that are not in the fitted vocabulary are ignored.
        """
        tokenized = [self._tokenize(doc) for doc in documents]

        # Allocate the full matrix up front so the result is always 2-D,
        # even when there are no documents (shape (0, vocab_size)).
        matrix = np.zeros((len(tokenized), len(self.vocab)), dtype=int)
        for row, tokens in zip(matrix, tokenized):
            # ``row`` is a view into ``matrix``; updates land in place.
            for token in tokens:
                col = self.vocab.get(token)
                if col is not None:
                    row[col] += 1
        return matrix

    def fit_transform(self, documents):
        """Fit the vocabulary on ``documents`` and return their count matrix."""
        # Materialise once: a generator would otherwise be exhausted by
        # fit() and leave nothing for transform().
        documents = list(documents)
        return self.fit(documents).transform(documents)

In [2]:
# Small demo corpus: three short sentences.
docs = [
    "Data science is fun",
    "Machine learning is part of data science",
    "Python makes machine learning easy",
]

# Learn the vocabulary, then count tokens per document.
bow = BagOfWords().fit(docs)
X = bow.transform(docs)

# Show the token -> column mapping and the resulting count matrix.
print("Vocabulary:", bow.vocab)
print("BoW Matrix:\n", X)


Vocabulary: {'data': 0, 'easy': 1, 'fun': 2, 'is': 3, 'learning': 4, 'machine': 5, 'makes': 6, 'of': 7, 'part': 8, 'python': 9, 'science': 10}
BoW Matrix:
 [[1 0 1 1 0 0 0 0 0 0 1]
 [1 0 0 1 1 1 0 1 1 0 1]
 [0 1 0 0 1 1 1 0 0 1 0]]


In [None]:
# TF-IDF

In [3]:
import numpy as np
import re

class TFIDFVectorizer:
    """TF-IDF vectorizer with length-normalised TF and smoothed IDF.

    After ``fit``:
      * ``vocab`` maps each token to its column index (tokens are sorted).
      * ``idf`` is a float array holding one IDF weight per vocabulary entry.
    """

    def __init__(self):
        self.vocab = {}
        self.idf = None

    def _tokenize(self, text):
        """Return the lowercased word tokens of ``text``."""
        return re.findall(r"\b\w+\b", text.lower())

    def fit(self, documents):
        """Build the vocabulary and compute IDF weights.

        ``documents`` may be any iterable of strings; it is consumed once.
        Returns ``self`` so calls can be chained.
        """
        tokenized_docs = [self._tokenize(doc) for doc in documents]

        # Build vocab
        vocab_set = set()
        for tokens in tokenized_docs:
            vocab_set.update(tokens)
        self.vocab = {word: idx for idx, word in enumerate(sorted(vocab_set))}

        # Document frequency: number of documents containing each token.
        # Count from the tokenised list rather than len(documents), so
        # iterables without __len__ (e.g. generators) work too.
        n_docs = len(tokenized_docs)
        df = np.zeros(len(self.vocab))
        for tokens in tokenized_docs:
            for token in set(tokens):
                df[self.vocab[token]] += 1

        # idf = log((N+1)/(df+1)) + 1 for smoothing
        self.idf = np.log((n_docs + 1) / (df + 1)) + 1
        return self

    def transform(self, documents):
        """Transform documents into TF-IDF vectors.

        Call ``fit`` first. Returns a float array of shape
        ``(n_documents, len(self.vocab))``. TF is the raw count divided by
        the document's total token count (out-of-vocabulary tokens still
        count towards that length); each TF is then multiplied by the
        fitted IDF weight.
        """
        tokenized_docs = [self._tokenize(doc) for doc in documents]
        n_terms = len(self.vocab)

        # Allocate up front so the result is always 2-D, even when there
        # are no documents (shape (0, vocab_size)).
        tfidf_matrix = np.zeros((len(tokenized_docs), n_terms))

        for i, tokens in enumerate(tokenized_docs):
            vec = np.zeros(n_terms)

            # Term frequency (raw count)
            for token in tokens:
                col = self.vocab.get(token)
                if col is not None:
                    vec[col] += 1

            # Normalize TF by doc length; skip empty docs to avoid 0/0.
            if tokens:
                vec = vec / len(tokens)

            tfidf_matrix[i] = vec * self.idf

        return tfidf_matrix

    def fit_transform(self, documents):
        """Fit on ``documents`` and return their TF-IDF matrix."""
        # Materialise once: a generator would otherwise be exhausted by
        # fit() and leave nothing for transform().
        documents = list(documents)
        return self.fit(documents).transform(documents)

In [4]:
# Same three-sentence demo corpus as the bag-of-words example.
docs = [
    "Data science is fun",
    "Machine learning is part of data science",
    "Python makes machine learning easy",
]

# Learn vocabulary and IDF weights, then vectorise the same documents.
tfidf = TFIDFVectorizer().fit(docs)
X = tfidf.transform(docs)

# Round to three decimals purely for readable output.
print("Vocabulary:", tfidf.vocab)
print("IDF:", np.round(tfidf.idf, 3))
print("TF-IDF Matrix:\n", np.round(X, 3))


Vocabulary: {'data': 0, 'easy': 1, 'fun': 2, 'is': 3, 'learning': 4, 'machine': 5, 'makes': 6, 'of': 7, 'part': 8, 'python': 9, 'science': 10}
IDF: [1.288 1.693 1.693 1.288 1.288 1.288 1.693 1.693 1.693 1.693 1.288]
TF-IDF Matrix:
 [[0.322 0.    0.423 0.322 0.    0.    0.    0.    0.    0.    0.322]
 [0.184 0.    0.    0.184 0.184 0.184 0.    0.242 0.242 0.    0.184]
 [0.    0.339 0.    0.    0.258 0.258 0.339 0.    0.    0.339 0.   ]]
