In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.util import bigrams
import numpy as np
import nltk

# Ensure NLTK dependencies are available
nltk.download('punkt')

# Sample corpus
documents = [
    "I love programming.",
    "Python programming is fun.",
    "I love Python.",
    "Programming challenges improve skills.",
    "Python is popular for data science."
]

# 1. One-Hot Encoding (OHE)
print("\n1. One-Hot Encoding (OHE):")
unique_words = set(word for doc in documents for word in nltk.word_tokenize(doc.lower()))
print("unique_words-",unique_words)
word_to_index = {word: idx for idx, word in enumerate(unique_words)}
print("word_to_index-",word_to_index)
print("Vocabulary:", word_to_index)

ohe = []
for doc in documents:
    tokens = nltk.word_tokenize(doc.lower())
    vector = [1 if word in tokens else 0 for word in word_to_index]
    ohe.append(vector)
print("OHE Vectors:")
for vector in ohe:
    print(vector)

# 2. Bag of Words (BOW)
print("\n2. Bag of Words (BOW):")
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(documents)
print("Vocabulary:", bow_vectorizer.get_feature_names_out())
print("BOW Matrix:\n", bow_matrix.toarray())

# 3. Bigrams
print("\n3. Bigrams:")
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = bigram_vectorizer.fit_transform(documents)
print("Bigrams:", bigram_vectorizer.get_feature_names_out())
print("Bigram Matrix:\n", bigram_matrix.toarray())

# 4. TF-IDF
print("\n4. TF-IDF:")
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:\n", np.round(tfidf_matrix.toarray(), 2))

# 5. Word2Vec
print("\n5. Word2Vec:")
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in documents]
word2vec_model = Word2Vec(sentences=tokenized_docs, vector_size=5, window=2, min_count=1, sg=0)
print("Word Vectors:")
for word in word_to_index.keys():
    print(f"{word}: {np.round(word2vec_model.wv[word], 2)}")



1. One-Hot Encoding (OHE):
unique_words- {'science', 'programming', 'challenges', 'is', 'data', 'fun', 'i', 'popular', 'python', 'for', 'improve', 'skills', '.', 'love'}
word_to_index- {'science': 0, 'programming': 1, 'challenges': 2, 'is': 3, 'data': 4, 'fun': 5, 'i': 6, 'popular': 7, 'python': 8, 'for': 9, 'improve': 10, 'skills': 11, '.': 12, 'love': 13}
Vocabulary: {'science': 0, 'programming': 1, 'challenges': 2, 'is': 3, 'data': 4, 'fun': 5, 'i': 6, 'popular': 7, 'python': 8, 'for': 9, 'improve': 10, 'skills': 11, '.': 12, 'love': 13}
OHE Vectors:
[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1]
[0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0]
[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1]
[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]
[1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0]

2. Bag of Words (BOW):
Vocabulary: ['challenges' 'data' 'for' 'fun' 'improve' 'is' 'love' 'popular'
 'programming' 'python' 'science' 'skills']
BOW Matrix:
 [[0 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 1 0 1 0 0 1 1 0 0]
 [0 0 0 0 0 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KhushiAnaya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl.metadata (8.2 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ---------------------------------------- 60.6/60.6 kB 1.6 MB/s eta 0:00:00
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.0-cp311-cp311-win_amd64.whl.metadata (6.5 kB)
Downloading gensim-4.3.3-cp311-cp311-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 1.9 MB/s eta 0:00:13
   ---------------------------------------- 0.2/24.0 MB 2.6 MB/s eta 0:00:10
   -----------

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.



   ----------------------- ---------------- 26.7/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 26.7/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 26.7/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 26.9/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 26.9/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 27.1/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 27.1/46.2 MB 1.9 MB/s eta 0:00:11
   ----------------------- ---------------- 27.2/46.2 MB 1.8 MB/s eta 0:00:11
   ----------------------- ---------------- 27.3/46.2 MB 1.8 MB/s eta 0:00:11
   ----------------------- ---------------- 27.4/46.2 MB 1.8 MB/s eta 0:00:11
   ----------------------- ---------------- 27.5/46.2 MB 1.8 MB/s eta 0:00:11
   ----------------------- ---------------- 27.6/46.2 MB 1.9 MB/s eta 0:00:10
   ----------------------- ---------------- 27.7/46.2 MB 1.8 MB