Bag-of-Words (BoW)

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
documents = [
    "Barack Obama was the 44th President of the United States",
    "The President lives in the White House",
    "The United States has a strong economy"
]

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the documents into a Bag of Words
bow_matrix = vectorizer.fit_transform(documents)

# Get the feature names (unique words in the corpus)
feature_names = vectorizer.get_feature_names_out()

# Convert the Bag of Words matrix into an array
bow_array = bow_matrix.toarray()

# Display the Bag of Words
print("Feature Names (Words):", feature_names)
print("\nBag of Words Representation:")
print(bow_array)

Feature Names (Words): ['44th' 'barack' 'economy' 'has' 'house' 'in' 'lives' 'obama' 'of'
 'president' 'states' 'strong' 'the' 'united' 'was' 'white']

Bag of Words Representation:
[[1 1 0 0 0 0 0 1 1 1 1 0 2 1 1 0]
 [0 0 0 0 1 1 1 0 0 1 0 0 2 0 0 1]
 [0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0]]


Term Frequency-Inverse Document Frequency (TF-IDF)

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text data
documents = [
    "Barack Obama was the 44th President of the United States",
    "The President lives in the White House",
    "The United States has a strong economy"
]

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the documents into a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (unique words in the corpus)
feature_names = vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix into an array
tfidf_array = tfidf_matrix.toarray()

# Display the TF-IDF representation
print("Feature Names (Words):", feature_names)
print("\nTF-IDF Representation:")
print(tfidf_array)


Feature Names (Words): ['44th' 'barack' 'economy' 'has' 'house' 'in' 'lives' 'obama' 'of'
 'president' 'states' 'strong' 'the' 'united' 'was' 'white']

TF-IDF Representation:
[[0.35070436 0.35070436 0.         0.         0.         0.
  0.         0.35070436 0.35070436 0.2667197  0.2667197  0.
  0.41426329 0.2667197  0.35070436 0.        ]
 [0.         0.         0.         0.         0.40914568 0.40914568
  0.40914568 0.         0.         0.31116583 0.         0.
  0.48329606 0.         0.         0.40914568]
 [0.         0.         0.4711101  0.4711101  0.         0.
  0.         0.         0.         0.         0.35829137 0.4711101
  0.27824521 0.35829137 0.         0.        ]]


 N-grams

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Örnek metinler
documents = [
    "Barack Obama was the 44th President of the United States",
    "The President lives in the White House",
    "The United States has a strong economy"
]

# **Unigram (1-gram)**
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_matrix = unigram_vectorizer.fit_transform(documents)
unigram_features = unigram_vectorizer.get_feature_names_out()
unigram_array = unigram_matrix.toarray()

print("Unigram Features:", unigram_features)
print("\nUnigram Representation:")
print(unigram_array)

print("\n" + "="*80 + "\n")

# **Bigram (2-gram)**
bigram_vectorizer = CountVectorizer(ngram_range=(2, 2))
bigram_matrix = bigram_vectorizer.fit_transform(documents)
bigram_features = bigram_vectorizer.get_feature_names_out()
bigram_array = bigram_matrix.toarray()

print("Bigram Features:", bigram_features)
print("\nBigram Representation:")
print(bigram_array)

print("\n" + "="*80 + "\n")

# **Trigram (3-gram)**
trigram_vectorizer = CountVectorizer(ngram_range=(3, 3))
trigram_matrix = trigram_vectorizer.fit_transform(documents)
trigram_features = trigram_vectorizer.get_feature_names_out()
trigram_array = trigram_matrix.toarray()

print("Trigram Features:", trigram_features)
print("\nTrigram Representation:")
print(trigram_array)


Unigram Features: ['44th' 'barack' 'economy' 'has' 'house' 'in' 'lives' 'obama' 'of'
 'president' 'states' 'strong' 'the' 'united' 'was' 'white']

Unigram Representation:
[[1 1 0 0 0 0 0 1 1 1 1 0 2 1 1 0]
 [0 0 0 0 1 1 1 0 0 1 0 0 2 0 0 1]
 [0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 0]]


Bigram Features: ['44th president' 'barack obama' 'has strong' 'in the' 'lives in'
 'obama was' 'of the' 'president lives' 'president of' 'states has'
 'strong economy' 'the 44th' 'the president' 'the united' 'the white'
 'united states' 'was the' 'white house']

Bigram Representation:
[[1 1 0 0 0 1 1 0 1 0 0 1 0 1 0 1 1 0]
 [0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 0 1]
 [0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0]]


Trigram Features: ['44th president of' 'barack obama was' 'has strong economy'
 'in the white' 'lives in the' 'obama was the' 'of the united'
 'president lives in' 'president of the' 'states has strong'
 'the 44th president' 'the president lives' 'the united states'
 'the white house' 'united states has' 'was the

Word Embeddings

In [7]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scipy, gensim
  Attempting uninstall: scipy
    Found existing installation: scipy 1.14.1
    Uninstalling scipy-1.14.1:
      Successfully 

In [10]:
import nltk
import gensim
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import word_tokenize

# Download the 'punkt_tab' data
nltk.download('punkt_tab')

# Örnek metinler
documents = [
    "Barack Obama was the 44th President of the United States",
    "The President lives in the White House",
    "The United States has a strong economy"
]

# NLTK tokenizasyon
tokenized_text = [word_tokenize(doc.lower()) for doc in documents]

# Bigram ve Trigram modelleri oluştur
bigram = Phrases(tokenized_text, min_count=1, threshold=2)
trigram = Phrases(bigram[tokenized_text], min_count=1, threshold=2)

bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# Bigram ve Trigramlı cümleleri oluştur
bigram_text = [bigram_mod[sent] for sent in tokenized_text]
trigram_text = [trigram_mod[bigram_mod[sent]] for sent in tokenized_text]

# Unigram (tekil kelimeler) için Word2Vec modeli
unigram_model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=0)

# Bigram için Word2Vec modeli
bigram_model = Word2Vec(bigram_text, vector_size=100, window=5, min_count=1, sg=0)

# Trigram için Word2Vec modeli
trigram_model = Word2Vec(trigram_text, vector_size=100, window=5, min_count=1, sg=0)

# Belirli bir kelimenin vektörünü al (örneğin "president")
word = "president"
if word in unigram_model.wv:
    print(f"\n🔹 Unigram Word2Vec Vektörü ({word}):\n", unigram_model.wv[word])

if word in bigram_model.wv:
    print(f"\n🔹 Bigram Word2Vec Vektörü ({word}):\n", bigram_model.wv[word])

if word in trigram_model.wv:
    print(f"\n🔹 Trigram Word2Vec Vektörü ({word}):\n", trigram_model.wv[word])

# Kelimeye en yakın 3 kelimeyi bul
print("\n🔹 En Benzer Kelimeler:")
print("Unigram:", unigram_model.wv.most_similar(word, topn=3))
print("Bigram:", bigram_model.wv.most_similar(word, topn=3))
print("Trigram:", trigram_model.wv.most_similar(word, topn=3))

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



🔹 Unigram Word2Vec Vektörü (president):
 [-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-

In [11]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Sample text data
text = [
    "The cat sat on the mat",
    "The dog barked at the cat",
    "The cat chased the mouse",
    "The dog chased the cat",
]

# Tokenize the sentences into words
tokenized_text = [word_tokenize(sentence.lower()) for sentence in text]

# Train a Word2Vec model on the tokenized text
model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=1, sg=0)

# Get the word embeddings for a specific word
cat_vector = model.wv['cat']

# Print the word embedding for 'cat'
print("Word Embedding for 'cat':")
print(cat_vector)

# Find words most similar to 'cat'
similar_words = model.wv.most_similar('cat', topn=3)
print("\nWords most similar to 'cat':")
print(similar_words)

Word Embedding for 'cat':
[-8.6196875e-03  3.6657380e-03  5.1898835e-03  5.7419385e-03
  7.4669183e-03 -6.1676754e-03  1.1056137e-03  6.0472824e-03
 -2.8400505e-03 -6.1735227e-03 -4.1022300e-04 -8.3689485e-03
 -5.6000124e-03  7.1045388e-03  3.3525396e-03  7.2256695e-03
  6.8002474e-03  7.5307419e-03 -3.7891543e-03 -5.6180597e-04
  2.3483764e-03 -4.5190323e-03  8.3887316e-03 -9.8581640e-03
  6.7646410e-03  2.9144168e-03 -4.9328315e-03  4.3981876e-03
 -1.7395747e-03  6.7113843e-03  9.9648498e-03 -4.3624435e-03
 -5.9933780e-04 -5.6956373e-03  3.8508223e-03  2.7866268e-03
  6.8910765e-03  6.1010956e-03  9.5384968e-03  9.2734173e-03
  7.8980681e-03 -6.9895042e-03 -9.1558648e-03 -3.5575271e-04
 -3.0998408e-03  7.8943167e-03  5.9385742e-03 -1.5456629e-03
  1.5109634e-03  1.7900408e-03  7.8175711e-03 -9.5101865e-03
 -2.0553112e-04  3.4691966e-03 -9.3897223e-04  8.3817719e-03
  9.0107834e-03  6.5365066e-03 -7.1162102e-04  7.7104042e-03
 -8.5343346e-03  3.2071066e-03 -4.6379971e-03 -5.0889552e-0

 Contextual Word Embeddings

In [12]:
from transformers import BertModel, BertTokenizer
import torch

# BERT modelini ve tokenizer'ı yükle
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Örnek metinler
documents = [
    "Barack Obama was the 44th President of the United States",
    "The President lives in the White House",
    "The United States has a strong economy"
]

# Tokenizasyon ve tensor dönüşümü
input_ids = tokenizer(documents, padding=True, truncation=True, return_tensors="pt")

# BERT modelini çalıştır ve kelime gömmeleri al
with torch.no_grad():
    outputs = model(**input_ids)
    last_hidden_states = outputs.last_hidden_state  # [batch_size, max_seq_len, 768]

# Tokenleri geri çevir (Her kelimeye karşılık gelen embedding'leri görmek için)
tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in input_ids["input_ids"]]

# Sonuçları yazdır
for i, doc_tokens in enumerate(tokens):
    print(f"\n🔹 Cümle {i+1}: {documents[i]}")
    for token, embedding in zip(doc_tokens, last_hidden_states[i]):
        print(f"Token: {token} -> Embedding: {embedding[:10]}...")  # İlk 10 değer


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]


🔹 Cümle 1: Barack Obama was the 44th President of the United States
Token: [CLS] -> Embedding: tensor([-0.1915,  0.0634,  0.1325,  0.2343,  0.1480,  0.1961,  0.0894,  0.1599,
        -0.1584, -0.4145])...
Token: barack -> Embedding: tensor([-0.3747,  0.3885,  0.0667, -0.2749,  0.4214, -0.0536,  0.6001,  0.1300,
         0.0377, -1.1323])...
Token: obama -> Embedding: tensor([-0.5493,  0.1698,  0.4220, -0.2409,  0.2777, -0.0703,  0.4664,  0.4082,
        -0.7677, -1.1666])...
Token: was -> Embedding: tensor([-1.0784,  0.2176, -0.2877,  0.5525, -0.2535, -0.0989,  0.0915,  0.6725,
         0.1130, -0.0103])...
Token: the -> Embedding: tensor([-1.4241,  0.3246, -0.2715,  0.1997, -0.2132, -0.4460, -0.0388,  0.6238,
        -0.0474,  0.3303])...
Token: 44th -> Embedding: tensor([-0.7871,  0.5098,  0.6867, -0.1761, -0.4429,  0.1356,  0.8269, -0.1106,
         1.3319, -0.1486])...
Token: president -> Embedding: tensor([-1.2219,  0.8395, -0.4667, -0.3563, -0.2436, -0.7767,  0.3762,  1.0753,
  