# 3.1 Bag of Words

3.1.2 Implementing Bag of Words in Python


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text corpus: each string is treated as one document
documents = [
    "Natural language processing is fun",
    "Language models are important in NLP"
]

# Initialize the CountVectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the corpus and build the document-term count matrix
X = vectorizer.fit_transform(documents)

# Convert the sparse result to a dense array so it prints as a readable table
bow_array = X.toarray()

# Get the feature names (vocabulary) that label the columns of bow_array
vocab = vectorizer.get_feature_names_out()

print("Vocabulary:")
print(vocab)

# Use the "\n" newline escape here; "\\n" would print a literal backslash
# followed by "n" instead of leaving a blank line before the heading.
print("\nBag of Words Array:")
print(bow_array)

3.1.4 Practical Example: Text Classification with Bag of Words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Toy corpus with one class label per document
documents = [
    "Natural language processing is fun",
    "Language models are important in NLP",
    "I enjoy learning about artificial intelligence",
    "Machine learning and NLP are closely related",
    "Deep learning is a subset of machine learning"
]
labels = [1, 1, 0, 1, 0]  # 1 for NLP-related, 0 for AI-related

# Build the bag-of-words count matrix for the whole corpus
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Score the held-out documents against their true labels
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

# 3.2 TF-IDF

3.2.3 Implementing TF-IDF in Python

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text corpus: each string is treated as one document
documents = [
    "Natural language processing is fun",
    "Language models are important in NLP",
    "I enjoy learning about artificial intelligence",
    "Machine learning and NLP are closely related",
    "Deep learning is a subset of machine learning"
]

# Initialize the TfidfVectorizer with its default settings
vectorizer = TfidfVectorizer()

# Learn the vocabulary from the corpus and build the document-term TF-IDF matrix
X = vectorizer.fit_transform(documents)

# Convert the sparse result to a dense array so it prints as a readable table
tfidf_array = X.toarray()

# Get the feature names (vocabulary) that label the columns of tfidf_array
vocab = vectorizer.get_feature_names_out()

print("Vocabulary:")
print(vocab)

# Use the "\n" newline escape here; "\\n" would print a literal backslash
# followed by "n" instead of leaving a blank line before the heading.
print("\nTF-IDF Array:")
print(tfidf_array)

3.2.4 Practical Example: Text Classification with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Toy corpus with one class label per document
documents = [
    "Natural language processing is fun",
    "Language models are important in NLP",
    "I enjoy learning about artificial intelligence",
    "Machine learning and NLP are closely related",
    "Deep learning is a subset of machine learning"
]
labels = [1, 1, 0, 1, 0]  # 1 for NLP-related, 0 for AI-related

# Build the TF-IDF feature matrix for the whole corpus
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Hold out 20% of the documents for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Score the held-out documents against their true labels
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:", accuracy)

# 3.3 Word Embeddings (Word2Vec, GloVe)

3.3.2 Word2Vec

In [None]:
!pip install gensim

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
# Tokenizer data used by sent_tokenize/word_tokenize. Recent NLTK releases look
# up the "punkt_tab" resource while older ones use "punkt", so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus
text = "Natural language processing is fun and exciting. Language models are important in NLP. I enjoy learning about artificial intelligence. Machine learning and NLP are closely related. Deep learning is a subset of machine learning."

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Tokenize each sentence into words (no lowercasing is applied here, so
# "language" and "Language" remain separate tokens)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Train a Word2Vec model using the Skip-Gram method (sg=1); min_count=1 keeps
# words that appear only once, which this tiny corpus needs
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, sg=1, min_count=1)

# Get the vector representation of the word "language"
vector = model.wv['language']
print("Vector representation of 'language':")
print(vector)

# Find the most similar words to "language"
similar_words = model.wv.most_similar('language')
# Use the "\n" newline escape here; "\\n" would print a literal backslash
# followed by "n" instead of leaving a blank line before the heading.
print("\nMost similar words to 'language':")
print(similar_words)

3.3.3 GloVe (Global Vectors for Word Representation)

In [None]:
import gensim.downloader as api

# Load pre-trained GloVe embeddings through the gensim downloader
glove_model = api.load("glove-wiki-gigaword-100")

# Get the vector representation of the word "language"
vector = glove_model['language']
print("Vector representation of 'language':")
print(vector)

# Find the most similar words to "language"
similar_words = glove_model.most_similar('language')
# Use the "\n" newline escape here; "\\n" would print a literal backslash
# followed by "n" instead of leaving a blank line before the heading.
print("\nMost similar words to 'language':")
print(similar_words)

# 3.4 Introduction to BERT Embeddings


3.4.3 Implementing BERT Embeddings in Python

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Sentence to embed
text = "Natural Language Processing is fascinating."

# Tokenizer and encoder are loaded from the same pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the sentence as PyTorch tensors
inputs = tokenizer(text, return_tensors='pt')

# Run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Position 0 of each sequence holds the [CLS] token, used here to represent the whole input
cls_embeddings = outputs.last_hidden_state[:, 0]

print("BERT Embeddings for the text:")
print(cls_embeddings)

3.4.4 Fine-tuning BERT for Specific Tasks

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

# Sample text corpus and labels
documents = [
    "Natural Language Processing is fascinating.",
    "Machine learning models are essential for AI.",
    "I love learning about deep learning.",
    "NLP and AI are closely related fields.",
    "Artificial Intelligence is transforming industries."
]
# 1 for NLP-related, 0 for AI-related. "I love learning about deep learning."
# does not mention NLP, so it is labelled 0 in line with that scheme.
labels = [1, 0, 0, 1, 0]

# Load pre-trained BERT tokenizer and model for sequence classification
# (num_labels=2 matches the two classes used in `labels`)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenize the whole corpus in one call, with padding and truncation enabled,
# and return PyTorch tensors
inputs = tokenizer(documents, padding=True, truncation=True, return_tensors='pt')

# Create a dataset class
class TextDataset(torch.utils.data.Dataset):
    """Dataset that pairs encoded inputs with one label per example.

    `inputs` is a mapping from field name to an indexable batch of values;
    `labels` is a sequence of labels that is converted to a tensor.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = torch.tensor(labels)

    def __len__(self):
        # One example per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Take row `idx` from every input field, then attach its label
        sample = {}
        for field_name, field_values in self.inputs.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = self.labels[idx]
        return sample

# Split by document index so the encoded tensors and the labels stay aligned
train_indices, test_indices = train_test_split(
    range(len(documents)),
    test_size=0.2,
    random_state=42,
)

# Select the matching rows of every encoded field, and the matching labels
train_inputs = {key: tensor[train_indices] for key, tensor in inputs.items()}
test_inputs = {key: tensor[test_indices] for key, tensor in inputs.items()}
train_labels = [labels[idx] for idx in train_indices]
test_labels = [labels[idx] for idx in test_indices]

# Wrap each split in the dataset class the Trainer consumes
train_dataset = TextDataset(train_inputs, train_labels)
test_dataset = TextDataset(test_inputs, test_labels)

# Training configuration: output/log locations, epochs, batch sizes, and schedule settings
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=10,
    weight_decay=0.01,
    logging_steps=10,
)

# The Trainer ties together the model, its configuration, and both datasets
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune on the training split
trainer.train()

# Evaluate on the held-out split and show the returned metrics
results = trainer.evaluate()
print("Evaluation results:")
print(results)

# Chapter-3 Assignments

Exercise 1: Bag of Words

In [None]:
# Corpus for Exercise 1: three short sentences, each treated as one document
documents = [
    "Text processing is important for NLP.",
    "Bag of Words is a simple text representation method.",
    "Feature engineering is essential in machine learning."
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Exercise corpus: three short sentences, one document each
documents = [
    "Text processing is important for NLP.",
    "Bag of Words is a simple text representation method.",
    "Feature engineering is essential in machine learning."
]

# Learn the vocabulary and count term occurrences in one step
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)

# Dense view of the counts, plus the vocabulary that labels its columns
bow_array = X.toarray()
vocab = vectorizer.get_feature_names_out()

# sep="\n" puts each heading on its own line above the value it introduces
print("Vocabulary:", vocab, sep="\n")
print("\nBag of Words Array:", bow_array, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  


Exercise 2: TF-IDF

In [None]:
# Corpus for Exercise 2: three short sentences, each treated as one document
documents = [
    "Natural language processing is fun.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Exercise corpus: three short sentences, one document each
documents = [
    "Natural language processing is fun.",
    "Language models are important in NLP.",
    "Machine learning and NLP are closely related."
]

# Learn the vocabulary and compute the TF-IDF weights in one step
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Dense view of the weights, plus the vocabulary that labels its columns
tfidf_array = X.toarray()
vocab = vectorizer.get_feature_names_out()

# sep="\n" puts each heading on its own line above the value it introduces
print("Vocabulary:", vocab, sep="\n")
print("\nTF-IDF Array:", tfidf_array, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  


Exercise 3: Word2Vec

In [None]:
# Corpus for Exercise 3, written one sentence per line; adjacent string
# literals are joined into a single string
text = (
    "Natural language processing is fun and exciting. "
    "Language models are important in NLP. "
    "Machine learning and NLP are closely related."
)

In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
# Tokenizer data used by sent_tokenize/word_tokenize. Recent NLTK releases look
# up the "punkt_tab" resource while older ones use "punkt", so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sample text corpus
text = "Natural language processing is fun and exciting. Language models are important in NLP. Machine learning and NLP are closely related."

# Tokenize the text into sentences
sentences = sent_tokenize(text)

# Tokenize each sentence into words (no lowercasing is applied here, so the
# lookup below must use the exact token "NLP")
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

# Train a Word2Vec model using the Skip-Gram method (sg=1); min_count=1 keeps
# words that appear only once, which this tiny corpus needs
model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, sg=1, min_count=1)

# Get the vector representation of the word "NLP"
vector = model.wv['NLP']
print("Vector representation of 'NLP':")
print(vector)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  


Exercise 4: GloVe

In [None]:
import gensim.downloader as api

# Fetch the pre-trained GloVe vectors through the gensim downloader
glove_model = api.load("glove-wiki-gigaword-100")

# Look up the embedding for "machine" and its nearest neighbours
vector = glove_model['machine']
similar_words = glove_model.most_similar('machine')

# sep="\n" puts each heading on its own line above the value it introduces
print("Vector representation of 'machine':", vector, sep="\n")
print("\nMost similar words to 'machine':", similar_words, sep="\n")

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  


Exercise 5: BERT Embeddings

In [None]:
# Sentence to embed in Exercise 5
text = "Transformers are powerful models for NLP tasks."

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Sentence to embed
text = "Transformers are powerful models for NLP tasks."

# Tokenizer and encoder are loaded from the same pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Encode the sentence as PyTorch tensors
inputs = tokenizer(text, return_tensors='pt')

# Run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)

# Position 0 of each sequence holds the [CLS] token, used here to represent the whole input
cls_embeddings = outputs.last_hidden_state[:, 0]

print("BERT Embeddings for the text:")
print(cls_embeddings)

**Explain the code snippet above in detail.**

___

**Type Your Response Below:**  
