# NLP Feature Engineering

## Bag of Words (BoW) - Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: one string per document.
docs = [
    "the cat dog sat on the mat",
    "the dog sat on the log",
]

# Step 1: learn the vocabulary from the corpus.
vectorizer = CountVectorizer()
vectorizer.fit(docs)

# Step 2: encode every document as a row of per-token counts.
bow_matrix = vectorizer.transform(docs)

# Densify the document-term matrix so the whole (tiny) table is printed.
print(bow_matrix.toarray())


[[1 1 0 1 1 1 2]
 [0 1 1 0 1 1 2]]


## Term Frequency-Inverse Document Frequency (TF-IDF)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: one string per document.
docs = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

# Create the vectorizer with its default settings, then fit it on the corpus
# and build the TF-IDF weighted document-term matrix in a single call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=docs)

# Print the matrix in dense form, one row per document.
print(tfidf_matrix.toarray())


[[0.44554752 0.         0.         0.44554752 0.31701073 0.31701073
  0.63402146]
 [0.         0.44554752 0.44554752 0.         0.31701073 0.31701073
  0.63402146]]


## Word2Vec Embedding


In [None]:
from gensim.models import Word2Vec

# Sample sentences, already tokenised into lists of words.
# NOTE(review): the second sentence uses "sit" while the first sentence and the
# samples in the other cells use "sat" — confirm this is intentional.
sentences = [
    ["the", "cat", "sat", "on", "the", "mat"],
    ["the", "dog", "sit", "on", "the", "log"],
]

# Train a Word2Vec model on the toy corpus.
# - min_count=1 keeps every word; with the default threshold this tiny corpus
#   would lose most of its vocabulary.
# - seed is written out and workers=1 is used so that Restart & Run All yields
#   the same vectors: gensim documents that multi-threaded training adds
#   ordering jitter even with a fixed seed. (Across interpreter launches gensim
#   additionally recommends fixing PYTHONHASHSEED.)
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    seed=1,  # gensim's default value, made explicit
    workers=1,
)

# Retrieve and print the vector learned for a word
print(model.wv['sit'])


[-8.2426779e-03  9.2993546e-03 -1.9766092e-04 -1.9672764e-03
  4.6036304e-03 -4.0953159e-03  2.7431143e-03  6.9399667e-03
  6.0654259e-03 -7.5107943e-03  9.3823504e-03  4.6718083e-03
  3.9661205e-03 -6.2435055e-03  8.4599797e-03 -2.1501649e-03
  8.8251876e-03 -5.3620026e-03 -8.1294188e-03  6.8245591e-03
  1.6711927e-03 -2.1985089e-03  9.5136007e-03  9.4938548e-03
 -9.7740470e-03  2.5052286e-03  6.1566923e-03  3.8724565e-03
  2.0227872e-03  4.3050171e-04  6.7363144e-04 -3.8206363e-03
 -7.1402504e-03 -2.0888723e-03  3.9238976e-03  8.8186832e-03
  9.2591504e-03 -5.9759365e-03 -9.4026709e-03  9.7643770e-03
  3.4297847e-03  5.1661171e-03  6.2823449e-03 -2.8042626e-03
  7.3227035e-03  2.8302716e-03  2.8710044e-03 -2.3803699e-03
 -3.1282497e-03 -2.3701417e-03  4.2764368e-03  7.6057913e-05
 -9.5842788e-03 -9.6655441e-03 -6.1481940e-03 -1.2856961e-04
  1.9974159e-03  9.4319675e-03  5.5843508e-03 -4.2906962e-03
  2.7831673e-04  4.9643586e-03  7.6983096e-03 -1.1442233e-03
  4.3234206e-03 -5.81437

## Sentence-BERT


In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m81.9/86.0 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_trans

In [None]:
from sentence_transformers import SentenceTransformer

# Two example sentences to embed
sentences = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

# Load the pre-trained 'bert-base-nli-mean-tokens' Sentence-BERT checkpoint
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode all sentences into embedding vectors
sentence_embeddings = model.encode(sentences)

# For each sentence, print the text followed by the first 10 values of its
# embedding (the full vector is too long to read comfortably)
for sentence, embedding in zip(sentences, sentence_embeddings):
    print(f"Sentence: {sentence}\nEmbedding: {embedding[:10]}...")


.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Sentence: the cat sat on the mat
Embedding: [ 1.3319564e-01 -2.1662936e-03  2.3041981e-01 -4.5560020e-01
 -4.4081384e-01 -2.3928045e-01 -6.0502440e-05  1.5062129e-01
 -7.6896137e-01 -4.3565604e-01]...
Sentence: the dog sat on the log
Embedding: [ 0.08628204  0.22408552  0.02871142  0.03368748 -0.2673451   0.63837165
 -0.599319   -0.30652738 -0.81996435 -0.3138151 ]...


## GloVe Embeddings

GloVe has no dedicated Python library in the way that gensim provides Word2Vec or sentence-transformers provides Sentence-BERT. However, pre-trained GloVe vectors can be loaded through gensim's downloader API:



In [None]:
import gensim.downloader as api

# Fetch the pre-trained "glove-twitter-25" vectors via gensim's downloader
glove_model = api.load("glove-twitter-25")

# Look up the embedding stored for a single word and print it
print(glove_model.get_vector('cat'))


[-0.96419  -0.60978   0.67449   0.35113   0.41317  -0.21241   1.3796
  0.12854   0.31567   0.66325   0.3391   -0.18934  -3.325    -1.1491
 -0.4129    0.2195    0.8706   -0.50616  -0.12781  -0.066965  0.065761
  0.43927   0.1758   -0.56058   0.13529 ]


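Alternatively, if you have already downloaded the GloVe vectors as a text file, you can load them manually, for example with gensim's `KeyedVectors.load_word2vec_format(path, binary=False, no_header=True)` (gensim 4.0 or later).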

# NLP Tasks

## Text Classification

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Sample data: a toy sentiment set
texts = [
    "I love this phone",
    "This movie is great",
    "Dislike this book",
    "This restaurant is awful",
    'I am feeling awful',
]
labels = [1, 1, 0, 0, 0]  # 1 for Positive, 0 for Negative

# Split the RAW texts first, so that the vocabulary below is learned from the
# training portion only and nothing about the test set leaks into the features.
# random_state pins the shuffle; without it a different sample is held out on
# every run and the reported accuracy is not reproducible.
# NOTE: with 5 samples and test_size=0.2 exactly one sample is held out, so the
# accuracy printed below can only ever be 0.0 or 1.0.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Convert text data into numerical count vectors
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)  # learn vocabulary on train only
X_test = vectorizer.transform(texts_test)  # reuse it; unseen words are ignored

# Train a Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Test the classifier on the held-out sample
predictions = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))


Accuracy: 0.0


## Named Entity Recognition

In [22]:
import spacy

# Load spaCy's small English pipeline ("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# Example sentence to analyse
text = "Apple Inc. was founded by Jobs Denver and Steve Wozniak in Cupertino, Dhaka."

# Run the pipeline over the text
doc = nlp(text)

# Print each recognised entity span followed by its label
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")


Apple Inc. ORG
Jobs Denver PERSON
Steve Wozniak PERSON
Cupertino GPE
Dhaka GPE


## Sentiment Analysis

In [23]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the VADER lexicon used by the analyzer
nltk.download("vader_lexicon")

# Build the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Text to score
text = "I absolutely hate this product! The experience was horrible."

# Compute the polarity scores for the text and print the resulting dict
sentiment = sia.polarity_scores(text=text)
print(sentiment)


{'neg': 0.565, 'neu': 0.435, 'pos': 0.0, 'compound': -0.831}


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


## Document Classification

In [26]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Restrict the 20 Newsgroups corpus to four categories
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the train and test subsets with the same category filter
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=subset_name, categories=categories)
    for subset_name in ('train', 'test')
)

# Display the first raw training document
newsgroups_train.data[0]

'From: sd345@city.ac.uk (Michael Collier)\nSubject: Converting images to HP LaserJet III?\nNntp-Posting-Host: hampton\nOrganization: The City University\nLines: 14\n\nDoes anyone know of a good way (standard PC application/PD utility) to\nconvert tif/img/tga files into LaserJet III format.  We would also like to\ndo the same, converting to HPGL (HP plotter) files.\n\nPlease email any response.\n\nIs this the correct group?\n\nThanks in advance.  Michael.\n-- \nMichael Collier (Programmer)                 The Computer Unit,\nEmail: M.P.Collier@uk.ac.city                The City University,\nTel: 071 477-8000 x3769                      London,\nFax: 071 477-8565                            EC1V 0HB.\n'

In [None]:

# Chain TF-IDF feature extraction and a multinomial Naive Bayes classifier into
# a single estimator, and fit it on the raw training texts and their targets
model = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(
    newsgroups_train.data, newsgroups_train.target
)

# Predict a category for every test document
predicted_categories = model.predict(newsgroups_test.data)

# Accuracy: fraction of test documents whose prediction equals the target
accuracy = (predicted_categories == newsgroups_test.target).mean()
print(f"Accuracy: {accuracy}")


Accuracy: 0.8348868175765646
