### Natural language processing

In [None]:
#%pip install transformers torch spacy blis

In [None]:
# Install the libraries imported by the cells below (nltk, gensim, spacy).
# If pip reports updated packages, restart the kernel before continuing.
%pip install nltk gensim spacy

^C
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Sample sentence that the segmentation and cleaning cells below operate on
text = "Please add eggs, milk, and bread to my shopping list."
text

#### Segmentation

In [None]:
# NLTK plus the sentence tokenizer used in the next cell
import nltk
from nltk.tokenize import sent_tokenize

# Fetch the Punkt resources (both the legacy and the *_tab package names)
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

# Show the directories NLTK searches for downloaded data
print(nltk.data.path)

In [None]:
# Split text into sentences
# (with the single sample sentence this should give a one-element list)
sentences = sent_tokenize(text)
sentences

In [None]:
# Peek at the first sentence; the punctuation-removal cell works on this one
sentences[0]

In [None]:
# Punctuation removal
import re

# Replace every punctuation character with a space. "Punctuation" here means
# anything that is not a letter, digit or whitespace, plus the underscore
# (which \w would otherwise keep). Using \w instead of the ASCII range
# [a-zA-Z0-9] keeps accented and other non-ASCII letters intact.
# Note: this rebinds `text` to the cleaned sentence, which the tokenization
# and Word2Vec cells further down read.
text = re.sub(r"[^\w\s]|_", " ", sentences[0])
text

#### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

In [None]:
# Split the punctuation-free text into individual word tokens
words = word_tokenize(text)
print(words)

#### Removal of stop words

In [None]:
# Download NLTK's stop-word lists, then import the corpus reader that serves them
nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Remove stop words
# Build the lookup set once: calling stopwords.words() inside the comprehension
# re-reads the list for every token and does a linear scan each time.
english_stopwords = set(stopwords.words("english"))

# NLTK's stop-word list is lowercase, so compare on the lowercased token;
# otherwise capitalized stop words such as "The" or "And" slip through.
# The surviving tokens keep their original casing.
words = [w for w in words if w.lower() not in english_stopwords]
print(words)

In [None]:
# Have a look at the stop words in NLTK's corpus. Note this prints the
# *Spanish* list; the filtering cell above used the "english" list.
print(stopwords.words("spanish"))

#### Stemming and lemmatization

In [None]:
# WordNet data for the lemmatization cells below; omw-1.4 is fetched alongside it
for corpus_name in ('wordnet', 'omw-1.4'):
    nltk.download(corpus_name)

In [None]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Apply the Porter stemmer to every remaining token, reusing a single
# stemmer instance instead of constructing one per word
stemmer = PorterStemmer()
stemmed = list(map(stemmer.stem, words))
print(stemmed)

In [None]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Map every token to its WordNet root form with one shared lemmatizer.
# lemmatize() is called without a part-of-speech argument, so NLTK's default
# is used for every token.
lemmatizer = WordNetLemmatizer()
lemmatized = list(map(lemmatizer.lemmatize, words))
print(lemmatized)

In [None]:
# Side-by-side comparison of stemming and lemmatization on a second word list
words2 = ['wait', 'waiting' , 'studies', 'studying', 'computers']

# One instance of each tool is enough for the whole list
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

# Stemming: cut each word down to its stem
stemmed = [porter_stemmer.stem(token) for token in words2]
print("Stemming output: {}".format(stemmed))

# Lemmatization: look up each word's root form
lemmatized = [wordnet_lemmatizer.lemmatize(token) for token in words2]
print("Lemmatization output: {}".format(lemmatized))

#### Part of speech tagging

In [None]:
# Models for part-of-speech tagging (legacy and *_eng package names) plus the
# named-entity chunker
for resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker',
):
    nltk.download(resource)

In [None]:
from nltk import pos_tag

In [None]:
# tag each word with part of speech
# Tag each remaining (stop-word-filtered) token with its part of speech;
# the tag abbreviations are explained in the glossary cell below
pos_tag(words)

In [None]:
"""
POS

CC: Coordinating conjunction
CD: Cardinal number
DT: Determiner
EX: Existential "there"
FW: Foreign word
IN: Preposition or subordinating conjunction
JJ: Adjective
JJR and JJS: Comparative and superlative adjective
LS: List item marker
MD: Modal
NN: Singular or mass noun
NNS, NNP, NNPS: Plural noun, singular proper noun, plural proper noun
PDT: Predeterminer
WRB: Wh-adverb
WP$: Possessive wh-pronoun
WP: Wh-pronoun
WDT: Wh-determiner
VBZ: Verb, 3rd person singular present
VBP, VBN, VBG, VBD, VB: Verb forms (non-3rd-person singular present, past participle, gerund/present participle, past tense, base form)
UH: Interjection
TO: The word "to"
RP: Particle
RBS, RB, RBR: Adverb (superlative, base, comparative)
PRP, PRP$: Personal pronoun, possessive pronoun

"""

#### Embeddings 

In [22]:
from gensim.models import Word2Vec

# Tokenize the sentence
# NOTE: `text` must still be the cleaned shopping-list sentence here. Later
# cells rebind `text`, so re-running this cell after them would make the
# 'eggs' lookup below fail with a KeyError.
tokens = word_tokenize(text.lower())

# Train a Word2Vec model (normally trained on a large corpus; here for demo purposes)
# workers=1 and an explicit seed make the vectors repeatable from run to run;
# with several worker threads the training order (and so the result) can vary.
# Full reproducibility across interpreter launches may additionally require
# PYTHONHASHSEED to be set.
model = Word2Vec([tokens], vector_size=100, window=5, min_count=1, workers=1, seed=42)

# Look up the 100-dimensional vector (vector_size above) learned for one token
word_embedding = model.wv['eggs']
print("Embedding for 'eggs':", word_embedding)

Embedding for 'eggs': [ 8.1681199e-03 -4.4430327e-03  8.9854337e-03  8.2536647e-03
 -4.4352221e-03  3.0310510e-04  4.2744912e-03 -3.9263200e-03
 -5.5599655e-03 -6.5123225e-03 -6.7073823e-04 -2.9592158e-04
  4.4630850e-03 -2.4740540e-03 -1.7260908e-04  2.4618758e-03
  4.8675989e-03 -3.0808449e-05 -6.3394094e-03 -9.2608072e-03
  2.6657581e-05  6.6618943e-03  1.4660227e-03 -8.9665223e-03
 -7.9386048e-03  6.5519023e-03 -3.7856805e-03  6.2549924e-03
 -6.6810320e-03  8.4796622e-03 -6.5163244e-03  3.2880199e-03
 -1.0569858e-03 -6.7875278e-03 -3.2875966e-03 -1.1614120e-03
 -5.4709399e-03 -1.2113475e-03 -7.5633135e-03  2.6466595e-03
  9.0701487e-03 -2.3772502e-03 -9.7651005e-04  3.5135616e-03
  8.6650876e-03 -5.9218528e-03 -6.8875779e-03 -2.9329848e-03
  9.1476962e-03  8.6626766e-04 -8.6784009e-03 -1.4469790e-03
  9.4794659e-03 -7.5494875e-03 -5.3580985e-03  9.3165627e-03
 -8.9737261e-03  3.8259076e-03  6.6544057e-04  6.6607012e-03
  8.3127534e-03 -2.8507852e-03 -3.9923131e-03  8.8979173e-03
  

#### Contextual Embeddings

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the BERT tokenizer and model
# (note: this rebinds `model`, replacing the Word2Vec model from the previous section)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the text
text = "Climate change affects coral reefs."
inputs = tokenizer(text, return_tensors="pt")

# Pass the inputs through BERT to get embeddings.
# This is inference only, so disable autograd: no graph is recorded and the
# printed tensors carry no grad_fn.
with torch.no_grad():
    outputs = model(**inputs)

# Extract the embeddings for each token
embeddings = outputs.last_hidden_state  # Shape: [1, sequence_length, hidden_size]
print("Shape of embeddings:", embeddings.shape)

# Get the embedding for the word "coral": find the position of its vocabulary
# id in the encoded input and pick that row of the hidden states.
# list.index raises ValueError if the id does not occur in the input.
token_id = tokenizer.convert_tokens_to_ids("coral")
coral_embedding = embeddings[0, inputs['input_ids'][0].tolist().index(token_id)]
print("Embedding for 'coral':", coral_embedding)

Collecting transformers
  Downloading transformers-4.46.1-py3-none-any.whl.metadata (44 kB)
Collecting torch
  Downloading torch-2.5.1-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting numpy>=1.17 (from transformers)
  Downloading numpy-2.1.3-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting packaging>=20.0 (from transformers)
  Downloading packaging-24.1-py3-none-any.whl.metadata (3.2 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.9.11-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting safetensor


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: C:\Users\Admin\AppData\Local\Programs\Python\Python312\python.exe -m pip install --upgrade pip


ModuleNotFoundError: No module named 'transformers'

#### Named entity recognition

In [29]:
import spacy

# Load the pre-trained SpaCy model
# NOTE(review): no cell shown here installs "en_core_web_sm"; presumably it has
# to be downloaded beforehand (e.g. `python -m spacy download en_core_web_sm`) — confirm.
nlp = spacy.load("en_core_web_sm")

# Define your text
# (this rebinds `text`, which earlier cells also use)
text = "Apple is looking at buying a startup in San Francisco for $1 billion."

# Process the text with the SpaCy model
doc = nlp(text)

# Extract and display named entities: one line per entity, text followed by its label
for ent in doc.ents:
    print(ent.text, ent.label_)


ValueError: BLIS support requires blis: pip install blis