In [None]:
# 1. Basic string operations
# `sentence` is the running example that the following cells operate on.
sentence = "Iowa State University, located in Ames, is a renowned public research university."
print("[example sentence]: " + sentence, end="\n\n")

# 1.1 Case conversion
# str.upper() / str.lower() return new strings; `sentence` itself is not modified.
uppercase, lowercase = sentence.upper(), sentence.lower()
for label, converted in (("Uppercase:", uppercase), ("Lowercase:", lowercase)):
    print(label, converted)
print()

In [None]:
# 1.2 Split into words & join them back
# str.split() with no argument splits on runs of whitespace, so punctuation
# stays attached to the neighbouring word.
words = sentence.split()
print("Words in the sentence:", words, end="\n\n")

# str.join() glues the pieces back together with the separator it is called on.
separator = " "
joined_sentence = separator.join(words)
print("Joined Sentence:", joined_sentence, end="\n\n")

In [None]:
# 1.3 Find substrings & replace substrings
# str.find() yields the index of the first occurrence of the substring,
# or -1 when the substring does not occur.
target = "Ames"
index = sentence.find(target)
print(f"'Ames' found at index: {index}", end="\n\n")

# str.replace() returns a new string; `sentence` is left untouched.
modified_sentence = sentence.replace(target, "Ames, Iowa")
print("Modified Sentence:", modified_sentence, end="\n\n")

In [None]:
# 1.4 Access characters by index
# Index 0 is the first character; negative indices count from the end of the string.
first_char, last_char = sentence[0], sentence[-1]
print(f"First Character: {first_char}")
print(f"Last Character: {last_char}", end="\n\n")

# Slices are end-exclusive: [:21] covers indices 0..20, i.e. "Iowa State University"
# (blank spaces count as characters too).
substring = sentence[:21]
print("Substring (0:21):", substring, end="\n\n")

In [None]:
# 2. NLTK. "The Natural Language Toolkit"
import nltk

# Ensure tokenizer resources are available.
# Older NLTK releases load the 'punkt' models, whereas newer releases (3.9 and later)
# look up 'punkt_tab' instead; fetching both keeps this cell working on either version.
# nltk.download() skips resources that are already up to date.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# 2.1 Tokenization
# Word-level: punctuation marks become tokens of their own
from nltk.tokenize import word_tokenize
tokens = word_tokenize(sentence)
print("Word Tokens:", tokens)
print()

# Sentence-level: split a longer text into its sentences
from nltk.tokenize import sent_tokenize
long_introduction = "Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858. \
Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory. \
With over 36,000 students, ISU fosters innovation and global impact."
sentences = sent_tokenize(long_introduction)
print("Sentence Tokens:", sentences)
print()

In [None]:
# 2.2 Stop Words Removal & Frequency Distribution
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# A set has no defined order, so sort it before printing to keep the output reproducible.
print("NLTK stopwords:", sorted(stop_words))
print()
# Compare in lowercase so capitalised stop words (e.g. "The") are removed as well.
filtered_words = [word for word in word_tokenize(sentence) if word.lower() not in stop_words]
print("Filtered Words (No Stop Words):", filtered_words)
print()

from nltk.probability import FreqDist
# Lowercase first so "University" and "university" are counted as the same word.
filtered_words = [word.lower() for word in filtered_words]
# word_tokenize emits punctuation marks ("," and ".") as separate tokens; keep only
# tokens containing at least one letter or digit so punctuation does not dominate the counts.
content_words = [word for word in filtered_words if any(ch.isalnum() for ch in word)]
freq_dist = FreqDist(content_words)
print("Frequency Distribution:")
print(freq_dist.most_common(5))
print()

In [None]:
# 3. spaCy
sentence = "Iowa State University, located in Ames, is a renowned public research university."

import spacy
# Load the small English pipeline. spaCy also provides pre-trained pipelines for
# several other languages, which makes it quick to start analysing real-world text.
nlp = spacy.load("en_core_web_sm")
# Running the pipeline produces a Doc that the following cells (3.2 - 3.6) inspect.
doc = nlp(sentence)

# 3.1 Tokenization
print("Tokens:")
token_texts = [tok.text for tok in doc]
for text in token_texts:
    print(text)
print()

In [None]:
# 3.2 Part-of-Speech (POS) Tagging
# token.pos_: the simplified part-of-speech tag (e.g., NOUN, VERB, etc.).
# token.tag_: the fine-grained POS tag with more specific grammatical detail
#             (e.g., VBN for a past-participle verb).
print("POS Tags:")
pos_lines = [f"{token.text} -> {token.pos_} ({token.tag_})" for token in doc]
for line in pos_lines:
    print(line)
print()

In [None]:
# 3.3 Named Entity Recognition (NER)
# doc.ents holds the entity spans the pipeline detected; spacy.explain() turns a
# label code into a short human-readable description.
print("Named Entities:")
entity_lines = [f"{ent.text} -> {ent.label_} ({spacy.explain(ent.label_)})" for ent in doc.ents]
for line in entity_lines:
    print(line)
print()

In [None]:
# 3.4 Dependency Parsing
# Dependency parsing describes how the words of a sentence relate to one another:
# the sentence is represented as a directed graph whose nodes are words and whose
# edges are dependencies (such as subject-verb or object-verb).
print("Dependency Parsing:")
dependency_lines = [f"{token.text} -> {token.dep_} (Head: {token.head.text})" for token in doc]
for line in dependency_lines:
    print(line)
print()

# This is just the simple parser that comes with spaCy.
# Many more advanced parsers exist; for example, try the demo at https://corenlp.run/
# Parsing bridges the gap between raw text and its syntactic or semantic meaning,
# which makes it essential for advanced language understanding.

In [None]:
# 3.5 Sentence Segmentation
# Adjacent string literals are concatenated into one string; every piece but the last
# ends with a space so that the sentences stay separated.
long_introduction = (
    "Iowa State University (ISU), in Ames, Iowa, is a top public research institution founded in 1858. "
    "Renowned for science, engineering, and agriculture, it hosts the U.S. Department of Energy’s Ames Laboratory. "
    "With over 36,000 students, ISU fosters innovation and global impact."
)
doc_long = nlp(long_introduction)

# doc_long.sents iterates over the sentence spans found by the pipeline.
print("Sentences:")
sentence_texts = [sent.text for sent in doc_long.sents]
for text in sentence_texts:
    print(text)
print()

In [None]:
# 3.6 Lemmatization
# Lemmatization maps each word to its base (dictionary) form, known as the "lemma".
print("Lemmatized Tokens:")
lemma_lines = [f"{token.text} -> {token.lemma_}" for token in doc]
for line in lemma_lines:
    print(line)
print()

In [None]:
# 3.7 Similarity Between Words
# The small pipeline ("en_core_web_sm") ships without static word vectors, so calling
# .similarity() on its output triggers spaCy warning W007 and the score is not a
# reliable measure of semantic relatedness. Prefer a pipeline that includes vectors
# when it is installed, and fall back to the already-loaded `nlp` otherwise.
try:
    nlp_vectors = spacy.load("en_core_web_md")
except OSError:  # spacy.load() raises OSError when the package is not installed
    nlp_vectors = nlp
    print("Note: 'en_core_web_md' is not installed; using the small model, "
          "so the similarity score below is only a rough estimate.")

# Each word is processed into its own Doc; Doc.similarity() compares the two.
word1 = nlp_vectors("research")
word2 = nlp_vectors("university")
similarity = word1.similarity(word2)
print(f"Similarity between '{word1}' and '{word2}': {similarity}")
print()

In [None]:
# 4. Regular Expression (Regex)
import re

# 4.1 Check if a Pattern Exists
pattern = r"Ames"
# re.search() scans the string and stops at the first match; it returns None
# when the pattern does not occur anywhere.
match = re.search(pattern, sentence)
if match is None:
    print(f"Pattern '{pattern}' not found")
else:
    print(f"Pattern '{pattern}' found at position: {match.start()}")
print()

In [None]:
# 4.2 Find all Case-Insensitive Matching
pattern = r"university"
# re.I is the short alias of re.IGNORECASE: upper- and lowercase letters match alike,
# and findall() returns every non-overlapping match as it appears in the text.
matches = re.findall(pattern, sentence, flags=re.I)
print(f"Case-insensitive matches for '{pattern}':", matches)
print()

In [None]:
# 4.3 Split String Using a Pattern
# "|" means "or" in regex, and "\." matches a literal dot ("."),
# so the string is cut at every comma and every period.
pattern = r",|\."
splitter = re.compile(pattern)
# A delimiter at the very end of the string yields a trailing empty string in the result.
parts = splitter.split(sentence)
print("Split Sentence:", parts)
print()

In [None]:
# 4.4 Validate Patterns (e.g., Email-Like Text)
test_string = "Contact me at qingwang@iastate.edu"

# How the pattern below reads, piece by piece:
#
# \b
#   A word boundary, used at both the start and the end of the pattern so the
#   address is matched as a standalone word and not as part of a larger string.
#
# [A-Za-z0-9._-]+
#   The local part of the address (before the "@"): one or more ("+") letters,
#   digits, dots, underscores, or dashes.
#
# @[A-Za-z0-9._-]+\.
#   A literal "@", then the domain name built from the same character set,
#   then an escaped dot ("\.") separating it from the top-level domain.
#
# [A-Za-z]{2,}
#   The top-level domain (e.g., com, org, net): at least two ({2,}) letters,
#   uppercase or lowercase only.
pattern = r"\b[A-Za-z0-9._-]+@[A-Za-z0-9._-]+\.[A-Za-z]{2,}\b"

# re.search() looks anywhere in the string, so this reports whether an
# email-like substring occurs in test_string.
email_match = re.search(pattern, test_string)
print("Valid email found!" if email_match else "No valid email found.")
print()

In [None]:
# 5. BERT (Bidirectional Encoder Representations from Transformers)
from transformers import BertTokenizer, BertModel
import torch

# 5.1 Prepare Input for BERT

# The tokenizer and the model are loaded from the same pre-trained checkpoint
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

sentence = "Iowa State University, located in Ames, is a renowned public research university."

# Split the sentence into the tokenizer's tokens
tokens = tokenizer.tokenize(sentence)
print("Tokens:", tokens)

# Map the sentence to vocabulary IDs. Decoding the IDs shows the special tokens that
# were added: [CLS] (used for classification tasks) and [SEP] (marks sentence boundaries).
input_ids = tokenizer.encode(sentence, add_special_tokens=True)
print("Input IDs:", input_ids)
print("Decoded Sentence:", tokenizer.decode(input_ids))


# Calling the tokenizer with return_tensors="pt" returns PyTorch tensors
inputs = tokenizer(sentence, return_tensors="pt", add_special_tokens=True)

print("Input Tensor Keys:", inputs.keys())  # 'input_ids' and 'attention_mask'
# The attention mask marks which tokens should be attended to (1) and which should be
# ignored (0). In this example every value is 1, so all tokens are attended to.
for label, key in (("Input IDs Tensor:", "input_ids"),
                   ("Attention Mask Tensor:", "attention_mask")):
    print(label, inputs[key])
print()

In [None]:
# 5.2 Get BERT Output

# Pass the input through BERT.
# This is inference only, so run it under torch.no_grad(): no autograd graph is
# recorded, which saves memory and keeps grad_fn out of the printed tensors.
with torch.no_grad():
    outputs = model(**inputs)

# Outputs contain 'last_hidden_state' and 'pooler_output'
last_hidden_state = outputs.last_hidden_state
pooled_output = outputs.pooler_output

print("Last Hidden State Shape:", last_hidden_state.shape)  # (batch_size, seq_len, hidden_size)
print("Pooled Output Shape:", pooled_output.shape)  # (batch_size, hidden_size)
print()

# The pooler_output can be used as a fixed-size embedding for the sentence:
sentence_embedding = pooled_output.squeeze(0)  # Remove batch dimension
print("Sentence Embedding (768-dim):", sentence_embedding)
print()

In [None]:
# 5.3 Token-Level Embeddings

# last_hidden_state holds one vector per token; dropping the leading batch
# dimension leaves a (seq_len, hidden_size) matrix.
token_embeddings = torch.squeeze(last_hidden_state, dim=0)
print("Token Embeddings Shape:", token_embeddings.shape)

# Example: the vector at position 0, i.e. the embedding of the first token
first_token_embedding = token_embeddings[0]
print("First Token Embedding:", first_token_embedding)
print()

In [None]:
# 5.4 Compute Sentence Similarity

# Encode a second sentence so it can be compared with `sentence` (already encoded in `inputs`)
sentence2 = "Ames is home to Iowa State University, a prominent research institution."
inputs2 = tokenizer(sentence2, return_tensors="pt", add_special_tokens=True)

# Get embeddings for both sentences.
# Inference only: torch.no_grad() avoids recording an autograd graph for either pass.
with torch.no_grad():
    outputs1 = model(**inputs)
    outputs2 = model(**inputs2)

# NOTE(review): pooler_output is a convenient fixed-size vector, but it is generally a
# coarse sentence representation; pooling last_hidden_state over tokens is a common
# alternative - confirm which one suits the use case.
embedding1 = outputs1.pooler_output
embedding2 = outputs2.pooler_output

# Compute cosine similarity (compared along dim=1, giving one value per batch row)
cosine_similarity = torch.nn.functional.cosine_similarity(embedding1, embedding2)
print("Cosine Similarity between sentences:", cosine_similarity.item())
print()