## Bigrams with and without `nltk`

In [66]:
import string
from collections import Counter
import re

stopwords = {"the", "of", "and", "is", "you", "do", "not", "about", "as", "at", "a", "on", "if", "this", "to", "your", "they", "have"}

text = """
The first rule of Fight Club: is: you don't not talk about Fight Club.
"""

text = text.lower()
text = re.sub(r'[^\w\s]', '', text)

tokens = text.split()

filtered_tokens = [word for word in tokens if word not in stopwords]

bigrams = list(zip(filtered_tokens, filtered_tokens[1:]))

bigram_counts = Counter(bigrams)

for bigram, count in bigram_counts.items():
    print(bigram, ":", count)

('first', 'rule') : 1
('rule', 'fight') : 1
('fight', 'club') : 2
('club', 'dont') : 1
('dont', 'talk') : 1
('talk', 'fight') : 1


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
stop_words = set(stopwords.words('english'))

text = """
The first rule of Fight Club is: you do not talk about Fight Club.
"""

text = text.lower()

tokens = word_tokenize(text)

filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
bigrams = list(zip(filtered_tokens, filtered_tokens[1:]))

bigram_counts = Counter(bigrams)

for bigram, count in bigram_counts.items():
    print(bigram, ":", count)

('first', 'rule') : 1
('rule', 'fight') : 1
('fight', 'club') : 2
('club', 'talk') : 1
('talk', 'fight') : 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Tokenization, Sentence Splitting, Normalization

In [3]:
import re

def tokenize_text(text):
    tokens = re.findall(r"[A-Za-z'-]+", text)

    filtered_tokens = [token.lower() for token in tokens if len(token) >= 2]

    return filtered_tokens


text = "Can't we just use well-known NLP tools? They're awesome, aren't they?"
tokens = tokenize_text(text)
print(tokens)


["can't", 'we', 'just', 'use', 'well-known', 'nlp', 'tools', "they're", 'awesome', "aren't", 'they']


In [17]:
import re

def normalize_text_simple(text):
    # lowercase
    text = text.lower()

    # replace numbers (integers or decimals) with NUMBER
    text = re.sub(r'\d+(\.\d+)?', 'NUMBER', text)

    # replace multiple spaces/tabs with single space
    text = re.sub(r'\s+', ' ', text)

    # simple lemmatization: remove 'ing' or 'ed' endings
    words = text.split()
    for i in range(len(words)):
        if words[i].endswith('ing') and len(words[i]) > 5:
            words[i] = words[i][:-3]
        elif words[i].endswith('ed') and len(words[i]) > 4:
            words[i] = words[i][:-2]

    # join back into a string
    return ' '.join(words)

sample_input = "Running and jumping 123 times is fun. I walked 45.67 miles."
print(normalize_text_simple(sample_input))


runn and jump NUMBER times is fun. i walk NUMBER miles.


In [75]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Add this line to download the missing resource

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Sentence tokenize
    sentences = sent_tokenize(text)
    result = []
    for sent in sentences:
        # Word tokenize keeps words, hyphens, apostrophes
        tokens = re.findall(r"\w[\w'-]*", sent)
        normalized = normalizeTokens(tokens)
        result.append((sent, tokens, normalized))
    return result

def normalizeTokens(tokens):
    normalized = []
    for t in tokens:
        t_lower = t.lower()
        # replace numbers
        if re.fullmatch(r'\d+(\.\d+)?', t_lower):
            normalized.append("NUMBER")
        # remove stopwords and short words
        elif t_lower not in stop_words and len(t_lower) >= 2:
            # simple lemmatization
            if t_lower.endswith('ing') and len(t_lower) > 5:
                t_lower = t_lower[:-3]
            elif t_lower.endswith('ed') and len(t_lower) > 4:
                t_lower = t_lower[:-2]
            normalized.append(t_lower)
    return normalized



sample_text = "Dr. Brown can't run 123 miles. Isn't NLP fun? He scored 98.6 points."
processed = preprocess_text(sample_text)

for i, (sent, tokens, normalized) in enumerate(processed, 1):
    print(f"Sentence {i}: {sent}")
    print(f"  Tokens: {tokens}")
    print(f"  Normalized: {normalized}\n")

Sentence 1: Dr. Brown can't run 123 miles.
  Tokens: ['Dr', 'Brown', "can't", 'run', '123', 'miles']
  Normalized: ['dr', 'brown', "can't", 'run', 'NUMBER', 'miles']

Sentence 2: Isn't NLP fun?
  Tokens: ["Isn't", 'NLP', 'fun']
  Normalized: ['nlp', 'fun']

Sentence 3: He scored 98.6 points.
  Tokens: ['He', 'scored', '98', '6', 'points']
  Normalized: ['scor', 'NUMBER', 'NUMBER', 'points']



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
import re

text = "Dr. Smith lives in the U.S.A. He loves NLP. The value of pi is 3.14. Isn't it cool?"

abbreviations = ["Mr.", "Mrs.", "Ms.", "Dr.", "U.S.A", "e.g.", "i.e.", "vs.", "etc."]

for abbr in abbreviations:
    text = text.replace(abbr, abbr.replace(".", "<prd>"))

text = re.sub(r'(\d)\.(\d)', r'\1<prd>\2', text)

# mark sentence boundaries
text = text.replace("! ", "!<stop>")
text = text.replace("? ", "?<stop>")
text = text.replace(". ", ".<stop>")

# restore periods
text = text.replace("<prd>", ".")

# split into sentences and clean
sentences = text.split("<stop>")
sentences = [s.strip() for s in sentences if s.strip()]



for i, s in enumerate(sentences, 1):
    print(f"{i}: {s}")


1: Dr. Smith lives in the U.S.A.
2: He loves NLP.
3: The value of pi is 3.14.
4: Isn't it cool?


In [77]:
import re

def sentence_segmentation(text):
    # Known abbreviations
    abbreviations = [
    "u.s.a", "u.s.a.", "x.ai", "Sr.", "Srta.", "eth", "ph.d", "phd", "ai",
    "Mr.", "Mrs.", "Ms.", "Dr.", "U.S.A.", "e.g.", "i.e.", "vs.", "etc."
]


    # Step 1. Protect abbreviations
    for abbr in abbreviations:
      text = text.replace(abbr, abbr.replace(".", "<prd>"))


    # Step 3. Protect decimals and websites
    text = re.sub(r"(\d)\.(\d)", r"\1<prd>\2", text)
    text = re.sub(r"([A-Za-z])\.([A-Za-z])\.([A-Za-z])", r"\1<prd>\2<prd>\3", text)


    # Step 4. Mark sentence boundaries
    text = text.replace("! ", "!<stop>")
    text = text.replace("? ", "?<stop>")
    text = text.replace(". ", ".<stop>")
    text = text.replace('..."', '...".<stop>')
    text = text.replace('." ', '.".<stop>')

    # Step 5. Restore protected tokens
    text = text.replace("<prd>", ".")
    text = text.replace("<ellip>", "...")

    # Step 6. Split into sentences
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences]

    # Debug / output
    for i, s in enumerate(sentences, 1):
        print(f"Sentence {i}: {s}")



# Example
#text = "Dr. Smith lives in the U.S.A. He loves NLP... Do you? Visit www.ai.cn for more."
text = """AI is reshaping industries globally. Sr. Carlos Méndez works at x.ai. He says: "NLP is tough..." His team in U.S.A. develops models for multilingual texts. For example, their accuracy is 95.6%. However, challenges persist. Consider "bank"—is it a riverbank or a financial institution? In Beijing, **中国科学院** tackles similar issues. Their website, www.ai.cn, documents progress. Dr. Li Wei notes: "We need better algorithms." Meanwhile, en Madrid, Srta. Ana Ruiz leads a project on Español NLP. Her team's budget? $2.3M. They collaborate with ETH Zürich, per x.ai's guidelines. Isn't that cool? Ambiguities like "lead" (metal or guide) complicate things... Moreover, slang like "lol" confuses models. In **中国**, Weibo posts use "666" for praise. Such nuances demand advanced processing. xAI's Grok aims to solve this. Visit x.ai for details. Their team hopes to launch by 2026. Processing costs are high—3.14x last year's budget. AI's future is bright!"""

sentence_segmentation(text)


Sentence 1: AI is reshaping industries globally.
Sentence 2: Sr. Carlos Méndez works at x.ai.
Sentence 3: He says: "NLP is tough...".
Sentence 4: His team in U.S.A. develops models for multilingual texts.
Sentence 5: For example, their accuracy is 95.6%.
Sentence 6: However, challenges persist.
Sentence 7: Consider "bank"—is it a riverbank or a financial institution?
Sentence 8: In Beijing, **中国科学院** tackles similar issues.
Sentence 9: Their website, www.ai.cn, documents progress.
Sentence 10: Dr. Li Wei notes: "We need better algorithms.".
Sentence 11: Meanwhile, en Madrid, Srta. Ana Ruiz leads a project on Español NLP.
Sentence 12: Her team's budget?
Sentence 13: $2.3M.
Sentence 14: They collaborate with ETH Zürich, per x.ai's guidelines.
Sentence 15: Isn't that cool?
Sentence 16: Ambiguities like "lead" (metal or guide) complicate things...
Sentence 17: Moreover, slang like "lol" confuses models.
Sentence 18: In **中国**, Weibo posts use "666" for praise.
Sentence 19: Such nuances dem

##### Sentence Segmentation

## Minimum Edit Distance
with and without nltk

In [26]:
def edit_distance(str1, str2):
    m, n = len(str1), len(str2)

    # Create a DP table
    dp = [[0] * (n + 1) for i in range(m + 1)]

    # Fill the base cases
    for i in range(m + 1):
        dp[i][0] = i   # cost of deleting all characters
    for j in range(n + 1):
        dp[0][j] = j   # cost of inserting all characters

    # Fill the DP table
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if str1[i-1] == str2[j-1]:
                dp[i][j] = dp[i-1][j-1]  # no operation needed
            else:
                dp[i][j] = 1 + min(
                    dp[i-1][j],    # deletion
                    dp[i][j-1],    # insertion
                    dp[i-1][j-1]   # substitution
                )

    return dp[m][n]

# Example usage
print(edit_distance("kitten", "sitting"))  # Output: 3
print(edit_distance("sunday", "saturday"))  # Output: 3


3
3


In [49]:
import nltk
from nltk.metrics import edit_distance

# Example words
word1 = "kitten"
word2 = "sitting"

# Compute edit distance
distance = edit_distance(word1, word2)
print(distance)

3


## Morphological Analysis

In [None]:
import re

prefixes = ["un", "re", "dis", "in", "mis", "pre", "non", "over", "de", "anti"]
suffixes = ["ing", "ed", "ness", "ment", "ify", "s", "ly", "al", "er", "ion", "able"]

words = [
    "unhappiness", "replayed", "dislike", "running", "government", "inactive",
    "simplify", "cats", "misleading", "previewed", "nonfiction", "overcooked",
    "deactivated", "antivirus", "reorganize", "unfriendly", "walked", "happiness",
    "employment", "readable", "national", "teachers", "disapproval", "prepayment",
    "informer"
]

def analyze_word(word):
    root = word
    pre = ""
    suf = ""

    for p in prefixes:
        if word.startswith(p):
            pre = p
            root = root[len(p):]
            break

    for s in suffixes:
        if root.endswith(s):
            suf = s
            root = root[: -len(s)]
            break

    return (pre, root, suf)

for w in words:
    print(f"{w} -> {analyze_word(w)}")

## Inflectional and derivational

In [29]:
# Predefined Affix Lists
PREFIXES = sorted([
    "un", "re", "in", "im", "dis", "mis", "pre", "non",
    "over", "under", "inter", "trans", "sub", "super", "anti", "auto"
], key=len, reverse=True)

INFLECTIONAL_SUFFIXES = ["s", "es", "ed", "ing", "er", "est"]
DERIVATIONAL_SUFFIXES = [
    "ness", "less", "ful", "able", "ible", "ment", "tion", "sion",
    "al", "ly", "ous", "ive", "ize", "ship", "hood", "ist", "ism", "y", "er"
]
SUFFIXES = sorted(INFLECTIONAL_SUFFIXES + DERIVATIONAL_SUFFIXES, key=len, reverse=True)

def analyze_morphemes_simple(word):
    original_word = word
    prefixes = []
    suffixes = []

    # Remove prefixes iteratively
    for p in PREFIXES:
        if word.startswith(p) and len(word) > len(p) + 1:
            prefixes.append(p)
            word = word[len(p):]
            break  # only take the first longest match

    # Remove suffixes iteratively
    while True:
        matched = False
        for s in SUFFIXES:
            if word.endswith(s) and len(word) > len(s) + 1:
                suffixes.append(s)
                word = word[:-len(s)]
                matched = True
                break
        if not matched:
            break

    # Determine type
    affix_types = set()
    for s in suffixes:
        if s in INFLECTIONAL_SUFFIXES:
            affix_types.add("inflectional")
        if s in DERIVATIONAL_SUFFIXES:
            affix_types.add("derivational")
    for p in prefixes:
        affix_types.add("derivational")

    if not affix_types:
        affix_type = "none"
    elif len(affix_types) == 1:
        affix_type = affix_types.pop()
    else:
        affix_type = "both"

    return {
        "root": word,
        "prefixes": prefixes,
        "suffixes": suffixes,
        "type": affix_type
    }

words = ["unhappiness", "running", "bookshelf", "uncharacteristically",
  "reestablishing", "firefighter"]
for w in words:
  print(w, "->", analyze_morphemes_simple(w))


unhappiness -> {'root': 'happi', 'prefixes': ['un'], 'suffixes': ['ness'], 'type': 'derivational'}
running -> {'root': 'runn', 'prefixes': [], 'suffixes': ['ing'], 'type': 'inflectional'}
bookshelf -> {'root': 'bookshelf', 'prefixes': [], 'suffixes': [], 'type': 'none'}
uncharacteristically -> {'root': 'characteristic', 'prefixes': ['un'], 'suffixes': ['ly', 'al'], 'type': 'derivational'}
reestablishing -> {'root': 'establish', 'prefixes': ['re'], 'suffixes': ['ing'], 'type': 'both'}
firefighter -> {'root': 'firefight', 'prefixes': [], 'suffixes': ['er'], 'type': 'both'}


## Corpora Analysis

Spell Checker

In [67]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance

nltk.download('punkt')

def simple_spell_checker(text, dictionary):
    tokens = word_tokenize(text)
    misspelled = []

    # find words not in dictionary
    for word in tokens:
        if word.isalpha() and word.lower() not in dictionary:
            misspelled.append(word)

    # give suggestions
    for word in set(misspelled):
        distances = [(w, edit_distance(word.lower(), w)) for w in dictionary]
        distances.sort(key=lambda x: x[1])   # sort by edit distance
        suggestions = [w for w, d in distances[:3]]  # top 3
        print(f"{word} -> {suggestions}")

# Example
dictionary = {"artificial", "intelligence", "companies", "develops", "precision",
              "researchers", "optimistic", "communication", "accuracy"}

text = "Artifical inteligence is powerfull but it requir precisin."
simple_spell_checker(text, dictionary)


powerfull -> ['develops', 'companies', 'accuracy']
Artifical -> ['artificial', 'accuracy', 'precision']
but -> ['accuracy', 'develops', 'artificial']
is -> ['develops', 'companies', 'precision']
inteligence -> ['intelligence', 'optimistic', 'artificial']
precisin -> ['precision', 'optimistic', 'artificial']
it -> ['develops', 'companies', 'accuracy']
requir -> ['precision', 'develops', 'accuracy']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## POS Tagging

In [69]:
import nltk
from collections import Counter
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger_eng") # Add this line to download the missing resource


text = "The cat chases the dog. Dogs are friendly. Children play with hats."

tokens = word_tokenize(text)
tagged = pos_tag(tokens)

lemmatizer = WordNetLemmatizer()

# Count singular/plural nouns
counts = Counter()
for word, tag in tagged:
    if tag in ("NN", "NNS"):
        lemma = lemmatizer.lemmatize(word.lower(), "n")
        form = "singular" if tag == "NN" else "plural"
        counts[(lemma, form)] += 1

# Print
for (lemma, form), c in counts.items():
    print(f"{lemma},{form},{c}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


cat,singular,1
dog,singular,1
dog,plural,1
play,singular,1
hat,plural,1


In [71]:
import nltk
from nltk import word_tokenize, pos_tag, sent_tokenize

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

text = """
You must go to the store.
The project must meet deadlines.
She must be happy.
We must not delay.
This must end now.
"""

sentences = sent_tokenize(text)

for sent in sentences:
    tagged = pos_tag(word_tokenize(sent))
    for i, (word, tag) in enumerate(tagged):
        if word.lower() == "must" and i+1 < len(tagged):
            print(f"{word} + {tagged[i+1]}")


must + ('go', 'VB')
must + ('meet', 'VB')
must + ('be', 'VB')
must + ('not', 'RB')
must + ('end', 'VB')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
