In [22]:
# Text processing with regular expressions: Tokenization
import re

text = "I can't believe it's raining cats and dogs"
# Each token is a maximal run of word characters (letters, digits, underscore).
# Whitespace and punctuation act purely as separators and are never emitted,
# which is why an apostrophe splits "can't" into "can" and "t".
tokens = re.findall(r'\w+', text)
print(tokens)

['I', 'can', 't', 'believe', 'it', 's', 'raining', 'cats', 'and', 'dogs']


In [23]:
# Text processing: Lemmatization with NLTK's WordNet lemmatizer
import nltk
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
word = "raining"
# pos='v' tells the lemmatizer to treat `word` as a verb.
lemma = wnl.lemmatize(word, pos='v')
print(lemma)



rain


In [24]:
# Text processing: normalization by lowercasing
text = "Hello, World!"
# Case folding only: punctuation and spacing are left untouched.
normalized_text = str.lower(text)
print(normalized_text)


hello, world!


In [25]:
# Text processing: normalization by stripping punctuation
import string

text = "Hello, World!"
# Map every character of string.punctuation to None so that translate()
# deletes it from the text.
punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
normalized_text = text.translate(punctuation_remover)
print(normalized_text)

Hello World


In [26]:
# Text processing: Normalization by handling contractions
import contractions

# Sample sentence containing two contractions ("let's" and "can't").
text = "let's say,I can't believe it!"
# Expand the contractions into their full forms; the recorded output for this
# input is "let us say,I cannot believe it!".
normalized_text = contractions.fix(text)
print(normalized_text)

let us say,I cannot believe it!


In [27]:
# Text processing: normalization by removing stopwords
import nltk
from nltk.corpus import stopwords
# nltk.download('stopwords')  # uncomment once if the stopword corpus is missing

text = "This is a python code, It contains some stopwords"
stopwords_set = set(stopwords.words('english'))


def _is_content_word(token):
    """Return True when the lowercased token is not an English stopword."""
    return token.lower() not in stopwords_set


# Tokens come from a plain whitespace split, so punctuation stays attached to
# its neighbour (e.g. "code,") and is compared against the stopword set as-is.
normalized_text = ' '.join(filter(_is_content_word, text.split()))
print(normalized_text)

python code, contains stopwords


In [28]:
# Text processing: normalization by masking numeric data
# NOTE: relies on `re` having been imported by the tokenization cell.
text = "I have 3 coins, 2 of 2 euros and 1 of 1 euro!"
# Every run of consecutive digits collapses into the single placeholder "NUM".
digit_run = re.compile(r'\d+')
normalized_text = digit_run.sub('NUM', text)
print(normalized_text)

I have NUM coins, NUM of NUM euros and NUM of NUM euro!


In [31]:
#Text Processing : Minimal Editing Distance (med)
def min_edit_distance(s1, s2):
    """Return the minimal edit distance between ``s1`` and ``s2``.

    Insertion, deletion and substitution each cost 1; matching characters
    cost nothing.

    Args:
        s1: Source sequence (typically a string).
        s2: Target sequence (typically a string).

    Returns:
        The smallest number of single-character edits turning ``s1`` into
        ``s2``.
    """
    # Distances from the empty prefix of s1 to every prefix of s2:
    # reaching s2[:j] from "" takes j insertions.
    previous_row = list(range(len(s2) + 1))

    for row_index, char_1 in enumerate(s1, start=1):
        # First column: reducing s1[:row_index] to "" takes row_index deletions.
        current_row = [row_index]
        for col_index, char_2 in enumerate(s2, start=1):
            if char_1 == char_2:
                # Matching characters carry the diagonal value over unchanged.
                cell = previous_row[col_index - 1]
            else:
                cell = 1 + min(
                    previous_row[col_index],      # Deletion
                    current_row[col_index - 1],   # Insertion
                    previous_row[col_index - 1],  # Substitution
                )
            current_row.append(cell)
        previous_row = current_row

    # The last cell of the final row is the distance between the full inputs.
    return previous_row[-1]
# Example pair; the recorded output for these two words is 5.
A, B = "Intention", "Execution"
distance = min_edit_distance(A, B)
print(distance)

5


In [62]:
#Text Processing : N-gram language model
import nltk
from nltk import ngrams
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.tokenize import word_tokenize

# Prepare corpus: split into sentences, then into lowercased word tokens.
corpus = "I love cats. Cats are cute. I love dogs too."
tokenized_corpus = [list(map(str.lower, word_tokenize(sent))) for sent in nltk.sent_tokenize(corpus)]

# Order of the language model (3 = trigram model).
n = 3

# Create the training data and vocabulary objects expected by the model.
train, vocab = padded_everygram_pipeline(n, tokenized_corpus)

# Build the maximum-likelihood N-gram language model.
ngram_model = MLE(n)
ngram_model.fit(train, vocab)

# Generate text using the language model.
# The seed must use the same casing as the training tokens: the corpus is
# lowercased above, so an uppercase "I" never occurs in the training data and
# the seed context ("I", "love") would not match anything the model has seen.
prefix = ["i", "love"]
# A fixed random seed makes the sampled continuation reproducible across runs.
generated_text = ngram_model.generate(n, text_seed=prefix, random_seed=42)
print(' '.join(generated_text))

dogs too .


In [72]:
#Text Processing NLP: Smoothing
#Laplace (add-one) smoothing
import nltk
from nltk.util import ngrams
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline

# Prepare corpus: one list of lowercased tokens per sentence.
corpus = "I love cats. Cats are cute. I love dogs too."
tokenized_corpus = [
    [token.lower() for token in nltk.word_tokenize(sentence)]
    for sentence in nltk.sent_tokenize(corpus)
]

# Order of the language model (3 = trigram model).
n = 3

# Create the training data and vocabulary objects expected by the model.
train, vocab = padded_everygram_pipeline(n, tokenized_corpus)

# Build the N-gram language model with Laplace smoothing.
ngram_model = Laplace(n)
ngram_model.fit(train, vocab)

# Score P(word | context); the context uses lowercase tokens to match the
# lowercased training corpus.
context = ["i", "love"]
word = "cats"
probability = ngram_model.score(word, context)
print(probability)



0.15384615384615385
