# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [52]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [53]:
# Defining documents (=sentenses)
d1 = "A cat is sitting on the window."
d2 = "The dog is running in the park."
d3 = "The birds are flying over the trees."

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'A cat is sitting on the window. The dog is running in the park. The birds are flying over the trees.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [54]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'a cat is sitting on the window. the dog is running in the park. the birds are flying over the trees.'

### Removing punctuation

In [55]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'a cat is sitting on the window the dog is running in the park the birds are flying over the trees'

### Tokenize text & removal of stopwords

In [56]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'mightn', 'wouldn', 'such', 'yourselves', 'has', 'too', 'during', 'over', 'the', 'i', 'that', 't', "hasn't", 'she', 'below', 'won', 'but', 'with', 'they', 'up', 'having', "mustn't", 'by', 'weren', 'yours', 'again', "shan't", 'its', "won't", 'mustn', 'are', 'until', 'only', 'once', 'myself', 'ain', 'hadn', 'don', 'then', 'where', "she's", "didn't", 'll', 'through', "needn't", "don't", 'it', 'if', 'no', 'what', 'his', 're', 'few', 'or', 'and', 'being', 'when', "hadn't", 'me', 'how', 'all', "you've", 'before', 'been', "aren't", 'isn', "you're", 'while', 'which', 'didn', 'couldn', 'm', 'hers', "wouldn't", 'between', 'off', 'them', 'there', 'now', 'after', "you'd", 'nor', 'ours', 'same', 'who', 'herself', 'at', 'into', 'here', 'than', 'this', 'will', 'for', 'her', 'do', "doesn't", 'own', 'doesn', 'most', 'y', "isn't", 'from', 'should', 'our', 'of', 'my', 'any', 'under', 'd', 've', 'have', 'be', 'down', 'aren', 'shan', 'each', 'was', 's', 'you', "wasn't", 'other',

In [57]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['cat', 'sitting', 'window', 'dog', 'running', 'park', 'birds', 'flying', 'trees']

### Lemmatization

In [58]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['cat', 'sitting', 'window', 'dog', 'running', 'park', 'birds', 'flying', 'trees'] 

After lemmatization:
['cat', 'sit', 'window', 'dog', 'run', 'park', 'bird', 'fly', 'tree']

## Redefine the text corpus (pre-processed)

In [59]:
# We will use the lemmatized words above to re-define our corpus 
corpus = [
    'cat sit window',      # From d1: "A cat is sitting on the window."
    'dog run park',        # From d2: "The dog is running in the park."
    'bird fly tree'        # From d3: "The birds are flying over the trees."
]

## Document-term matrix with ngram_range=(1,1)

In [60]:
# Import required libraries
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the corpus
corpus = [
    'cat sit window', 
    'dog run park', 
    'bird fly tree'
]

# Document-term matrix with ngram_range=(1,1) [Unigrams]
vectorizer_unigram = CountVectorizer(ngram_range=(1,1), min_df=0.0)

# Transform the corpus
count_unigram = vectorizer_unigram.fit_transform(corpus)

# Create a dataframe for unigrams
df_unigram = pd.DataFrame(count_unigram.toarray(),
                          columns=vectorizer_unigram.get_feature_names_out())

print('Document-term matrix (Unigrams)')
print(df_unigram)

# Document-term matrix with ngram_range=(2,2) [Bigrams]
vectorizer_bigram = CountVectorizer(ngram_range=(2,2), min_df=0.0)

# Transform the corpus
count_bigram = vectorizer_bigram.fit_transform(corpus)

# Create a dataframe for bigrams
df_bigram = pd.DataFrame(count_bigram.toarray(),
                         columns=vectorizer_bigram.get_feature_names_out())

print('\nDocument-term matrix (Bigrams)')
print(df_bigram)


Document-term matrix (Unigrams)
   bird  cat  dog  fly  park  run  sit  tree  window
0     0    1    0    0     0    0    1     0       1
1     0    0    1    0     1    1    0     0       0
2     1    0    0    1     0    0    0     1       0

Document-term matrix (Bigrams)
   bird fly  cat sit  dog run  fly tree  run park  sit window
0         0        1        0         0         0           1
1         0        0        1         0         1           0
2         1        0        0         1         0           0


## Document-term matrix with ngram_range=(2,2)

In [61]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   bird fly  cat sit  dog run  fly tree  run park  sit window
0         0        1        0         0         0           1
1         0        0        1         0         1           0
2         1        0        0         1         0           0


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [62]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 9 

The words in the corpus: 
 {'dog', 'tree', 'window', 'park', 'bird', 'sit', 'run', 'fly', 'cat'}

Term Frequency (TF):
      dog    tree  window    park    bird     sit     run     fly     cat
0  0.0000  0.0000  0.3333  0.0000  0.0000  0.3333  0.0000  0.0000  0.3333
1  0.3333  0.0000  0.0000  0.3333  0.0000  0.0000  0.3333  0.0000  0.0000
2  0.0000  0.3333  0.0000  0.0000  0.3333  0.0000  0.0000  0.3333  0.0000


### Inverse Document Frequency (IDF)

In [63]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
            dog:     0.4771
           tree:     0.4771
         window:     0.4771
           park:     0.4771
           bird:     0.4771
            sit:     0.4771
            run:     0.4771
            fly:     0.4771
            cat:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [64]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     dog   tree  window   park   bird    sit    run    fly    cat
0  0.000  0.000   0.159  0.000  0.000  0.159  0.000  0.000  0.159
1  0.159  0.000   0.000  0.159  0.000  0.000  0.159  0.000  0.000
2  0.000  0.159   0.000  0.000  0.159  0.000  0.000  0.159  0.000


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [65]:
text = '''The United Nations warned that climate change is a severe global crisis. 
          Many countries are working together to reduce carbon emissions and protect the environment..'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('The', 'DT', 'O'),
 ('United', 'NNP', 'O'),
 ('Nations', 'NNP', 'O'),
 ('warned', 'VBD', 'O'),
 ('that', 'IN', 'O'),
 ('climate', 'NN', 'B-NP'),
 ('change', 'NN', 'B-NP'),
 ('is', 'VBZ', 'O'),
 ('a', 'DT', 'B-NP'),
 ('severe', 'JJ', 'I-NP'),
 ('global', 'JJ', 'I-NP'),
 ('crisis', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('Many', 'JJ', 'O'),
 ('countries', 'NNS', 'O'),
 ('are', 'VBP', 'O'),
 ('working', 'VBG', 'O'),
 ('together', 'RB', 'O'),
 ('to', 'TO', 'O'),
 ('reduce', 'VB', 'O'),
 ('carbon', 'NN', 'B-NP'),
 ('emissions', 'NNS', 'O'),
 ('and', 'CC', 'O'),
 ('protect', 'VBP', 'O'),
 ('the', 'DT', 'B-NP'),
 ('environment', 'NN', 'I-NP'),
 ('..', 'NN', 'B-NP')]


- DT (Determiner):
Examples: "The", "a", "the"
- NN (Noun, singular or mass):
Examples: "climate", "change", "crisis", "environment"
- NNP (Proper noun, singular):
Examples: "United", "Nations"
- VBD (Verb, past tense):
Example: "warned"
- JJ (Adjective):
Examples: "severe", "global", "Many"

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [66]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Linux | 6.5.0-1025-azure
Datetime: 2024-12-15 21:58:22
Python Version: 3.11.10
-----------------------------------
