# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [36]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Fetch the NLTK resources this notebook relies on (only needed once per environment)
for resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4',
                 'averaged_perceptron_tagger'):
    nltk.download(resource)

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /workspaces/data_analytics/Week_11


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [37]:
# Three example documents (one sentence each).
# The spelling inside the strings is left exactly as is: the recorded outputs
# further down are derived from these literals.
d1 = 'This exercise is for the datanalytics course.'
d2 = 'This exercise is about NLP mehtods.'
d3 = 'This is an example text.'

# Raw corpus: the three documents joined by single spaces
corpus_01 = ' '.join((d1, d2, d3))
corpus_01

'This exercise is for the datanalytics course. This exercise is about NLP mehtods. This is an example text.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuation
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [38]:
# Lowercasing helper
def text_lowercase(text):
    """Return a copy of ``text`` in which all cased characters are lowercase."""
    lowered = text.lower()
    return lowered

# Lowercase the raw corpus string; the bare name on the last line displays the result
corpus_02 = text_lowercase(corpus_01)
corpus_02

'this exercise is for the datanalytics course. this exercise is about nlp mehtods. this is an example text.'

### Removing punctuation

In [39]:
# Punctuation-stripping helper
def remove_punctuation(text):
    """Return ``text`` with every character from ``string.punctuation`` deleted.

    Characters that are not listed in ``string.punctuation`` (letters, digits,
    whitespace, typographic quotes, ...) are left untouched.
    """
    # Translation table mapping each punctuation code point to None (= delete)
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

# Strip punctuation from the lowercased corpus; the bare name displays the result
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'this exercise is for the datanalytics course this exercise is about nlp mehtods this is an example text'

### Tokenize text & removal of stopwords

In [40]:
# Load NLTK's English stop-word list as a set and print it under a heading
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:", eng_stopwords, sep="\n")

List of english stopwords:
{'below', 'against', 'ours', 'there', 'had', 'hasn', 'him', 'a', 'this', 've', 'that', 'them', 'which', 'some', 'such', "doesn't", 'his', 'did', 'don', 'is', 'of', 'all', 'by', "he'd", 'to', "he's", 'being', "it'd", 'yourself', 'should', 'where', 'here', 're', 'o', 'doing', "don't", 'or', "she's", 'having', 'as', 'itself', 'just', 'mightn', 'are', 's', "he'll", "it'll", 'who', "mightn't", 'myself', 'm', 'her', 'hadn', 'i', "couldn't", 'himself', 'an', 'down', 'own', 'under', 'than', 'my', 'these', 'needn', "haven't", "won't", 'what', 'after', 'shouldn', 'into', "wasn't", 'those', 'didn', 'isn', 'wasn', 'most', 'has', 'haven', "isn't", 'once', 'be', "should've", 'through', 'weren', 'd', 'each', 'not', 'and', 'can', 'during', "it's", 'ourselves', "they're", "that'll", 'their', "i'm", 'up', 'it', "aren't", "i'd", 'about', 'few', "they'll", 'over', 'you', 'more', 'when', 't', "i've", 'ain', 'doesn', 'very', "weren't", "shan't", 'ma', 'further', 'any', 'until', 's

In [41]:
# Tokenization + stop-word filtering helper
def remove_stopwords(text):
    """Tokenize ``text`` and return the tokens that are not English stop words.

    The comparison is an exact string match against NLTK's English stop-word
    list, so the caller is expected to normalise the text beforehand.

    Returns:
        list: the remaining tokens, in their original order.
    """
    english_stops = frozenset(stopwords.words("english"))
    tokens = word_tokenize(text)
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        kept.append(token)
    return kept
 
# Tokenize the punctuation-free corpus and drop stop words; corpus_04 is a list of tokens.
# end="" suppresses the trailing newline of the printed list.
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['exercise', 'datanalytics', 'course', 'exercise', 'nlp', 'mehtods', 'example', 'text']

### Lemmatization

In [42]:
# Single WordNet lemmatizer instance shared by the helper below
lemmatizer = WordNetLemmatizer()

# Lemmatization helper
def lemmatize_word(text):
    """Tokenize ``text`` and lemmatize every token as a verb (``pos='v'``).

    Returns:
        list: one lemma per token, in token order.
    """
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return lemmas

# Lemmatize each token; every call returns a list, so `lem` is a list of lists
lem = [lemmatize_word(token) for token in corpus_04]

# Collapse each inner list back into a single space-joined string
corpus_05 = [' '.join(map(str, lemma_list)) for lemma_list in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['exercise', 'datanalytics', 'course', 'exercise', 'nlp', 'mehtods', 'example', 'text'] 

After lemmatization:
['exercise', 'datanalytics', 'course', 'exercise', 'nlp', 'mehtods', 'example', 'text']

## Redefine the text corpus (pre-processed)

In [43]:
# Pre-processed corpus: one string per original document (d1, d2, d3),
# containing the words that survived stop-word removal and lemmatization.
# NOTE(review): this list is typed by hand, not built from corpus_05, and it
# does not match the lemmatized output above: it contains 'methods' where the
# pipeline produced 'mehtods', and 'exmaple' where the pipeline produced
# 'example'. The misspelled 'exmaple' appears as a feature in every matrix
# below; confirm whether that is intended before correcting it, since the
# recorded outputs depend on these strings.
corpus = ['exercise datanalytics course', 
          'exercise nlp methods', 
          'exmaple text']

## Document-term matrix with ngram_range=(1,1)

In [53]:
# Bag-of-words model on single tokens: ngram_range=(1,1)
vectorizer = CountVectorizer(ngram_range=(1, 1), min_df=0.0)

# Learn the vocabulary and count each term per document
count = vectorizer.fit_transform(corpus)

# Dense table: one row per document, one column per vocabulary term
df_count = pd.DataFrame(data=count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix', df_count, sep='\n')

Document-term matrix
   course  datanalytics  exercise  exmaple  methods  nlp  text
0       1             1         1        0        0    0     0
1       0             0         1        0        1    1     0
2       0             0         0        1        0    0     1


## Document-term matrix with ngram_range=(2,2)

In [45]:
# Bag-of-words model on pairs of adjacent tokens: ngram_range=(2,2)
vectorizer = CountVectorizer(ngram_range=(2, 2), min_df=0.0)

# Learn the bigram vocabulary and count each bigram per document
count = vectorizer.fit_transform(corpus)

# Dense table: one row per document, one column per bigram
df_count = pd.DataFrame(data=count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix', df_count, sep='\n')

Document-term matrix
   datanalytics course  exercise datanalytics  exercise nlp  exmaple text  \
0                    1                      1             0             0   
1                    0                      0             1             0   
2                    0                      0             0             1   

   nlp methods  
0            0  
1            1  
2            0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [46]:
# Compute Term Frequency (TF)
# Vocabulary: every distinct whitespace-separated token found in the corpus.
# split() without an argument is used (as in the IDF cell) so that repeated
# or leading/trailing blanks cannot produce empty-string "words".
words_set = set()
for doc in corpus:
    words_set.update(doc.split())
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

# One row per document, one column per vocabulary word, initialised to 0.0
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split()
    for w in words:
        # Each occurrence contributes 1 / (document length) to the term's frequency.
        # A single .loc[row, column] access is used instead of the chained
        # df_tf[w][i] = ..., which may write to a temporary copy and is a
        # silent no-op when pandas Copy-on-Write is enabled.
        df_tf.loc[i, w] += 1 / len(words)
        
print(df_tf.round(4))

Number of words in the corpus: 7 

The words in the corpus: 
 {'nlp', 'methods', 'text', 'exmaple', 'course', 'exercise', 'datanalytics'}

Term Frequency (TF):
      nlp  methods  text  exmaple  course  exercise  datanalytics
0  0.0000   0.0000   0.0      0.0  0.3333    0.3333        0.3333
1  0.3333   0.3333   0.0      0.0  0.0000    0.3333        0.0000
2  0.0000   0.0000   0.5      0.5  0.0000    0.0000        0.0000


### Inverse Document Frequency (IDF)

In [47]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

# Maps each vocabulary word to log10(n_docs / k), rounded to 4 decimals
idf = {}

for w in words_set:

    # k = number of documents whose token list contains this word
    k = sum(1 for document in corpus if w in document.split())

    idf[w] = np.log10(n_docs / k).round(4)

    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
            nlp:     0.4771
        methods:     0.4771
           text:     0.4771
        exmaple:     0.4771
         course:     0.4771
       exercise:     0.1761
   datanalytics:     0.4771


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [48]:
# Computing TF-IDF
# Start from a copy so that df_tf itself stays unchanged
df_tf_idf = df_tf.copy()

# Scale each term's TF column by that term's IDF weight.
# The whole column is assigned at once instead of the chained
# df_tf_idf[w][i] = ..., which may write to a temporary copy and is a
# silent no-op when pandas Copy-on-Write is enabled.
for w in words_set:
    df_tf_idf[w] = df_tf[w] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
     nlp  methods    text  exmaple  course  exercise  datanalytics
0  0.000    0.000  0.0000   0.0000   0.159    0.0587         0.159
1  0.159    0.159  0.0000   0.0000   0.000    0.0587         0.000
2  0.000    0.000  0.2386   0.2386   0.000    0.0000         0.000


## Part-of-Speech (POS) tagging
For the meaning of the POS tags, see: https://pythonexamples.org/nltk-pos-tagging

In [52]:
# Example text to be tokenized and POS-tagged below. The literal is kept
# verbatim (including the typographic quotes): the quote characters show up
# as separate tokens in the tagger output.
text = '''In the section ‘Part-of-Speach (POS) tagging’, choose your own text example
and derive POS tags for this example. In the Jupyter notebook, briefly explain the
meaning of at least 5 POS-tags from the output of the POS tagging. Use the
following webpage for help:'''

def preprocess(sent):
    """Tokenize ``sent`` with NLTK and return the POS-tagged tokens.

    The result is whatever ``nltk.pos_tag`` returns for the token list
    produced by ``nltk.word_tokenize(sent)``.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

# POS-tag the example text
sent = preprocess(text)
# Chunk grammar for noun phrases (NP): an optional determiner (DT),
# followed by any number of adjectives (JJ), followed by one noun (NN)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Build the regular-expression chunker and parse the tagged tokens with it
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

# Convert the parse result into (word, POS tag, IOB chunk tag) triples,
# where B-NP / I-NP mark the beginning / inside of an NP chunk and O marks
# tokens outside any chunk
iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('In', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('section', 'NN', 'I-NP'),
 ('‘', 'VBZ', 'O'),
 ('Part-of-Speach', 'NNP', 'O'),
 ('(', '(', 'O'),
 ('POS', 'NNP', 'O'),
 (')', ')', 'O'),
 ('tagging', 'VBG', 'O'),
 ('’', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('choose', 'VB', 'O'),
 ('your', 'PRP$', 'O'),
 ('own', 'JJ', 'B-NP'),
 ('text', 'JJ', 'I-NP'),
 ('example', 'NN', 'I-NP'),
 ('and', 'CC', 'O'),
 ('derive', 'JJ', 'O'),
 ('POS', 'NNP', 'O'),
 ('tags', 'NN', 'B-NP'),
 ('for', 'IN', 'O'),
 ('this', 'DT', 'B-NP'),
 ('example', 'NN', 'I-NP'),
 ('.', '.', 'O'),
 ('In', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('Jupyter', 'NNP', 'O'),
 ('notebook', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('briefly', 'NN', 'B-NP'),
 ('explain', 'VBP', 'O'),
 ('the', 'DT', 'B-NP'),
 ('meaning', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('at', 'IN', 'O'),
 ('least', 'JJS', 'O'),
 ('5', 'CD', 'O'),
 ('POS-tags', 'JJ', 'O'),
 ('from', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('output', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),

### f)

#### Explanation of selected POS tags
##### NNP (Proper Noun, Singular)
Used for proper names or specific entities, e.g. “POS” or “Jupyter”.
##### NN (Noun, Singular or Mass)
Represents a general noun, such as “example”, “output”, or “notebook”.
##### VB (Verb, Base Form)
Indicates a verb in its base form, e.g. “choose” or “use”.
##### VBG (Verb, Gerund or Present Participle)
Describes a verb ending in -ing that functions as a verb or noun, e.g. “tagging”.
##### JJ (Adjective)
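Modifies a noun by describing its quality or property, e.g. “own” or “following”. Note that the tagger also labels “POS-tags” as JJ in the output above, although it is used as a noun in the sentence; this is a tagging error, not a true adjective.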
##### DT (Determiner)
Introduces a noun and limits its reference, e.g. “the” or “this”.
##### IN (Preposition or Subordinating Conjunction)
Shows relationships between words, e.g. “for”, “of”, or “from”.

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [50]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

# Environment footer: OS family, platform and release, timestamp, Python version
separator = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(separator)
print(os.name.upper())
print(f'{platform.system()} | {platform.release()}')
print(f'Datetime: {timestamp}')
print(f'Python Version: {python_version()}')
print(separator)

-----------------------------------
POSIX
Linux | 6.8.0-1030-azure
Datetime: 2025-12-12 16:25:07
Python Version: 3.11.14
-----------------------------------
