# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [31]:
# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk

# Import only once
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

Current working directory: /Users/matthewjohnson/Documents/Studium/Module/DA/data_analytics/Week_11


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewjohnson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewjohnson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matthewjohnson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/matthewjohnson/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/matthewjohnson/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Defining documents

In [32]:
# Defining documents (=sentenses)
d1 = 'Unfortunately I support Chelsea Football Club.'
d2 = 'They make my weekends miserable.'
d3 = 'Nevertheless I keep the blue flag flying high.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'Unfortunately I support Chelsea Football Club. They make my weekends miserable. Nevertheless I keep the blue flag flying high.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization
- Removal of stop words
- Lemmatization

### Text to lowercase

In [33]:
# Text to lowercase function
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'unfortunately i support chelsea football club. they make my weekends miserable. nevertheless i keep the blue flag flying high.'

### Removing punctuation

In [34]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'unfortunately i support chelsea football club they make my weekends miserable nevertheless i keep the blue flag flying high'

### Tokenize text & removal of stopwords

In [35]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'their', 'should', 'to', 'such', 'through', 'is', 'why', 'about', "mustn't", 'they', "won't", 'yourself', 'has', 'here', 'some', 'down', 'then', "aren't", 'nor', 'again', 'any', 'his', 'while', 'my', "you've", 'or', 'having', "you'll", 'against', 'same', 'mustn', "shouldn't", 'if', 'in', 'the', 'up', 'own', "didn't", 'very', 'being', 'with', 'both', 'no', 'more', 'other', 'just', 'couldn', 'which', 'shouldn', 'our', "haven't", 'an', 'below', 'herself', 'were', 'be', 'this', 'hadn', 'hasn', "you're", 'her', 'between', 'all', 'now', 've', 'not', 'whom', 'yourselves', 'can', 'we', 'so', "doesn't", 'had', "isn't", 'do', 'than', 'will', "hasn't", 'he', 'a', "mightn't", 'for', "needn't", 'i', 'weren', 'did', 'himself', 'itself', "weren't", 'yours', "you'd", 'didn', 'your', 'him', "she's", 'm', 'by', 'are', 'but', 'ours', "don't", 'isn', 'these', 'that', 'was', 'and', 'where', 'ma', 'me', 're', "hadn't", 'needn', 'you', 'each', 'y', 'at', 'it', 'its', 'because', 't

In [36]:
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['unfortunately', 'support', 'chelsea', 'football', 'club', 'make', 'weekends', 'miserable', 'nevertheless', 'keep', 'blue', 'flag', 'flying', 'high']

### Lemmatization

In [37]:
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['unfortunately', 'support', 'chelsea', 'football', 'club', 'make', 'weekends', 'miserable', 'nevertheless', 'keep', 'blue', 'flag', 'flying', 'high'] 

After lemmatization:
['unfortunately', 'support', 'chelsea', 'football', 'club', 'make', 'weekend', 'miserable', 'nevertheless', 'keep', 'blue', 'flag', 'fly', 'high']

## Redefine the text corpus (pre-processed)

In [38]:
# We will use the lemmatized words above to re-define our corpus 
corpus = ['unfortunately support chelsea football club', 
          'make weekend miserable', 
          'nevertheless keep blue flag', 'fly', 'high']

## Document-term matrix with ngram_range=(1,1)

In [39]:
# Vectorizer with ngram_range=(1,1)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   blue  chelsea  club  flag  fly  football  high  keep  make  miserable  \
0     0        1     1     0    0         1     0     0     0          0   
1     0        0     0     0    0         0     0     0     1          1   
2     1        0     0     1    0         0     0     1     0          0   
3     0        0     0     0    1         0     0     0     0          0   
4     0        0     0     0    0         0     1     0     0          0   

   nevertheless  support  unfortunately  weekend  
0             0        1              1        0  
1             0        0              0        1  
2             1        0              0        0  
3             0        0              0        0  
4             0        0              0        0  


## Document-term matrix with ngram_range=(2,2)

In [40]:
# Vectorizer with with ngram_range=(2,2)
vectorizer = CountVectorizer(min_df=0.0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   blue flag  chelsea football  football club  keep blue  make weekend  \
0          0                 1              1          0             0   
1          0                 0              0          0             1   
2          1                 0              0          1             0   
3          0                 0              0          0             0   
4          0                 0              0          0             0   

   nevertheless keep  support chelsea  unfortunately support  \
0                  0                1                      1   
1                  0                0                      0   
2                  1                0                      0   
3                  0                0                      0   
4                  0                0                      0   

   weekend miserable  
0                  0  
1                  1  
2                  0  
3                  0  
4                  0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [41]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))

Number of words in the corpus: 14 

The words in the corpus: 
 {'nevertheless', 'football', 'fly', 'chelsea', 'make', 'club', 'weekend', 'flag', 'miserable', 'blue', 'unfortunately', 'high', 'support', 'keep'}

Term Frequency (TF):
   nevertheless  football  fly  chelsea    make  club  weekend  flag  \
0          0.00       0.2  0.0      0.2  0.0000   0.2   0.0000  0.00   
1          0.00       0.0  0.0      0.0  0.3333   0.0   0.3333  0.00   
2          0.25       0.0  0.0      0.0  0.0000   0.0   0.0000  0.25   
3          0.00       0.0  1.0      0.0  0.0000   0.0   0.0000  0.00   
4          0.00       0.0  0.0      0.0  0.0000   0.0   0.0000  0.00   

   miserable  blue  unfortunately  high  support  keep  
0     0.0000  0.00            0.2   0.0      0.2  0.00  
1     0.3333  0.00            0.0   0.0      0.0  0.00  
2     0.0000  0.25            0.0   0.0      0.0  0.25  
3     0.0000  0.00            0.0   0.0      0.0  0.00  
4     0.0000  0.00            0.0   1.0      0.0  

### Inverse Document Frequency (IDF)

In [42]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
   nevertheless:      0.699
       football:      0.699
            fly:      0.699
        chelsea:      0.699
           make:      0.699
           club:      0.699
        weekend:      0.699
           flag:      0.699
      miserable:      0.699
           blue:      0.699
  unfortunately:      0.699
           high:      0.699
        support:      0.699
           keep:      0.699


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [43]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   nevertheless  football    fly  chelsea   make    club  weekend    flag  \
0        0.0000    0.1398  0.000   0.1398  0.000  0.1398    0.000  0.0000   
1        0.0000    0.0000  0.000   0.0000  0.233  0.0000    0.233  0.0000   
2        0.1748    0.0000  0.000   0.0000  0.000  0.0000    0.000  0.1748   
3        0.0000    0.0000  0.699   0.0000  0.000  0.0000    0.000  0.0000   
4        0.0000    0.0000  0.000   0.0000  0.000  0.0000    0.000  0.0000   

   miserable    blue  unfortunately   high  support    keep  
0      0.000  0.0000         0.1398  0.000   0.1398  0.0000  
1      0.233  0.0000         0.0000  0.000   0.0000  0.0000  
2      0.000  0.1748         0.0000  0.000   0.0000  0.1748  
3      0.000  0.0000         0.0000  0.000   0.0000  0.0000  
4      0.000  0.0000         0.0000  0.699   0.0000  0.0000  


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [44]:
text = '''Chelsea were accused of cheating in their football competition.
          But their owner found a loophole in the rules for the blues.
          Their miserable competitors begrudgingly agreed.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)

# Print the POS-tags
pprint(iob_tagged)

[('Chelsea', 'NN', 'B-NP'),
 ('were', 'VBD', 'O'),
 ('accused', 'VBN', 'O'),
 ('of', 'IN', 'O'),
 ('cheating', 'VBG', 'O'),
 ('in', 'IN', 'O'),
 ('their', 'PRP$', 'O'),
 ('football', 'NN', 'B-NP'),
 ('competition', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('But', 'CC', 'O'),
 ('their', 'PRP$', 'O'),
 ('owner', 'NN', 'B-NP'),
 ('found', 'VBD', 'O'),
 ('a', 'DT', 'B-NP'),
 ('loophole', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('rules', 'NNS', 'O'),
 ('for', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('blues', 'NNS', 'O'),
 ('.', '.', 'O'),
 ('Their', 'PRP$', 'O'),
 ('miserable', 'JJ', 'O'),
 ('competitors', 'NNS', 'O'),
 ('begrudgingly', 'RB', 'O'),
 ('agreed', 'VBD', 'O'),
 ('.', '.', 'O')]


NN: Noun (person/place/thing/idea) - chelsea, football
VBG (Verb, gerund or present participle) - cheating
DT (Determiner) - a, the
RB (adverb) - a, the
VBD (Verb, past tense) - were, accused, found

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [45]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
POSIX
Darwin | 23.1.0
Datetime: 2023-12-15 02:06:47
Python Version: 3.10.13
-----------------------------------
