# A primer in text mining

Instructor: [Markus Stricker](mailto:markus.stricker@rub.de)

Inspired by [datacamp 'Python Bag of Words Models: A Complete Guide'](https://www.datacamp.com/tutorial/python-bag-of-words-model)


## Text preparation: Tokenization & stopword removal

_Stopwords_ are frequently occurring words that carry little meaning on their own (e.g. "the", "is", "a"). They are usually removed after tokenization.

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the standard resources (this has to be done explicitly).
# The value of the last call is what the notebook displays for this cell.
nltk.download('punkt_tab') # Punkt tokenizer data (unpacked under tokenizers/)
nltk.download('stopwords') # stopword lists (unpacked under corpora/)

[nltk_data] Downloading package punkt_tab to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Example sentence used to demonstrate tokenization and stopword removal.
sentence = "This is an example showing how to remove stop words from a sentence."

In [3]:
# Split the sentence into tokens; punctuation such as the final '.' becomes
# a token of its own.
words = word_tokenize(sentence)
print(words)

['This', 'is', 'an', 'example', 'showing', 'how', 'to', 'remove', 'stop', 'words', 'from', 'a', 'sentence', '.']


In [4]:
# Load NLTK's English stopword list into a set: duplicates collapse and
# membership tests are cheap. A set is unordered, so the printed order is
# arbitrary.
stop_words = set(stopwords.words("english"))
print(stop_words)

{'too', "hadn't", 'any', 'not', 'did', 'was', 'there', "he'd", 'don', 'ours', 'than', 'they', 'am', "we'd", 'a', "mightn't", 'hasn', 'to', 'couldn', "they'll", 'at', 'only', 'them', "it's", 'or', 'under', "shan't", 'me', 'hadn', 'as', "it'll", 'during', 'but', 'why', 'you', "shouldn't", 'because', 'between', 'same', 'off', "they've", 'have', 'were', 'more', 'd', 'after', 'so', 'wouldn', 'aren', "it'd", "didn't", 'other', 'themselves', 'now', 'won', 'no', 'having', 'below', 'that', "she'll", "that'll", 'the', "we've", 'just', 'with', 'our', 'this', "wasn't", 'some', 'shouldn', 'weren', 'ma', 'against', 'do', "he'll", 'into', 'before', 'he', 'himself', 'ourselves', "they'd", 'your', 'by', 'through', 'most', 're', 'mustn', 'it', 'isn', 'my', 'from', "you'll", "you're", 'on', 'own', 'yours', 't', 'has', "wouldn't", 'and', 'theirs', 'yourself', 'does', 'those', 'where', 'itself', 'wasn', 'mightn', "isn't", "i've", "they're", 'being', 'didn', 'nor', 'been', "don't", 'ain', 'myself', 'these',

In [5]:
# Keep only the tokens that are not stopwords. Each token is lowercased for
# the lookup (so 'This' matches 'this') but kept in its original spelling.
# Punctuation tokens such as '.' are not stopwords and therefore remain.
filtered_sentence = [word for word in words if word.lower() not in stop_words]
print(filtered_sentence)

['example', 'showing', 'remove', 'stop', 'words', 'sentence', '.']


In [6]:
# re provides the regular expression used to pick the words out of a sentence
import re
# defaultdict makes word counting easy: missing keys start at a default value
from collections import defaultdict

# Sample corpus of text - a small dataset of sentences to analyze
corpus = [
    "Tokenization is the process of breaking text into words.",
    "Vocabulary is the collection of unique words.",
    "The process of tokenizing is essential in NLP.",
]

# Word frequencies. defaultdict(int) gives every new key the value 0, so a
# word can be incremented without first checking whether it is already known.
vocab = defaultdict(int)

# A word is a run of \w characters (letters, digits, underscore) between
# word boundaries; punctuation and whitespace are skipped.
word_pattern = re.compile(r"\b\w+\b")

# Tokenize and normalize the corpus sentence by sentence
for sentence in corpus:
    # Lowercase first so that e.g. 'Tokenization' and 'tokenization' are
    # counted as the same word
    words = word_pattern.findall(sentence.lower())
    # Count every occurrence
    for word in words:
        vocab[word] += 1

# Order the (word, count) pairs from most to least frequent. The sort is
# stable, so words with equal counts stay in order of first appearance.
by_frequency = sorted(vocab.items(), key=lambda pair: pair[1], reverse=True)

# Turn the sorted pairs into a regular dict, which keeps that order
sorted_vocab = dict(by_frequency)

# Display the sorted vocabulary with each word and its frequency count
print("Vocabulary with frequencies:", sorted_vocab)

Vocabulary with frequencies: {'is': 3, 'the': 3, 'of': 3, 'process': 2, 'words': 2, 'tokenization': 1, 'breaking': 1, 'text': 1, 'into': 1, 'vocabulary': 1, 'collection': 1, 'unique': 1, 'tokenizing': 1, 'essential': 1, 'in': 1, 'nlp': 1}


## Bag of Words from Scratch

## Step 1: Preprocessing of text data

We define a simple function to preprocess the text: lowercasing, removal of punctuation, and tokenization.

In [7]:
from collections import defaultdict
import string

# Sample text data: three short sentences, each treated as one document
corpus = [
    "Python is amazing and fun.",
    "Python is not just fun but also powerful.",
    "Learning Python is fun!",
]
# Function to preprocess text
def preprocess(text):
    """Normalize a sentence and split it into word tokens.

    The text is lowercased, every character listed in ``string.punctuation``
    (ASCII punctuation) is deleted, and the rest is split on whitespace.

    Args:
        text: The raw sentence as a string.

    Returns:
        A list of lowercase tokens; empty if nothing but whitespace and
        punctuation was passed in.
    """
    # Translation table that deletes each ASCII punctuation character
    strip_punctuation = str.maketrans("", "", string.punctuation)
    normalized = text.lower().translate(strip_punctuation)
    # split() without arguments splits on any run of whitespace
    return normalized.split()

# Preprocess every sentence of the sample corpus: one token list per sentence
processed_corpus = list(map(preprocess, corpus))
print(processed_corpus)

[['python', 'is', 'amazing', 'and', 'fun'], ['python', 'is', 'not', 'just', 'fun', 'but', 'also', 'powerful'], ['learning', 'python', 'is', 'fun']]


## Step 2: Vocabulary

Scan the text and build a list of all unique words.

In [8]:
# Collect every distinct word of the corpus; a set drops duplicates
# automatically
unique_words = set()
for sentence in processed_corpus:
    unique_words.update(sentence)

# Sets are unordered, so sort the words to get a reproducible order.
# sorted() accepts any iterable and always returns a new list, so no
# intermediate list() copy is needed.
vocabulary = sorted(unique_words)
print("Vocabulary:", vocabulary)

Vocabulary: ['also', 'amazing', 'and', 'but', 'fun', 'is', 'just', 'learning', 'not', 'powerful', 'python']


## Step 3: Calculate word frequency and vectorization

In [9]:
def create_bow_vector(sentence, vocab):
    """Count how often each vocabulary word occurs in a tokenized sentence.

    Args:
        sentence: Iterable of tokens.
        vocab: Sequence of vocabulary words; its order defines the layout of
            the returned vector.

    Returns:
        A list of ints with ``len(vocab)`` entries, where entry ``i`` is the
        number of times ``vocab[i]`` appears in ``sentence``. Tokens that are
        not part of the vocabulary are ignored.
    """
    # Map each word to its position once, instead of scanning the vocabulary
    # list twice for every token (``in`` followed by ``.index``). setdefault
    # keeps the first position of a repeated word, matching list.index.
    position = {}
    for idx, word in enumerate(vocab):
        position.setdefault(word, idx)

    vector = [0] * len(vocab)  # Initialize a vector of zeros
    for word in sentence:
        idx = position.get(word)
        if idx is not None:
            vector[idx] += 1  # Increment the count at that index
    return vector

In [10]:
# Create a BoW vector for each sentence in the processed corpus. Every vector
# has one entry per vocabulary word, in vocabulary order.
bow_vectors = [create_bow_vector(sentence, vocabulary) for sentence in processed_corpus]
print("Bag of Words vectors:")
# One vector (= one sentence) per line
for vector in bow_vectors:
    print(vector)

Bag of Words vectors:
[0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1]
[1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1]
[0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1]


## Alternative: Scikit-learn `CountVectorizer`

Building the vectors by hand is good for understanding, but `scikit-learn` provides an efficient, optimized implementation.

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
# Original corpus: the raw sentences, without any manual preprocessing
corpus = [
    "Python is amazing and fun.",
    "Python is not just fun but also powerful.",
    "Learning Python is fun!",
]
# Create a CountVectorizer object with its default settings
vectorizer = CountVectorizer()
# Fit and transform the corpus: learn the vocabulary and count the words of
# every sentence in one step
X = vectorizer.fit_transform(corpus)
# Print the generated vocabulary
print("Vocabulary:", vectorizer.get_feature_names_out())
# Print the Bag-of-Words matrix; toarray() converts X to a plain array for
# display
print("BoW representation:")
print(X.toarray())

Vocabulary: ['also' 'amazing' 'and' 'but' 'fun' 'is' 'just' 'learning' 'not'
 'powerful' 'python']
BoW representation:
[[0 1 1 0 1 1 0 0 0 0 1]
 [1 0 0 1 1 1 1 0 1 1 1]
 [0 0 0 0 1 1 0 1 0 0 1]]


## TF-IDF: Extension of Bag of Words

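Term Frequency (TF) measures how often a term occurs in a document. Inverse Document Frequency (IDF) reduces the weight of terms that occur in many documents, because such terms say little about any single document. The TF-IDF score of a term $t$ in a document $d$ is the product of the two: $\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \mathrm{idf}(t)$.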

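We use `scikit-learn`'s `TfidfVectorizer` class. By default it also scales every document vector to unit (L2) length, so each row of the resulting matrix is normalized.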

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Sample corpus: the raw sentences, without any manual preprocessing
corpus = [
    "Python is amazing and fun.",
    "Python is not just fun but also powerful.",
    "Learning Python is fun!",
]

# Create the TF-IDF vectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus: learn the vocabulary and compute the TF-IDF
# weights of every sentence in one step
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Show the vocabulary
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())

# Show the TF-IDF matrix; toarray() converts X_tfidf to a plain array for
# display
print("TF-IDF Representation:")
print(X_tfidf.toarray())

Vocabulary: ['also' 'amazing' 'and' 'but' 'fun' 'is' 'just' 'learning' 'not'
 'powerful' 'python']
TF-IDF Representation:
[[0.         0.57292883 0.57292883 0.         0.338381   0.338381
  0.         0.         0.         0.         0.338381  ]
 [0.40667606 0.         0.         0.40667606 0.24018943 0.24018943
  0.40667606 0.         0.40667606 0.40667606 0.24018943]
 [0.         0.         0.         0.         0.41285857 0.41285857
  0.         0.69903033 0.         0.         0.41285857]]


## Word2Vec: Alice in Wonderland

In [13]:
# importing all necessary modules
from gensim.models import Word2Vec
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
 
# NOTE(review): this silences *all* warnings for the rest of the session,
# including deprecation notices from the libraries used here. Consider
# narrowing the filter.
warnings.filterwarnings(action='ignore')


# Read the whole text of 'alice_in_wonderland.txt'. The context manager closes
# the file as soon as it has been read, and the explicit encoding keeps the
# decoding independent of the platform default.
# NOTE(review): assumes the file is UTF-8 encoded - confirm.
with open("alice_in_wonderland.txt", encoding="utf-8") as sample:
    s = sample.read()

# Replace line breaks with spaces so that a sentence wrapped over several
# lines is handled as one sentence
f = s.replace("\n", " ")

# Split the text into sentences and every sentence into lowercase word
# tokens: one token list per sentence, the training input for Word2Vec below
data = [
    [token.lower() for token in word_tokenize(sentence_text)]
    for sentence_text in sent_tokenize(f)
]

# Create CBOW model (no sg argument is passed, so the library default is used).
# min_count=1 is meant to keep words that occur only once.
model1 = gensim.models.Word2Vec(data, min_count=1,
                                vector_size=100, window=5)

# Print results
# NOTE(review): both words are looked up in the trained model; a word that
# never occurs in the text presumably raises a KeyError - confirm.
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - CBOW : ",
      model1.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

# Create Skip Gram model: same data and settings, but sg=1 switches the
# training algorithm to skip-gram
model2 = gensim.models.Word2Vec(data, min_count=1, vector_size=100,
                                window=5, sg=1)

# Print results
print("Cosine similarity between 'alice' " +
      "and 'wonderland' - Skip Gram : ",
      model2.wv.similarity('alice', 'wonderland'))

print("Cosine similarity between 'alice' " +
      "and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.9907446
Cosine similarity between 'alice' and 'machines' - CBOW :  0.9346203
Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.8836416
Cosine similarity between 'alice' and 'machines' - Skip Gram :  0.89018065
