# Unit testing of LBD_02_data_preprocessing.ipynb with pytest and ipytest

Unit testing is a good practice in software development whose purpose is to improve the reliability and quality of code by verifying that individual components (units) of a program work as expected. It makes the code more maintainable, scalable, and robust.

In this notebook we use the `pytest` library, a popular testing framework for Python that is best known for its simplicity and flexibility. `ipytest` is an extension of `pytest` designed specifically for use in Jupyter notebooks. It enables running `pytest` tests directly within a notebook, making it ideal for environments where data exploration, interactive analysis, and incremental development are common.

In [1]:
import logging

# Configure logging once for the notebook: INFO level, with a
# "YYYY-MM-DD HH:MM:SS: LEVEL - message" line layout.
logging.basicConfig(
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s: %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [None]:
import import_ipynb
import LBD_01_data_acquisition
import LBD_02_data_preprocessing

In [3]:
import nltk
import numpy as np
import itertools
import pandas as pd
# import pickle
# import json
import spacy
from typing import List, Dict
import re

In [4]:
import ipytest
# Set up ipytest for this notebook; called without arguments, so its defaults apply.
ipytest.autoconfig()

In [None]:
# Test suite for the function LBD_02_data_preprocessing.do_clean_text(corpus, keep_list, remove_list)

def test_do_clean_text_empty_corpus():
    """An empty corpus (with empty keep/remove lists) is expected to give an empty list."""
    cleaned = LBD_02_data_preprocessing.do_clean_text([], [], [])
    assert cleaned == []

def test_do_clean_text_empty_strings_in_corpus():
    """Empty and whitespace-only documents are expected to come back as empty strings."""
    blank_docs = ["", "   "]
    cleaned = LBD_02_data_preprocessing.do_clean_text(blank_docs, [], [])
    assert cleaned == ["", ""]

def test_do_clean_text_basic_cleaning():
    """Basic cleaning with empty keep/remove lists.

    The expected output is lower-cased, has punctuation and repeated
    whitespace removed, and no longer contains the one-letter word 'a'.
    """
    raw_docs = ["This   is, a   sentence! And     another;    one."]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    assert cleaned == ["this is sentence and another one"]

def test_do_clean_text_basic_cleaning_foreign_language():
    """Basic cleaning of non-English text containing the letters č, š and ž.

    The expected output keeps those letters (upper-case 'ČŠŽ' becomes 'čšž'),
    has punctuation removed and drops the one-letter word 'z'.
    """
    raw_docs = ["Tole je, stavek z ločili! Še eden stavek; in ponovno nov stavek. Posebni znaki čšž ČŠŽ."]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    assert cleaned == ["tole je stavek ločili še eden stavek in ponovno nov stavek posebni znaki čšž čšž"]

def test_do_clean_text_keep_list():
    """With 'i' and 'a' in the keep list, both one-letter words are expected to survive cleaning."""
    raw_docs = ["This is, a sentence! I keep a book and remove the desk."]
    words_to_keep = ["i", "a"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, words_to_keep, [])
    assert cleaned == ["this is a sentence i keep a book and remove the desk"]

def test_do_clean_text_remove_list():
    """Words named in the remove list ('book', 'the', 'desk') are expected to be dropped."""
    raw_docs = ["This is, a sentence! Keep a book and remove the desk."]
    words_to_remove = ["book", "the", "desk"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], words_to_remove)
    assert cleaned == ["this is sentence keep and remove"]

def test_do_clean_text_keep_and_remove_list():
    """Keep list and remove list used together.

    'i' is kept, 'book' and 'desk' are removed, and 'a' (in neither list)
    is expected to be dropped as well.
    """
    raw_docs = ["This is, a sentence! I keep a book and remove the desk."]
    words_to_keep = ["i"]
    words_to_remove = ["book", "desk"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, words_to_keep, words_to_remove)
    assert cleaned == ["this is sentence i keep and remove the"]

def test_do_clean_text_numeric_values():
    """Numeric tokens in the corpus.

    Stand-alone '123' and '456' are expected to disappear, '789' survives
    because it is in the keep list, and mixed tokens such as 'hello123'
    and '789keep' are expected to stay.
    """
    raw_docs = ["Hello123 123 world456 456, I would like to 789keep 789 this."]
    words_to_keep = ["789"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, words_to_keep, [])
    assert cleaned == ["hello123 world456 would like to 789keep 789 this"]

def test_do_clean_text_short_words():
    """Short words: 'a' is expected to be dropped, 'i' kept via the keep list, 'brown' removed via the remove list."""
    raw_docs = ["I saw a quick brown fox."]
    words_to_keep = ["i"]
    words_to_remove = ["brown"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, words_to_keep, words_to_remove)
    assert cleaned == ["i saw quick fox"]

def test_do_clean_text_mixed_case():
    """Mixed-case input with lower-case keep/remove entries.

    The expected output is fully lower-cased; upper-case 'REMOVE' is
    expected to match the remove-list entry 'remove'.
    """
    raw_docs = ["This IS, a SENTENCE! I KEEP a book and REMOVE the desk.!"]
    words_to_keep = ['sentence']
    words_to_remove = ['remove']
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, words_to_keep, words_to_remove)
    assert cleaned == ["this is sentence keep book and the desk"]

# Execute the tests through ipytest (presumably every test_* function defined in the notebook so far - confirm).
ipytest.run()

In [None]:
from nltk.corpus import stopwords
# Wh-words to take out of the stop-word set (the do_remove_stopwords tests
# in this notebook expect these words to be preserved).
wh_words = ['who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom']
# Set difference instead of calling set.remove() in a loop: a wh-word that is
# missing from the NLTK list is simply ignored rather than raising KeyError.
stop = set(stopwords.words('english')) - set(wh_words)
print("The stop word list: ", stop)

In [None]:
# Test suite for the function LBD_02_data_preprocessing.do_remove_stopwords(corpus)

def test_do_remove_stopwords_empty_corpus():
    """An empty (cleaned) corpus is expected to give an empty result."""
    cleaned = LBD_02_data_preprocessing.do_clean_text([], [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == []

def test_do_remove_stopwords_no_stopwords_in_corpus():
    """Documents without stopwords are expected to come back as their full lower-cased word lists."""
    raw_docs = ["Python programming", "Machine Learning"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["python", "programming"], ["machine", "learning"]]

def test_do_remove_stopwords_all_stopwords_in_corpus():
    """Documents made up only of stopwords are expected to become empty word lists."""
    raw_docs = ["the a in on", "and but if or"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [[], []]

def test_do_remove_stopwords_mixed_corpus_with_stopwords():
    """Mixed documents: only the non-stopwords are expected to remain.

    'i' is passed in the keep list for cleaning, yet it is not expected in
    the final output.
    """
    raw_docs = ["This is a test", "I am reading LBD book"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, ["i"], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["test"], ["reading", "lbd", "book"]]

def test_do_remove_stopwords_wh_words_preserved():
    """Wh-words ('who', 'why', 'where') are expected to survive stop-word removal."""
    raw_docs = ["Who are you", "Why is this happening", "Where is the book used"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["who"], ["why", "happening"], ["where", "book", "used"]]

def test_do_remove_stopwords_case_insensitivity():
    """Mixed-case input: after cleaning, capitalised stopwords are still expected to be removed."""
    raw_docs = ["This is A Next Test", "Who Knows Why"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["next", "test"], ["who", "knows", "why"]]

def test_do_remove_stopwords_numbers_and_symbols():
    """Input containing a number and a symbol.

    After cleaning and stop-word removal only 'euros' and 'book' are
    expected to remain; '100' and '#1' do not appear in the output.
    """
    raw_docs = ["100 euros", "This book is #1"]
    cleaned = LBD_02_data_preprocessing.do_clean_text(raw_docs, [], [])
    filtered = LBD_02_data_preprocessing.do_remove_stopwords(cleaned)
    assert filtered == [["euros"], ["book"]]

# Execute the tests through ipytest (presumably every test_* function defined in the notebook so far - confirm).
ipytest.run()


In [None]:
from nltk.stem import WordNetLemmatizer

# Test suite for the function do_lemmatize
def test_do_lemmatize_empty_corpus():
    """Lemmatizing an empty corpus is expected to give an empty list."""
    lemmatized = LBD_02_data_preprocessing.do_lemmatize([])
    assert lemmatized == []

def test_do_lemmatize_single_word():
    """A one-document, one-word corpus: 'running' is expected to become 'run'."""
    token_docs = [["running"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["run"]]

def test_do_lemmatize_multiple_words():
    """Several words in one document: 'running' -> 'run', 'jumps' -> 'jump', 'easily' unchanged."""
    token_docs = [["running", "jumps", "easily"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["run", "jump", "easily"]]

def test_do_lemmatize_no_lemmatization_needed():
    """Words already in base form are expected to pass through unchanged."""
    token_docs = [["run", "jump", "easily"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["run", "jump", "easily"]]

def test_do_lemmatize_mixed_case():
    """Mixed-case tokens are expected to come back exactly as given.

    The assertion expects no lower-casing and no lemmatization of
    'Running', 'JUMPS' and 'eAsIlY'.
    """
    token_docs = [["Running", "JUMPS", "eAsIlY"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["Running", "JUMPS", "eAsIlY"]]

def test_do_lemmatize_nouns_as_verbs():
    """Words that can be read as nouns or verbs: 'books' -> 'book', 'flies' -> 'fly', 'leaves' -> 'leave'."""
    token_docs = [["books", "flies", "leaves"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["book", "fly", "leave"]]

def test_do_lemmatize_irregular_verbs():
    """Irregular verb forms: 'went' -> 'go', 'done' -> 'do', 'taken' -> 'take', 'put' unchanged."""
    token_docs = [["went", "done", "taken", "put"]]
    lemmatized = LBD_02_data_preprocessing.do_lemmatize(token_docs)
    assert lemmatized == [["go", "do", "take", "put"]]

# Run the tests through ipytest (presumably every test_* function defined in the notebook so far - confirm).
ipytest.run()

In [None]:
#remove later
import re

def remove_unwanted_characters(text):
    """Return *text* with every character outside a-z and 0-9 deleted.

    Upper-case letters, whitespace and punctuation are all removed, so
    callers that want to keep letters regardless of case must lower-case
    the text before calling.
    """
    # Negated character class: matches anything that is NOT a-z or 0-9.
    disallowed = re.compile('[^a-z0-9]')
    return disallowed.sub('', text)

# Example usage: clean a sample sentence and print the result.
text = "Hello, World! 1234. Welcome to Python @2024.!"
cleaned_text = remove_unwanted_characters(text.lower())  # Lower-case first: the filter only keeps a-z and 0-9, so capitals would be dropped
print("Cleaned Text:", cleaned_text)


In [None]:
# Cell 2: Write a test
def test_addition():
    """Minimal sanity test: 1 + 1 must equal 2."""
    total = 1 + 1
    assert total == 2
    
# Cell 3: Run tests; '-vv' is passed as an argument (presumably pytest's extra-verbose flag - confirm)
ipytest.run('-vv')

# https://medium.com/@mefengl/using-pytest-in-jupyter-notebooks-a-practical-guide-1ba8e02af288

def my_func(x):
    """Floor-divide *x* by 2 and double the result (e.g. 3 -> 2, 2 -> 2)."""
    halved = x // 2
    return halved * 2

%%ipytest
# To execute the tests, just decorate the cells containing tests with the %%ipytest magic:

# define the tests

def test_my_func():
    """Check my_func for the inputs 0, 1, 2 and 3 (expected 0, 0, 2 and 2)."""
    cases = ((0, 0), (1, 0), (2, 2), (3, 2))
    for argument, wanted in cases:
        assert my_func(argument) == wanted

# To execute tests without IPython magics, call the ipytest.run function directly
ipytest.run()

%%ipytest
#Using pytest fixtures - Common pytest features, such as fixtures and parametrize, are supported out of the box:

import pytest

@pytest.mark.parametrize(
    'value,expected',
    [
        (0, 0),
        (1, 0),
        (2, 2),
        (3, 2),
    ],
)
def test_parametrized(value, expected):
    """Check my_func against each (value, expected) pair supplied by parametrize."""
    # The argument is called `value` rather than `input` so that the builtin
    # input() is not shadowed inside the test.
    assert my_func(value) == expected

@pytest.fixture
def my_fixture():
    """Fixture that supplies the constant 42 to tests requesting it."""
    fixture_value = 42
    return fixture_value
    
def test_fixture(my_fixture):
    """The value received through the my_fixture argument must equal 42."""
    wanted = 42
    assert my_fixture == wanted