In [1]:
import collections
import glob
import json
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Directory holding the raw per-dataset paragraph pickles.
folder_text = "../../../data/OpenAI/TextSnippets/"

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these files if they come from a trusted source.
# Each file is opened in a context manager so its handle is closed as soon as
# the pickle has been read, instead of staying open for the kernel's lifetime.
with open(F"{folder_text}paragraphs_caribbean.pkl", 'rb') as f:
    caribbean_text_dict = pickle.load(f)

with open(F"{folder_text}paragraphs_palms.pkl", 'rb') as f:
    palms_text_dict = pickle.load(f)

with open(F"{folder_text}paragraphs_plantnet.pkl", 'rb') as f:
    plantnet_text_dict = pickle.load(f)

In [3]:
def is_valid_text(text):
    """
    Checks whether a text consists solely of whitelisted plain characters.

    Args:
        text (str): The text to validate.

    Returns:
        bool: True if the text is non-empty and every character is an ASCII letter, a digit, a space, a comma, a period, an apostrophe or a hyphen; False otherwise.
    """
    # Character whitelist; the hyphen is last in the class so it is a literal, not a range.
    pattern = r'[a-zA-Z0-9 ,.\'-]+'
    # fullmatch requires the pattern to span the whole string. Anchoring with
    # '^...$' and re.match would also accept a trailing newline, because '$'
    # matches just before a final '\n'.
    return re.fullmatch(pattern, text) is not None

def regex_cleaner(string):
    """
    Normalizes the whitespace of a string.

    Args:
        string (str): The text to clean.

    Returns:
        str: The text with every run of whitespace characters (spaces, tabs, newlines, ...) collapsed into a single space and with leading/trailing whitespace removed.
    """
    # The pattern is a raw string: a plain "\s" literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    # One substitution is enough: once every whitespace run has become a single
    # space, no newline or tab is left in the string, so separate steps that
    # collapse repeated newlines or tabs could never match anything.
    return re.sub(r"\s+", " ", string).strip()

def filter_species_dict(text_dict, min_length=2, max_length=100_000):
    """
    Filters the descriptions in a dictionary of species, removing invalid text and duplicates.

    Every description is whitespace-normalized with regex_cleaner and kept only if the
    length of the cleaned text lies within the accepted bounds. Duplicates are removed
    while keeping the order of first appearance, so the output is the same on every run.

    Args:
        text_dict (dict): A dictionary where the keys are the species names and the values are lists of text descriptions.
        min_length (int): Minimum number of characters (inclusive) a cleaned description must have. Defaults to 2.
        max_length (int): Upper bound (exclusive) on the number of characters of a cleaned description. Defaults to 100000.

    Returns:
        dict: A filtered dictionary where the keys are the species names and the values are lists of valid and unique cleaned text descriptions.
    """

    valid_species_dict = {}

    # Loop through each species and its descriptions in the dictionary
    for idx, (species, descriptions) in enumerate(tqdm(text_dict.items(), leave=False, position=0)):
        # Create a nested progress bar for the descriptions of this species
        species_pbar = tqdm(descriptions, leave=False, position=1, desc=f"{idx} {species}")

        # Dict keys are unique and keep insertion order, which de-duplicates the
        # descriptions deterministically (a set would yield an order that can
        # differ between interpreter runs).
        unique_descriptions = {}
        for description in species_pbar:
            cleaned_description = regex_cleaner(description)
            # NOTE: the stricter is_valid_text character whitelist is not applied
            # here; only the length bounds are enforced.
            # regex_cleaner strips the text, so a cleaned description that passes
            # the length check cannot consist of whitespace only.
            if min_length <= len(cleaned_description) < max_length:
                unique_descriptions[cleaned_description] = None

        # Add the valid descriptions for the species to the valid_species_dict
        valid_species_dict[species] = list(unique_descriptions)

    return valid_species_dict


In [4]:
# Clean the PlantNet paragraphs. The Caribbean and palms dictionaries can be
# cleaned the same way by re-enabling the two calls below.
# caribbean_text_dict_filtered = filter_species_dict(text_dict=caribbean_text_dict)
# palms_text_dict_filtered = filter_species_dict(text_dict=palms_text_dict)
plantnet_text_dict_filtered = filter_species_dict(text_dict=plantnet_text_dict)

                                                 

In [6]:
# Output directory for the cleaned pickles.
# NOTE: this rebinds folder_text, which the loading cell uses for the raw-data
# directory; re-run that cell before loading the raw pickles again.
folder_text = "../../../data/OpenAI/TextSnippetsCleaned/"

# Make sure the output directory exists so the dump below cannot fail with
# FileNotFoundError after the (potentially long) filtering step has finished.
os.makedirs(folder_text, exist_ok=True)

# with open(F"{folder_text}paragraphs_caribbean_cleaned.pkl", 'wb') as f:
#     pickle.dump(caribbean_text_dict_filtered, f)

# with open(F"{folder_text}paragraphs_palms_cleaned.pkl", 'wb') as f:
#     pickle.dump(palms_text_dict_filtered, f)

# Persist the cleaned PlantNet descriptions.
with open(F"{folder_text}paragraphs_plantnet_cleaned.pkl", 'wb') as f:
    pickle.dump(plantnet_text_dict_filtered, f)