<a href="https://colab.research.google.com/github/janbradlie/jhay/blob/main/Python_Script_for_Product_Description_Transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import io
import re
import html

def strip_html_tags(html_content):
    """
    Convert an HTML fragment to plain text.

    Line structure is kept where the markup implies it:
      * ``<br>`` becomes a newline;
      * ``</p>`` becomes a blank line (paragraph break);
      * ``<li>`` starts a new line prefixed with ``"* "``;
      * other block-level tags (``<p>``, ``<div>``, ``<ul>``, ``<ol>``,
        ``<h1>``-``<h6>``, ``<table>``, ``<tr>``, ``<blockquote>``), opening
        or closing, become a newline so that neighbouring blocks are not
        glued together (e.g. ``</ul><p>`` or ``</h2><p>``).
    Every remaining tag is removed and HTML entities are unescaped.

    Args:
        html_content: HTML source. Non-string values are converted with
            ``str()``; any falsy value yields an empty string.

    Returns:
        Plain text in which every line is stripped, runs of blank lines are
        collapsed to a single blank line, and there is no leading or
        trailing whitespace.
    """
    if not html_content:
        return ""
    text = str(html_content)

    # Tags that carry line structure are turned into newlines first.
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    text = re.sub(r'</p\s*>', '\n\n', text, flags=re.IGNORECASE)
    # \b keeps this from matching other tags that merely start with "li"
    # (such as <link>).
    text = re.sub(r'<li\b[^>]*>', '\n* ', text, flags=re.IGNORECASE)
    # Remaining block-level boundaries: without a newline here, text after
    # </ul>, </div>, </h2>, ... would be appended to the previous line.
    text = re.sub(
        r'</?(?:p|div|ul|ol|h[1-6]|table|tr|blockquote)\b[^>]*>',
        '\n',
        text,
        flags=re.IGNORECASE,
    )
    # Everything else (inline tags, </li>, comments, ...) is dropped.
    text = re.sub(r'<[^>]+>', '', text)
    # Unescape only after tag removal so "&lt;b&gt;" stays literal text.
    text = html.unescape(text)

    # Strip each line and keep at most one blank line between content lines.
    cleaned_lines = []
    for line in (raw_line.strip() for raw_line in text.split('\n')):
        if line:
            cleaned_lines.append(line)
        elif cleaned_lines and cleaned_lines[-1]:
            cleaned_lines.append("")

    # The final strip removes a trailing blank line, if any.
    return '\n'.join(cleaned_lines).strip()

def format_content_to_html_elements(content_list):
    """
    Render plain-text items as HTML ``<p>`` and ``<ul>`` elements.

    Items that start with ``"* "`` (after stripping) are list items;
    consecutive list items are grouped into one ``<ul>``. Every other
    non-empty item becomes its own ``<p>``. Blank items are ignored and do
    not interrupt a list.

    The items are treated as plain text: ``&``, ``<`` and ``>`` are escaped
    so the result is well-formed HTML and text cannot inject markup.

    Args:
        content_list: Iterable of strings (paragraphs or ``"* "`` bullets).

    Returns:
        The generated elements joined by newlines; ``""`` when there is
        nothing to render.
    """
    section_html_parts = []
    pending_items = []

    def close_open_list():
        # Emit the accumulated <li> elements as one <ul>, if there are any.
        if pending_items:
            section_html_parts.append(
                "<ul>\n" + "\n".join(pending_items) + "\n</ul>"
            )
            pending_items.clear()

    for text_item in content_list:
        stripped = text_item.strip()
        if not stripped:
            continue

        if stripped.startswith("* "):
            # Drop the "* " marker; the rest is the bullet text.
            item_text = html.escape(stripped[2:].strip(), quote=False)
            pending_items.append(f"<li>{item_text}</li>")
        else:
            # A paragraph ends any list that is currently open.
            close_open_list()
            section_html_parts.append(
                f"<p>{html.escape(stripped, quote=False)}</p>"
            )

    close_open_list()
    return "\n".join(section_html_parts)

def generate_structured_html(description_text):
    """
    Turn a plain-text product description into sectioned HTML.

    The text is split into lines/paragraphs at newlines. Each one is put in
    exactly one bucket:

    1. "Additional Information" - stock phrases: a non-bullet paragraph
       that equals one of the known slogans (case-insensitive), or a
       ``"* "`` bullet that contains a known slogan and one of the stock
       bullet markers.
    2. The first non-Description section (in definition order) with a
       keyword that occurs as a substring of the lower-cased paragraph.
    3. "Description" - everything else.

    Sections are emitted in definition order (Description first), each as
    an ``<h3>`` heading followed by the output of
    ``format_content_to_html_elements``; "Additional Information" comes
    last. Empty sections are omitted.

    Args:
        description_text: Plain text, e.g. the result of ``strip_html_tags``.

    Returns:
        The HTML string, or a default one-paragraph blurb when the input is
        empty or contains only whitespace.
    """
    default_html = "<p>Discover this beautiful selection, handcrafted with care.</p>"
    if not description_text:
        return default_html

    # Any newline (plus the whitespace after it) separates paragraphs.
    paragraphs = []
    for chunk in re.split(r'\n\s*\n*', description_text):
        chunk = chunk.strip()
        if chunk:
            paragraphs.append(chunk)
    if not paragraphs:
        return default_html

    # (title, keywords) in output order. The "Description" keywords are
    # never matched against: Description is the catch-all bucket.
    section_definitions = [
        ("Description", ["description", "overview", "details", "experience", "discover", "celebrate", "make it special", "surrender to", "embrace", "capture", "journey through", "harness", "announce", "honor", "float away", "convey gentle"]),
        ("What's Included", ["include", "contains", "consist", "composed of", "feature", "arrangement comprises", "recipe", "what's in it", "key features", "ingredients", "components"]),
        ("Ideal For", ["perfect for", "ideal for", "great for", "occasion", "celebrate", "suited for", "recommended for", "use for"]),
        ("Flower Care", ["care", "instruction", "maintain", "keep fresh", "prolong", "vase life", "tip", "how to care"]),
        ("Delivery Information", ["delivery", "shipping", "dispatch", "deliver", "same-day delivery"]),
    ]

    # Slogans / boilerplate lines that are grouped under their own heading.
    stock_phrases = tuple(
        phrase.lower()
        for phrase in (
            "make it special, make it fresh.",
            "order now—because flowers say it best.",
            "handcrafted in manila",
            "comes with care tips + flower food",
            "same-day delivery in metro manila",
            "personalized message included",
        )
    )
    # A bullet must additionally contain one of these to count as stock.
    stock_bullet_markers = (
        "handcrafted in manila",
        "comes with care tips",
        "same-day delivery",
        "personalized message",
    )

    buckets = {title: [] for title, _ in section_definitions}
    additional_info = []

    for paragraph in paragraphs:
        lowered = paragraph.lower()

        if paragraph.startswith("* "):
            is_stock = (
                any(phrase in lowered for phrase in stock_phrases)
                and any(marker in lowered for marker in stock_bullet_markers)
            )
        else:
            # Non-bullets only count when they are exactly a stock phrase.
            is_stock = lowered in stock_phrases
        if is_stock:
            additional_info.append(paragraph)
            continue

        # Default bucket unless a more specific section claims the paragraph.
        target = "Description"
        for title, keywords in section_definitions:
            if title == "Description":
                continue
            if not any(keyword in lowered for keyword in keywords):
                continue
            if (
                title == "Delivery Information"
                and "same-day delivery in metro manila" in lowered
                and paragraph.startswith("*")
            ):
                # The standard same-day bullet is deliberately not filed
                # under Delivery Information; keep looking (and fall back
                # to Description if nothing else matches).
                continue
            target = title
            break
        buckets[target].append(paragraph)

    html_parts = []
    for title, _ in section_definitions:
        if buckets[title]:
            html_parts.append(f"<h3>{title}</h3>")
            html_parts.append(format_content_to_html_elements(buckets[title]))

    if additional_info:
        html_parts.append("<h3>Additional Information</h3>")
        html_parts.append(format_content_to_html_elements(additional_info))

    if not html_parts:
        # Defensive fallback: emit the paragraphs with minimal formatting.
        fallback = "\n".join(f"<p>{p}</p>" for p in paragraphs if p)
        if fallback:
            return f"<h3>Details</h3>\n{fallback}"
        return "<p>Discover this beautiful selection.</p>"

    return "\n".join(html_parts)


def process_csv(csv_content_string):
    """
    Rewrite the product-description column of a CSV export.

    The delimiter is sniffed from the first 2048 characters, falling back
    to the default (comma) dialect when sniffing fails. The description
    column is "Body (HTML)" or, if that is absent, the first column whose
    name contains "body" or "description" (case-insensitive).

    For every row that has a description or a non-empty "Title", the
    description is converted to plain text with ``strip_html_tags`` and
    replaced by the output of ``generate_structured_html``. All other rows
    (e.g. image-only rows) are copied unchanged.

    Args:
        csv_content_string: The complete CSV document as one string.

    Returns:
        The updated CSV as a string, written with the ``csv`` module's
        default dialect and the original column order.

    Raises:
        ValueError: If there is no header row or no description column.
    """
    source = io.StringIO(csv_content_string)
    try:
        sample = source.read(2048)
        detected_dialect = csv.Sniffer().sniff(sample)
        source.seek(0)  # rewind: the sample consumed part of the stream
        reader = csv.DictReader(source, dialect=detected_dialect)
    except csv.Error:
        # Sniffing failed; read with the default comma-separated dialect.
        source.seek(0)
        reader = csv.DictReader(source)

    fieldnames = reader.fieldnames
    if not fieldnames:
        raise ValueError("CSV file is empty or header row is missing.")

    description_column_name = "Body (HTML)"
    if description_column_name not in fieldnames:
        # Fall back to the first column that looks like a description.
        candidate = next(
            (
                name
                for name in fieldnames
                if "body" in name.lower() or "description" in name.lower()
            ),
            None,
        )
        if candidate is None:
            raise ValueError(f"CSV must contain a product description column. Expected '{description_column_name}' or similar.")
        description_column_name = candidate

    def transform(row):
        # Copy using the header's columns so every field is present.
        updated = {field: row.get(field, '') for field in fieldnames}
        original_description = row.get(description_column_name, "")
        # Rows with neither a description nor a Title (typically extra
        # image rows) keep the description value copied above.
        if original_description or row.get("Title"):
            plain_text = strip_html_tags(original_description)
            updated[description_column_name] = generate_structured_html(plain_text)
        return updated

    output_rows = [transform(row) for row in reader]

    destination = io.StringIO()
    writer = csv.DictWriter(destination, fieldnames=list(fieldnames))
    writer.writeheader()
    writer.writerows(output_rows)
    return destination.getvalue()