In [None]:
-- Parse every DOCX file on the stage with Cortex PARSE_DOCUMENT (layout mode).
-- The file path is taken from the directory table's RELATIVE_PATH column
-- rather than carved out of the URL-encoded FILE_URL: the old approach only
-- decoded '%2e', so names containing other encoded characters (spaces, etc.)
-- or files stored in a sub-folder produced a path PARSE_DOCUMENT could not find.
WITH files AS (
  SELECT
    relative_path AS filename
  FROM DIRECTORY('@avalanche_db.avalanche_schema.avalanche_stage')
  -- ILIKE so upper-case extensions such as '.DOCX' are picked up as well
  WHERE relative_path ILIKE '%.docx'
)
SELECT
  filename,
  SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
    @avalanche_db.avalanche_schema.avalanche_stage,
    filename,
    {'mode': 'layout'}
  ):content AS layout  -- keep only the "content" field of the returned object
FROM files;

In [None]:
-- Pull PRODUCT, DATE and CUSTOMER_REVIEW out of the LAYOUT text of each file.
-- The heading position is computed once in a CTE and reused below.
WITH located AS (
    SELECT
        filename,
        layout,
        -- 1-based offset of the review heading; 0 when the heading is absent
        CHARINDEX('Customer Review', layout) AS review_heading_pos
    FROM {{cell1}}
)
SELECT
    filename,
    -- 'e' returns the first capture group instead of the whole match
    REGEXP_SUBSTR(layout, 'Product: (.*?) Date:', 1, 1, 'e') AS product,
    REGEXP_SUBSTR(layout, 'Date: (202[0-9]-[0-9]{2}-[0-9]{2})', 1, 1, 'e') AS date,
    -- Everything after the heading is the review body; NULL if no heading
    IFF(
        review_heading_pos > 0,
        SUBSTR(layout, review_heading_pos + LENGTH('Customer Review')),
        NULL
    ) AS customer_review
FROM located;

In [None]:
-- (Re)create the destination table for the structured review data.
-- CREATE OR REPLACE drops any previous copy, so re-running the notebook
-- starts from an empty table.
CREATE OR REPLACE TABLE avalanche_db.avalanche_schema.parsed_reviews (
    filename        VARCHAR,  -- source file name on the stage
    product         VARCHAR,  -- product name extracted from the document
    review_date     DATE,     -- review date extracted from the document
    customer_review TEXT      -- text following the 'Customer Review' heading
);

-- Load the fields extracted by cell2 into the parsed_reviews table.
-- The extraction regex only checks the digit pattern (it would accept e.g.
-- '2024-13-45'), so the conversion uses TRY_TO_DATE with an explicit format:
-- an impossible date becomes NULL for that row instead of raising an error
-- that aborts the whole INSERT.
INSERT INTO avalanche_db.avalanche_schema.parsed_reviews (filename, product, review_date, customer_review)
SELECT
    filename,
    product,
    TRY_TO_DATE(date, 'YYYY-MM-DD') AS review_date,
    customer_review
FROM {{cell2}};  -- cell2 is the extraction query defined in the previous cell

In [None]:
-- Inspect the rows that were loaded into the parsed_reviews table
SELECT
    filename,
    product,
    review_date,
    customer_review
FROM avalanche_db.avalanche_schema.parsed_reviews;