# ❄️ Snowflake Chat with your Documents Notebook ❄️

Includes:
- Cortex Parse Document
- Cortex Search Service
- Cortex Fine-Tuning

In [None]:
# Import necessary functions
import streamlit as st
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Pull the RAG architecture diagram out of the stage as raw bytes
# (decompress=False: the PNG is read exactly as it is stored).
image_stream = session.file.get_stream(
    "@DATASCIENCECOLLEGE.PUBLIC.RAG/RAG_flow.png",
    decompress=False,
)
image = image_stream.read()

# Render the diagram in the notebook at a fixed width
st.image(image, width=1000)

In [None]:
import snowflake.snowpark as snowpark

from snowflake.snowpark.context import get_active_session
session = get_active_session()

In [None]:
-- List files in the stage to identify PDFs
LS @DATASCIENCECOLLEGE.PUBLIC.RAG;

In [None]:
-- Target table for parsed documents: the staged file's path plus the PARSE_DOCUMENT output.
-- NOTE: CREATE OR REPLACE discards any existing rows, so re-running this cell means
-- every PDF is treated as "not yet parsed" by the parsing cell that follows.
CREATE OR REPLACE TABLE PARSED_TEXT (relative_path VARCHAR(500), raw_text VARIANT);

In [None]:
from snowflake.snowpark.functions import col, to_variant

# Query to fetch distinct PDF files from the stage
files_df = session.sql("""
    SELECT DISTINCT METADATA$FILENAME AS file_name
    FROM @DATASCIENCECOLLEGE.PUBLIC.RAG
    WHERE METADATA$FILENAME ILIKE '%.pdf'
""").collect()

# Load the paths that are already parsed once, instead of issuing one
# lookup query against PARSED_TEXT per staged file.
already_parsed = {
    parsed_row[0]
    for parsed_row in session.table("PARSED_TEXT").select("relative_path").collect()
}

# Parse every staged PDF that is not yet present in the target table
for row in files_df:
    file_name = row['FILE_NAME']

    # Skip files that have already been parsed
    if file_name in already_parsed:
        continue

    # The file name is embedded in a SQL string literal below; escape
    # backslashes and single quotes so an unusual name cannot break the statement.
    safe_file_name = file_name.replace("\\", "\\\\").replace("'", "''")

    # Extract raw text using the PARSE_DOCUMENT function
    parse_result = session.sql(f"""
        SELECT SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            '@DATASCIENCECOLLEGE.PUBLIC.RAG',
            '{safe_file_name}',
            OBJECT_CONSTRUCT('mode', 'OCR')
        ) AS raw_text
    """).collect()

    # Nothing to insert if the parse query returned no rows
    if not parse_result:
        continue

    raw_text = parse_result[0]['RAW_TEXT']

    # Build a one-row DataFrame and cast raw_text to VARIANT to match the table column
    df_to_insert = session.create_dataframe(
        [(file_name, raw_text)],
        schema=["relative_path", "raw_text"]
    ).select(
        col("relative_path"),
        to_variant(col("raw_text")).alias("raw_text")  # Explicitly cast to VARIANT
    )

    # Insert the DataFrame into the PARSED_TEXT table
    df_to_insert.write.mode("append").save_as_table("PARSED_TEXT")
    already_parsed.add(file_name)

print("PDF files parsed successfully.")

Here's an alternative approach using SQL to create and then call a procedure using the PARSE_DOCUMENT function.

 -- Convert this cell to SQL. Remove the triple-backtick fence lines at the top & bottom to run the code.
 ```
 -- Create a procedure that will use PARSE_DOCUMENT to parse distinct PDFs
 CREATE OR REPLACE PROCEDURE parse_pdf()
   RETURNS STRING NOT NULL
   LANGUAGE JAVASCRIPT
   EXECUTE AS CALLER
 AS
 $$
   // Name of the staged file currently being processed.
   var file_name;
   // List each distinct staged file whose name ends in ".pdf" (case-insensitive).
   // The POSITION(...) predicate is already implied by the ILIKE filter.
   var rs = snowflake.execute({
     sqlText: `
       SELECT DISTINCT METADATA$FILENAME AS file_name
       FROM @DATASCIENCECOLLEGE.PUBLIC.RAG
       WHERE METADATA$FILENAME ILIKE '%.pdf'
         AND POSITION('.pdf' IN LOWER(METADATA$FILENAME)) > 0
     `
   });

   // Walk the result set one file at a time.
   while (rs.next()) {
     file_name = rs.getColumnValue(1);

     // Check if the file has already been parsed
     var check_rs = snowflake.execute({
       sqlText: `
         SELECT 1 
         FROM PARSED_TEXT 
         WHERE relative_path = ?
         LIMIT 1
       `,
       binds: [file_name]
     });

     // If the file has not been processed, parse it and insert the result
     if (!check_rs.next()) {
       // file_name is bound twice: once as the stored relative_path and once
       // as the file argument passed to PARSE_DOCUMENT.
       snowflake.execute({
         sqlText: `
           INSERT INTO PARSED_TEXT (relative_path, raw_text)
           SELECT ?, SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
             '@DATASCIENCECOLLEGE.PUBLIC.RAG',
             ?,
             OBJECT_CONSTRUCT('mode', 'OCR')
           ) AS raw_text
         `,
         binds: [file_name, file_name]
       });
     }
   }
   // The same message is returned whether or not any new file was parsed.
   return 'PDF files parsed successfully.';
 $$;
 ```

-- Convert this cell to SQL. Remove the triple-backtick fence lines at the top & bottom to run the code.
```
-- Use procedure to create one record for each PDF in the stage
-- CALL parse_pdf();
```

In [None]:
-- Preview up to five parsed documents to confirm the parsing step populated the table
SELECT RELATIVE_PATH, RAW_TEXT FROM PARSED_TEXT LIMIT 5;

Now let's chunk our parsed data using LangChain (this will be updated to use a Cortex function when one is available).

In [None]:
from snowflake.snowpark.types import StructType, StructField, StringType
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define the text chunker class
class text_chunker:
    """UDTF handler that splits one document's text into overlapping chunks."""

    def process(self, text):
        """Yield one ``(chunk, meta)`` tuple per chunk of ``text``.

        Args:
            text: The document text to split. ``None`` (a SQL NULL) or an
                empty string produces no rows.

        Yields:
            Tuples of ``(chunk, meta)`` where ``chunk`` is the chunk text and
            ``meta`` is the string form of the chunk's metadata dict (which
            carries the start index, because ``add_start_index=True``).
        """
        # A NULL/empty document has nothing to split; emit no rows instead of
        # handing None to the splitter, which would fail the whole query.
        if not text:
            return

        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n"],  # Define an appropriate separator. New line is good typically!
            chunk_size=1000,     # Adjust this as you see fit
            chunk_overlap=200,    # This lets text have some form of overlap. Useful for keeping chunks contextual
            length_function=len,
            add_start_index=True  # Optional but useful if you'd like to feed the chunk before/after
        )

        for chunk in text_splitter.create_documents([text]):
            # Drop anything that cannot be encoded as UTF-8 so the row can be stored
            clean_text = chunk.page_content.encode('utf-8', 'ignore').decode('utf-8')
            yield clean_text, str(chunk.metadata)

# Output columns of the UDTF: the chunk text and its stringified metadata
schema = StructType(
    [
        StructField("chunk", StringType()),
        StructField("meta", StringType()),
    ]
)

# Register the chunker as the permanent UDTF CHUNK_TEXT, replacing any
# existing definition; its artifacts are stored in the RAG stage.
session.udtf.register(
    handler=text_chunker,
    name='CHUNK_TEXT',
    input_types=[StringType()],
    output_schema=schema,
    packages=['pandas', 'langchain'],
    is_permanent=True,
    stage_location='@DATASCIENCECOLLEGE.PUBLIC.RAG',
    replace=True,
)

In [None]:
-- Create the chunked version of your parsed text
-- Each parsed document is passed (as text) through the CHUNK_TEXT table function,
-- producing one output row per chunk. Every chunk is prefixed with the file's
-- relative path, and a scoped file URL for the source PDF is stored alongside it.
-- The table and the table function share the name CHUNK_TEXT.
CREATE OR REPLACE TABLE DATASCIENCECOLLEGE.PUBLIC.CHUNK_TEXT AS
    SELECT
        raw.relative_path,
        build_scoped_file_url('@DATASCIENCECOLLEGE.PUBLIC.RAG', raw.relative_path) AS file_url,
        CONCAT(raw.relative_path, ': ', func.chunk) AS chunk,
        'English' AS language,
        func.meta AS meta_info
    FROM
        DATASCIENCECOLLEGE.PUBLIC.PARSED_TEXT AS raw,
        TABLE(DATASCIENCECOLLEGE.PUBLIC.CHUNK_TEXT(TO_VARCHAR(raw.raw_text))) AS func;

In [None]:
-- Preview up to five chunk rows to confirm the chunking step worked
SELECT * FROM CHUNK_TEXT LIMIT 5;

In [None]:
-- Create a search service over your new chunked pdf table
-- The service searches the CHUNK column, declares LANGUAGE as an attribute,
-- and also carries RELATIVE_PATH from the source query.
-- NOTE(review): TARGET_LAG of '365 days' presumably means the service is
-- rarely refreshed automatically -- confirm this matches how often new PDFs arrive.
CREATE OR REPLACE CORTEX SEARCH SERVICE DATASCIENCECOLLEGE.PUBLIC.CHUNK_TEXT_SEARCH_SERVICE
    ON CHUNK
    ATTRIBUTES LANGUAGE
    WAREHOUSE = DATASCIENCECOLLEGE
    TARGET_LAG = '365 days'
    AS (
    SELECT
        CHUNK,
        RELATIVE_PATH,
        LANGUAGE
    FROM DATASCIENCECOLLEGE.PUBLIC.CHUNK_TEXT
    );

# Next Step:

Please create the associated "Chat with your Documents" SiS app. This will allow users to interactively leverage the Cortex Search RAG.

Once users have saved sufficient questions and customized/corrected answers via the SiS app, return to this Snowflake Notebook to create a customized fine-tuned Cortex model.


In [None]:
-- Tag every saved Q&A row as 'train' or 'validation' for fine-tuning.
-- IF NOT EXISTS lets this cell be re-run without failing on the existing column.
ALTER TABLE QA_TABLE
ADD COLUMN IF NOT EXISTS source VARCHAR;

-- RANDOM() returns a 64-bit signed integer, not a fraction between 0 and 1, so
-- comparing it directly with 0.7 splits the rows roughly 50/50. UNIFORM maps
-- it onto a float in [0, 1] to get the intended ~70/30 train/validation split.
UPDATE QA_TABLE
SET
  source = CASE
    WHEN UNIFORM(0::FLOAT, 1::FLOAT, RANDOM()) <= 0.7 THEN 'train'
    ELSE 'validation'
  END;

In [None]:
-- Start a Cortex fine-tuning job that creates 'customized_QA_model' from the
-- 'mistral-7b' base model. The two query strings supply prompt/completion pairs:
-- the first selects the rows tagged 'train', the second the rows tagged 'validation'.
SELECT
  SNOWFLAKE.CORTEX.FINETUNE (
    'CREATE',
    'customized_QA_model',
    'mistral-7b',
    'SELECT RAGQUESTION as prompt, RAGANSWER as completion FROM QA_TABLE WHERE source = \'train\'',
    'SELECT RAGQUESTION as prompt, RAGANSWER as completion FROM QA_TABLE WHERE source = \'validation\''
  );

# Next Step:

Return to the "Chat with your Documents" SiS App. 

Update lines 12 to 17 with the following code to include your customized_QA_model:

```
# Set to False to hide the fine-tuned model from the app
ENABLED_CUSTOM_QA_MODELS = True

# Base Cortex models that are always offered in the model picker
MODELS = [
    "mistral-large",
    "snowflake-arctic",
    "llama3-70b",
    "llama3-8b",
]

# Add the fine-tuned model only when enabled, so it is listed exactly once
if ENABLED_CUSTOM_QA_MODELS:
    MODELS.append("customized_QA_model")
```

Now update lines 101 to 137 with the updated init_config_options function below. It adds a checkbox that lets the user choose to use their custom model:

```
def init_config_options():
    """Render the sidebar controls and store the user's choices in session state.

    Draws the search-service picker, the "Clear conversation" button, the
    chat-history toggle and the "Advanced options" expander, then sets
    ``st.session_state.active_model`` to the customized Q&A model when its
    checkbox is ticked, or to the model chosen in the select box otherwise.
    """
    state = st.session_state
    sidebar = st.sidebar

    service_names = [service["name"] for service in state.service_metadata]
    sidebar.selectbox(
        "Select Cortex Search Service:",
        service_names,
        key="selected_cortex_search_service",
    )

    if sidebar.button("Clear conversation"):
        state.clear_conversation = True
        init_messages()
        state.generated_response = ""
        state.results = []
        state.pdf_filename = None
        # Bumping these keys resets the selected and the user-typed question
        state.selected_question_key += 1
        state.question_key += 1
        state.user_question = ""  # Reset the user-defined question to blank

    state.use_chat_history = sidebar.checkbox(
        "Use chat history", value=state.use_chat_history
    )

    with sidebar.expander("Advanced options"):
        st.selectbox("Select model:", MODELS, key="model_name")

        # Both numeric inputs share the same 1-10 range
        numeric_options = (
            ("Select number of context chunks", "num_retrieved_chunks"),
            ("Select number of messages to use in chat history", "num_chat_messages"),
        )
        for label, state_key in numeric_options:
            st.number_input(label, key=state_key, min_value=1, max_value=10)

        # The customized-model checkbox is only offered when the feature is enabled
        if ENABLED_CUSTOM_QA_MODELS:
            use_custom_model = st.checkbox(
                "Use customized Q&A model", key="use_customized_qa_model"
            )
        else:
            use_custom_model = False

    # Determine which model is currently active
    if use_custom_model:
        state.active_model = "customized_QA_model"
    else:
        state.active_model = state.model_name
```

Now, at line 197, update the complete function to use the customized_QA_model if the user selects the "Use customized Q&A model" checkbox.
```
def complete(model, prompt):
    """Generate a completion for ``prompt``, escaping dollar signs in the result.

    Args:
        model: Name of the model to use. Overridden by "customized_QA_model"
            when the "use_customized_qa_model" checkbox is ticked.
        prompt: The prompt text passed to ``Complete``.

    Returns:
        The completion with every ``$`` replaced by ``\\$``.
    """
    # Use customized Q&A model if selected
    if st.session_state.get("use_customized_qa_model", False):
        model = "customized_QA_model"
    # "\\$" is a literal backslash followed by "$"; the unescaped form "\$" is
    # an invalid escape sequence that newer Python versions warn about.
    # NOTE(review): presumably escaped so the "$" is not rendered as LaTeX -- confirm.
    return Complete(model, prompt).replace("$", "\\$")
```

Finally, select the "Run" button so that your new code is used, then test your updated application. Remember to open "Advanced options" in the sidebar and check "Use customized Q&A model" when testing new questions.