In [None]:
# Connect to Snowflake and open a Snowpark session for the rest of the notebook
import os
from getpass import getpass

from snowflake.snowpark.session import Session

# Credentials are taken from environment variables (with an interactive prompt
# as a fallback) rather than being typed into the notebook, so secrets are never
# saved or committed together with the file.
#   SNOWFLAKE_ACCOUNT / SNOWFLAKE_USER / SNOWFLAKE_PASSWORD : required
#   SNOWFLAKE_ROLE / _WAREHOUSE / _DATABASE / _SCHEMA       : optional overrides
# NOTE(review): ACCOUNTADMIN remains the default role for backward compatibility;
# consider a less privileged role for routine analysis.
connection_params = {
  "account": os.environ.get("SNOWFLAKE_ACCOUNT") or input("Snowflake account: "),
  "user": os.environ.get("SNOWFLAKE_USER") or input("Snowflake user: "),
  # getpass keeps the password out of the cell output and the notebook file
  "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
  "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
  "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
  "database": os.environ.get("SNOWFLAKE_DATABASE", "AVALANCHE_DB"),
  "schema": os.environ.get("SNOWFLAKE_SCHEMA", "AVALANCHE_SCHEMA")
}

session = Session.builder.configs(connection_params).create()

In [None]:
# Point a Snowpark DataFrame at the parsed_reviews table
df = session.table("parsed_reviews")

# Preview the first 10 rows to sanity-check the contents
df.show(10)

In [None]:
# View the schema to understand the column structure.
# A notebook cell only renders its final expression, so the schema is printed
# explicitly; as a bare expression in the middle of the cell it was discarded.
print(df.schema)

# Count the number of records (rendered as the cell's output)
df.count()

In [None]:
from snowflake.snowpark.functions import col, trim

# Remove rows whose review text is missing, empty, or made up only of spaces.
# The text is trimmed before the comparison because a space-only review would
# otherwise pass this filter and collapse to an empty string when the review
# text is trimmed during standardization.
df_cleaned = df.filter(
    col("CUSTOMER_REVIEW").is_not_null() & (trim(col("CUSTOMER_REVIEW")) != "")
)

In [None]:
from snowflake.snowpark.functions import lower, trim

# Normalize the review text: lowercase it, then trim it, overwriting the
# CUSTOMER_REVIEW column with the result
df_standardized = df_cleaned.with_column(
    "CUSTOMER_REVIEW",
    trim(lower(col("CUSTOMER_REVIEW"))),
)

In [None]:
# List the columns available after standardization
column_names = df_standardized.columns
print("Column names:", column_names)

In [None]:
# split and array_size are no longer used in this cell; they stay imported in
# case other cells rely on them.
from snowflake.snowpark.functions import split, col, lit, array_size, regexp_count

# Add a WORD_COUNT column holding the number of space-separated words.
# Each maximal run of non-space characters counts as one word. Splitting on a
# single " " instead yields an empty token for every repeated space, so a
# review such as "good  product" would be counted as three words.
# The pattern is wrapped in lit() so it is passed as a literal, not a column name.
df_with_word_count = df_standardized.with_column(
    "WORD_COUNT",
    regexp_count(col("CUSTOMER_REVIEW"), lit("[^ ]+"))
)

# Display the result
df_with_word_count.select("CUSTOMER_REVIEW", "WORD_COUNT").show()