In [None]:
# Snowpark helper that hands back the session the notebook runtime is already
# connected with, so no credentials are configured in this notebook.
from snowflake.snowpark.context import get_active_session

# `session` is the entry point used by the following cells to read tables.
session = get_active_session()

In [None]:
# Reference the parsed_reviews table as a Snowpark DataFrame; `df` is the
# raw input that the cleaning cells below start from.
df = session.table("parsed_reviews")

# Preview a handful of rows (10 is the default row limit of show()).
df.show(n=10)

In [None]:
# Inspect the column structure. A notebook only renders a cell's final
# expression automatically, so the schema is printed explicitly; as a bare
# expression in the middle of the cell it would be evaluated and discarded.
print(df.schema)

# Number of records in the source table (final expression, rendered as the
# cell output).
df.count()

In [None]:
from snowflake.snowpark.functions import col, trim

# Remove rows whose review text is missing, empty, or whitespace-only.
# The emptiness test is applied to the trimmed value: the standardization
# step that follows trims CUSTOMER_REVIEW, so a whitespace-only review that
# passed an untrimmed `!= ""` check would turn into an empty string afterwards
# and still reach the cleaned output.
df_cleaned = df.filter(
    col("CUSTOMER_REVIEW").is_not_null()
    & (trim(col("CUSTOMER_REVIEW")) != "")
)

In [None]:
from snowflake.snowpark.functions import lower, trim

# Normalize the review text in place (same column name): lowercase first,
# then strip leading and trailing whitespace.
df_lowercase = df_cleaned.with_column(
    "CUSTOMER_REVIEW",
    trim(lower(col("CUSTOMER_REVIEW"))),
)

In [None]:
# Keep a single row per distinct normalized review text. Only CUSTOMER_REVIEW
# is compared, so rows that differ in other columns still collapse into one.
df_deduped = df_lowercase.drop_duplicates("CUSTOMER_REVIEW")

In [None]:
from snowflake.snowpark.functions import split, col, lit, array_size, regexp_count

# Count words as maximal runs of non-whitespace characters.
# Splitting on a single literal space miscounts: every extra consecutive
# space yields an empty token, tabs and newlines are not treated as
# separators, and an empty string still produces one (empty) element.
df_with_word_count = df_deduped.with_column(
    "WORD_COUNT",
    regexp_count(col("CUSTOMER_REVIEW"), "[^[:space:]]+"),
)

# Preview each review next to its computed word count.
df_with_word_count.select("CUSTOMER_REVIEW", "WORD_COUNT").show()

In [None]:
# Persist the cleaned, word-counted reviews to the clean_reviews table,
# using overwrite mode so a re-run replaces the previous contents.
df_with_word_count.write.save_as_table("clean_reviews", mode="overwrite")