In [None]:
# Connect to Snowflake (this cell only opens the session; nothing is written yet)
import os
from getpass import getpass

from snowflake.snowpark.session import Session

# Credentials are read from the environment so they are never typed into, or
# saved with, the notebook. If SNOWFLAKE_PASSWORD is unset, prompt for it
# interactively instead. A missing SNOWFLAKE_ACCOUNT / SNOWFLAKE_USER raises
# KeyError right here rather than failing later with an opaque login error.
connection_params = {
  "account": os.environ["SNOWFLAKE_ACCOUNT"],
  "user": os.environ["SNOWFLAKE_USER"],
  "password": os.environ.get("SNOWFLAKE_PASSWORD") or getpass("Snowflake password: "),
  # The remaining settings keep their previous values as defaults but can be
  # overridden per environment (e.g. to use a less privileged role).
  "role": os.environ.get("SNOWFLAKE_ROLE", "ACCOUNTADMIN"),
  "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH"),
  "database": os.environ.get("SNOWFLAKE_DATABASE", "AVALANCHE_DB"),
  "schema": os.environ.get("SNOWFLAKE_SCHEMA", "AVALANCHE_SCHEMA")
}

# Build the Snowpark session that every later cell uses
session = Session.builder.configs(connection_params).create()

In [None]:
# Reference the "parsed_reviews" table as a Snowpark DataFrame
df = session.table("parsed_reviews")

# Preview the first 10 rows (10 is show()'s default row count)
df.show(10)

In [None]:
# View schema to understand column structure.
# Printed explicitly: a notebook cell only auto-displays its LAST expression,
# so a bare `df.schema` placed above another expression would show nothing.
print(df.schema)

# Count the number of records (last expression, so the notebook displays it)
df.count()

In [None]:
from snowflake.snowpark.functions import col, trim

# Remove rows whose text is missing, empty, or made up only of spaces.
# The comparison is done on the trimmed value because the text is trimmed
# later in the notebook: a review containing only spaces would pass a plain
# != "" check here and then turn into an empty string after that trim.
df_cleaned = df.filter(
    col("review_text").is_not_null() & (trim(col("review_text")) != "")
)

In [None]:
from snowflake.snowpark.functions import lower, trim

# Normalize review_text: lowercase it first, then trim the result
normalized_text = trim(lower(col("review_text")))
df_lowercase = df_cleaned.with_column("review_text", normalized_text)

In [None]:
# Keep a single row per distinct review_text value
df_standardized = df_lowercase.drop_duplicates("review_text")

In [None]:
from snowflake.snowpark.functions import lit, size, split

# Add a new column with the word count (text split on single spaces).
# The separator must be wrapped in lit(): Snowpark's split() interprets a bare
# Python string as a column NAME, so " " would be looked up as a column
# instead of being used as the separator.
# NOTE: consecutive spaces produce empty tokens, so such rows count slightly high.
df_standardized = df_standardized.with_column(
    "word_count", size(split(col("review_text"), lit(" ")))
)

# Show the review text and word count
df_standardized.select("review_text", "word_count").show()

In [None]:
# Persist the cleaned DataFrame as the "clean_reviews" table, replacing any
# existing table of that name
df_standardized.write.save_as_table("clean_reviews", mode="overwrite")