In [0]:
from pyspark.sql import functions as F
from pyspark.sql import SparkSession


In [0]:

# Reuse the active Spark session if there is one; otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Column names, listed in the same order as the fields of each row tuple below
columns = ["id", "age", "income", "gender", "city"]

# Placeholder rows (swap in your real DataFrame loading logic)
data = [
    (1, 25, 50000.0, "Male", "New York"),
    (2, 30, 60000.0, "Female", "San Francisco"),
    (3, 45, 80000.0, "Male", "Chicago"),
    (4, 22, 45000.0, "Female", "Boston"),
]

# Build the DataFrame from the rows, naming the columns via `schema`
df = spark.createDataFrame(data, schema=columns)

# Print the DataFrame contents
df.show()

In [0]:
# Register `df` as a temporary view named "my_table" (replacing any existing
# view of that name) so it can be referenced from spark.sql queries.
# NOTE(review): no cell visible here queries "my_table" — confirm it is still needed.
df.createOrReplaceTempView("my_table")

In [0]:
# Flatten the DataFrame's column names into one comma-separated string;
# this string is interpolated into the LLM prompt in a later cell.
col_names = ", ".join(df.columns)
print(f"Column names: {col_names}")

In [0]:
# Define the prompt for the LLM; the column names are the only dynamic part.
prompt = f"""
Given a dataset with the following column names: {col_names}.

Suggest 5 feature engineering ideas to improve machine learning model performance. For each idea:
- Describe the new feature.
- Explain why it might be useful.
- Provide the PySpark code snippet to create it (using functions like withColumn, when, etc.).

Output the suggestions in a numbered list.
"""

# Run ai_query with the prompt bound as a named parameter (:prompt) instead of
# splicing it into the SQL text. Interpolating it inside '...' would break the
# statement (or change its meaning) as soon as the prompt contains a single
# quote or a backslash, e.g. from an unusual column name.
# The endpoint name stays inline as a fixed string literal.
suggestions_df = spark.sql(
    """
    SELECT ai_query(
      'databricks-llama-4-maverick',
      :prompt
    ) AS feature_suggestions
    """,
    args={"prompt": prompt},
)

# The query has no FROM clause, so it yields a single row; read the response from it.
suggestions = suggestions_df.first()["feature_suggestions"]
print("LLM Suggested Feature Engineering Ideas:\n")
print(suggestions)

In [0]:
# Example feature built from an LLM suggestion (swap in the actual suggestions)
from pyspark.sql.functions import when, col

# Conditions for the age buckets, named so the CASE expression below reads plainly
age_col = col("age")
is_young = age_col < 30
is_adult = (age_col >= 30) & (age_col < 50)

# under 30 -> "Young", 30 up to (not including) 50 -> "Adult", anything else -> "Senior"
age_group_expr = when(is_young, "Young").when(is_adult, "Adult").otherwise("Senior")

# Start the engineered DataFrame from the original `df` plus the new column
df_engineered = df.withColumn("age_group", age_group_expr)

df_engineered.show()

In [0]:
# Bucket income into labels:
# below 30000 -> "Low", 30000 up to (not including) 60000 -> "Medium", anything else -> "High"
income_col = F.col("income")
income_bracket_expr = (
    F.when(income_col < 30000, "Low")
    .when((income_col >= 30000) & (income_col < 60000), "Medium")
    .otherwise("High")
)

df_engineered = df_engineered.withColumn("income_bracket", income_bracket_expr)

df_engineered.show()

In [0]:
# Interaction feature: the product of the age and income columns
age_income_product = F.col("age") * F.col("income")
df_engineered = df_engineered.withColumn("age_income_interaction", age_income_product)

In [0]:
# Combined categorical feature: gender and age_group joined by an underscore
gender_age_group_expr = F.concat(F.col("gender"), F.lit("_"), F.col("age_group"))
df_engineered = df_engineered.withColumn("gender_age_group", gender_age_group_expr)

In [0]:
# Print the engineered DataFrame, including the columns added in the cells above
df_engineered.show()