In [0]:
selected_df = df.select(
    "customer_id",
    "interaction_id",
    col("date_time").alias("interaction_date"),
    "issue_category",
    "issue_description",
    "agent_id"
)
display(selected_df)

In [0]:
df = spark.table("gen_ai_workshop.himanshu_gupta.cust_service_data")
display(df)

In [0]:
from pyspark.sql.functions import split, trim, col, rtrim

# Customer contact columns, de-duplicated across the complete row: two rows
# for the same customer_id that differ in any other selected column are both kept.
customer_columns = ["customer_id", "name", "email", "phone_number", "address"]
unique_df = df.select(*customer_columns).distinct()

display(unique_df)

In [0]:
# One row per distinct agent_id value found in df.
agent_id_column = df.select("agent_id")
unique_agents_df = agent_id_column.dropDuplicates()
display(unique_agents_df)

In [0]:
from pyspark.sql.functions import concat, lit, col
from pyspark.sql import Row

# Pull the distinct agent ids to the driver in a stable order. Without the
# orderBy, collect() gives no ordering guarantee, so the same agent_id could be
# paired with a different name each time this cell runs.
agent_ids = [row.agent_id for row in unique_agents_df.orderBy("agent_id").collect()]

# Generate at least 50 names, and more when the data has more agents. A fixed
# pool of 50 would make zip() below silently drop every agent past the 50th.
agent_names_list = [f"Agent_{i+1}" for i in range(max(50, len(agent_ids)))]

# Assign a unique agent name to each agent_id (position i gets "Agent_{i+1}").
unique_names = agent_names_list[:len(agent_ids)]

agent_info = [Row(agent_id=agent_id, agent_name=agent_name) for agent_id, agent_name in zip(agent_ids, unique_names)]
agent_info_df = spark.createDataFrame(agent_info)

# NOTE(review): this produces addresses such as "agent_Agent_1@example.com";
# the doubled "agent_" prefix looks accidental. Confirm before changing, since
# downstream data may already use this format.
fake_agents_df = agent_info_df.withColumn(
    "agent_email", concat(lit("agent_"), col("agent_name"), lit("@example.com"))
)

display(fake_agents_df)

In [0]:
from pyspark.sql.functions import rand, when, floor, round as spark_round, monotonically_increasing_id

# Fixed seeds so that re-running the cell regenerates the same synthetic survey
# (given the same input partitioning). The two rand() calls use different
# seeds; sharing one seed would feed the same draw into both columns and tie
# `would_recommend` to `satisfaction_rating`.
SURVEY_SEED = 42

# Generate one synthetic survey response per interaction.
survey_df = (
    df.select("interaction_id")
    # Unique, increasing ids; they are not guaranteed to be consecutive.
    .withColumn("survey_id", monotonically_increasing_id())
    # floor(u * 5) + 1 with u in [0, 1) gives each rating 1-5 equal weight.
    # The previous round(u * 4) + 1 made 1 and 5 half as likely as 2, 3 and 4.
    .withColumn(
        "satisfaction_rating",
        (floor(rand(SURVEY_SEED) * 5) + 1).cast("integer"),
    )
    # "Yes" when the draw exceeds 0.2, i.e. for roughly 80% of rows.
    .withColumn(
        "would_recommend",
        when(rand(SURVEY_SEED + 1) > 0.2, "Yes").otherwise("No"),
    )
    # Canned comment derived from the rating: 4-5, exactly 3, or anything lower.
    .withColumn(
        "survey_comment",
        when(col("satisfaction_rating") >= 4, "Great service!")
        .when(col("satisfaction_rating") == 3, "Average experience.")
        .otherwise("Needs improvement."),
    )
)

display(survey_df)