In [0]:
spark.sql("USE CATALOG ecommerce_prod")
from pyspark.sql import functions as F
from pyspark.sql.window import Window
import numpy as np

print("=== DAY 11: Statistical Analysis & Feature Engineering ===\n")

=== DAY 11: Statistical Analysis & Feature Engineering ===



##1: Calculate Statistical Summaries

In [0]:
print("1. STATISTICAL SUMMARIES (Price Analysis)")

# Get price statistics
stats_df = spark.table("silver.events_nov").select("price").summary()
stats_df.show()

# Additional descriptive stats
print("\nDetailed Price Statistics:")
events_df = spark.table("silver.events_nov")
events_df.describe(["price"]).show()

# Calculate mean and median separately
price_stats = events_df.select(
    F.mean("price").alias("mean_price"),
    F.expr("percentile_approx(price, 0.5)").alias("median_price"),
    F.stddev("price").alias("std_price"),
    F.min("price").alias("min_price"),
    F.max("price").alias("max_price")
)
price_stats.show()

1. STATISTICAL SUMMARIES (Price Analysis)
+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean|292.45931656482594|
| stddev| 355.6744995860673|
|    min|               0.0|
|    25%|             69.24|
|    50%|            165.77|
|    75%|            360.34|
|    max|           2574.07|
+-------+------------------+


Detailed Price Statistics:
+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|          67501979|
|   mean|292.45931656482594|
| stddev| 355.6744995860673|
|    min|               0.0|
|    max|           2574.07|
+-------+------------------+

+-----------------+------------+-----------------+---------+---------+
|       mean_price|median_price|        std_price|min_price|max_price|
+-----------------+------------+-----------------+---------+---------+
|292.4593165646298|      165.77|355.6744995860673|      0.0|  2574.07|
+-----------------+------------+---

## 2: Test Hypotheses (Weekday vs Weekend)

In [0]:
print("\n2. HYPOTHESIS TESTING: Weekday vs Weekend Performance")

# Add weekend flag (1 = Sunday, 7 = Saturday in Spark)
events_with_days = events_df.withColumn(
    "event_date", F.to_date("event_time")  
).withColumn(
    "is_weekend", F.dayofweek("event_date").isin([1, 7])
)

# Compare performance metrics
hypothesis_test = events_with_days.groupBy("is_weekend").agg(
    F.count("*").alias("total_actions"),
    F.count_if(F.col("event_type") == "purchase").alias("total_purchases"),
    F.round(
        (F.count_if(F.col("event_type") == "purchase") / F.count("*") * 100), 
        2
    ).alias("conversion_rate_percent"),
    F.avg("price").alias("avg_price"),
    F.stddev("price").alias("price_stddev")
)

hypothesis_test.show()


2. HYPOTHESIS TESTING: Weekday vs Weekend Performance
+----------+-------------+---------------+-----------------------+------------------+------------------+
|is_weekend|total_actions|total_purchases|conversion_rate_percent|         avg_price|      price_stddev|
+----------+-------------+---------------+-----------------------+------------------+------------------+
|      true|     24748486|         416681|                   1.68|294.68001483526353|355.63219919252975|
|     false|     42753493|         500258|                   1.17|291.17383282526623|355.69265198257176|
+----------+-------------+---------------+-----------------------+------------------+------------------+



## 3: A/B Test Design & Significance Testing

In [0]:
print("\n3. A/B TEST DESIGN & SIGNIFICANCE TESTING")

# Randomly assign users to Group A (Control) or Group B (Treatment)
ab_data = events_df.withColumn(
    "ab_group", 
    F.when(F.hash("user_id") % 2 == 0, "A").otherwise("B")
)

# Aggregate conversion data per group
group_stats = ab_data.groupBy("ab_group").agg(
    F.count_distinct("user_id").alias("total_users"),
    F.count_if(F.col("event_type") == "purchase").alias("total_purchases")
).collect()

# Extract values for significance testing
group_a = [row for row in group_stats if row["ab_group"] == "A"][0]
group_b = [row for row in group_stats if row["ab_group"] == "B"][0]

conv_a = group_a["total_purchases"] / group_a["total_users"] * 100
conv_b = group_b["total_purchases"] / group_b["total_users"] * 100

print(f"Group A (Control): {group_a['total_users']:,} users, "
      f"{group_a['total_purchases']:,} purchases, "
      f"Conversion: {conv_a:.4f}%")
print(f"Group B (Treatment): {group_b['total_users']:,} users, "
      f"{group_b['total_purchases']:,} purchases, "
      f"Conversion: {conv_b:.4f}%")


3. A/B TEST DESIGN & SIGNIFICANCE TESTING
Group A (Control): 1,847,152 users, 457,989 purchases, Conversion: 24.7943%
Group B (Treatment): 1,848,965 users, 458,950 purchases, Conversion: 24.8220%


##4: Identify Correlations

In [0]:
print("\n4. CORRELATION ANALYSIS")

# Calculate correlation between price and purchase likelihood
purchase_correlation_df = events_df.withColumn(
    "is_purchase", F.when(F.col("event_type") == "purchase", 1).otherwise(0)
)

# Filter for numeric price values
numeric_events = purchase_correlation_df.filter(F.col("price") > 0)

try:
    correlation = numeric_events.stat.corr("price", "is_purchase")
    print(f"Correlation between price and purchase likelihood: {correlation:.4f}")
    
    if correlation > 0.1:
        print("Interpretation: Positive correlation")
    elif correlation < -0.1:
        print("Interpretation: Negative correlation")
    else:
        print("Interpretation: Weak correlation")
except Exception as e:
    print(f"Correlation calculation error: {e}")


4. CORRELATION ANALYSIS
Correlation between price and purchase likelihood: 0.0023
Interpretation: Weak correlation


##5: Feature Engineering for ML

In [0]:
print("\n5. FEATURE ENGINEERING FOR MACHINE LEARNING")

# Define window for user-level features
user_window = Window.partitionBy("user_id").orderBy("event_time")

# Create ML-ready features
ml_features = events_df.withColumn(
    "prev_event_time", F.lag("event_time").over(user_window)
).withColumn(
    "time_gap", 
    F.when(
        F.col("prev_event_time").isNotNull(),
        F.unix_timestamp("event_time") - F.unix_timestamp("prev_event_time")
    ).otherwise(None)
).withColumn(
    "is_new_session", 
    F.when(
        F.col("time_gap").isNull() | (F.col("time_gap") > 1800), 
        1
    ).otherwise(0)
).withColumn(
    "user_session_count", 
    F.sum("is_new_session").over(
        Window.partitionBy("user_id")
        .orderBy("event_time")
        .rowsBetween(Window.unboundedPreceding, 0)
    )
).withColumn(
    "total_user_actions", 
    F.count("*").over(Window.partitionBy("user_id"))
).withColumn(
    "hour", F.hour("event_time")
).withColumn(
    "day_of_week", F.dayofweek(F.to_date("event_time"))
).withColumn(
    "price_log", F.log(F.col("price") + 1)
).withColumn(
    # FIXED: Convert both to unix_timestamp before subtracting
    "time_since_first_view",
    F.unix_timestamp("event_time") - 
    F.unix_timestamp(
        F.first("event_time").over(
            Window.partitionBy("user_id").orderBy("event_time")
        )
    )
).withColumn(
    "is_purchase", F.when(F.col("event_type") == "purchase", 1).otherwise(0)
)

# Show sample of engineered features
print("Sample of ML Features Created:")
ml_features.select(
    "user_id", "event_time", "event_type", "price", 
    "time_gap", "user_session_count", "hour", "day_of_week",
    "price_log", "time_since_first_view", "is_purchase"
).show(10, truncate=False)

# Save to Gold layer 
try:
    ml_features.write.mode("overwrite").saveAsTable("gold.ml_ready_features")
    print("\n✓ ML features saved to: gold.ml_ready_features")
except Exception as e:
    print(f"\nNote: Could not save to gold layer: {e}")


5. FEATURE ENGINEERING FOR MACHINE LEARNING
Sample of ML Features Created:
+--------+-------------------+----------+------+--------+------------------+----+-----------+------------------+---------------------+-----------+
|user_id |event_time         |event_type|price |time_gap|user_session_count|hour|day_of_week|price_log         |time_since_first_view|is_purchase|
+--------+-------------------+----------+------+--------+------------------+----+-----------+------------------+---------------------+-----------+
|65800726|2019-11-27 04:33:16|view      |81.8  |NULL    |1                 |4   |4          |4.416428061391214 |0                    |0          |
|65800726|2019-11-27 04:35:24|view      |81.8  |128     |1                 |4   |4          |4.416428061391214 |128                  |0          |
|81255481|2019-11-08 07:44:45|view      |66.35 |NULL    |1                 |7   |6          |4.209902902856373 |0                    |0          |
|81255481|2019-11-21 14:11:26|view      |6

##SUMMARY

In [0]:
print("\n" + "="*60)
print("DAY 11 COMPLETED: Statistical Analysis & Feature Engineering")
print("="*60)
print("✓ All 5 tasks completed successfully!")
print("="*60)


DAY 11 COMPLETED: Statistical Analysis & Feature Engineering
✓ All 5 tasks completed successfully!
