In [20]:
# PySpark in Microsoft Fabric Notebook

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, desc, sum as spark_sum
import os
import pandas as pd

# Create Spark session (Fabric usually provides this automatically).
# getOrCreate() returns the existing session when one is already active.
spark = SparkSession.builder.appName("FabricPySparkAnalysis").getOrCreate()

# Path to CSV file in Fabric Lakehouse or local path
# In Fabric, replace with: "Files/sample_data.csv" or lakehouse path

# Use Fabric path if available, else local path.
# NOTE(review): os.path.exists() tests the local filesystem relative to the
# current working directory; confirm that "Files/..." is visible there when
# running in Fabric, otherwise this always falls back to the local path.
if os.path.exists("Files/sample_data.csv"):
    data_path = "Files/sample_data.csv"  # Fabric Lakehouse
else:
    data_path = "data/sample_data.csv"   # Local

# Load the CSV exactly once, and only inside the guard. An extra unguarded
# read before the try block would raise first (making the RuntimeError wrapper
# unreachable) and would repeat the schema-inference pass over the file.
try:
    # Load CSV with header and schema inference
    df = spark.read.option("header", True).option("inferSchema", True).csv(data_path)
except Exception as e:
    # Chain the original exception so the Spark traceback is preserved.
    raise RuntimeError(f"Error loading data: {e}") from e

# Show first rows
print("📄 Raw Data Preview:")
df.show(5)

# Basic data cleaning.
# Cast Value to double first so non-numeric entries become null, then drop
# every row that has a null in either important column in a single pass.
# Casting also avoids mixed-type aggregation errors further down.
df_clean = (
    df.withColumn("Value", col("Value").cast("double"))
    .dropna(subset=["Category", "Value"])
)

# Example aggregation: per-category average, row count and total of Value,
# listed in descending Category order.
agg_df = (
    df_clean.groupBy("Category")
    .agg(
        avg("Value").alias("AverageValue"),
        count("*").alias("Count"),
        spark_sum("Value").alias("TotalValue"),
    )
    .orderBy(desc("Category"))
)

# Show results
print("Aggregated Results:")
agg_df.show()

# Cleaned rows whose Value is strictly greater than 5.
df_clean.where(col("Value") > 5).show()

output_path = "data/output"

# Create the output directory itself (not just its parent) for local runs:
# the CSV below is written *inside* output_path, and pandas does not create
# missing directories when saving. exist_ok=True makes re-runs harmless.
os.makedirs(output_path, exist_ok=True)

# Write as CSV using Pandas (no Hadoop libraries needed).
# In Fabric, you can use Spark's write.parquet() directly instead.
# Best-effort export: a failure is reported but does not abort the notebook.
try:
    # toPandas() collects the aggregated result to the driver; it is small
    # here (one row per category).
    pandas_df = agg_df.toPandas()
    csv_file = os.path.join(output_path, "aggregated_data.csv")
    pandas_df.to_csv(csv_file, index=False)
    print(f"✅ Aggregated data written to CSV: {csv_file}")
except Exception as e:
    print(f"❌ Error writing CSV to {output_path}: {e}")

📄 Raw Data Preview:
+--------+-----+
|Category|Value|
+--------+-----+
|       A|   10|
|       B|   15|
|       A|   20|
|       C|    5|
|       B|   25|
+--------+-----+

Aggregated Results:
+--------+------------+-----+----------+
|Category|AverageValue|Count|TotalValue|
+--------+------------+-----+----------+
|       C|         5.0|    1|       5.0|
|       B|        20.0|    2|      40.0|
|       A|        15.0|    2|      30.0|
+--------+------------+-----+----------+

+--------+-----+
|Category|Value|
+--------+-----+
|       A| 10.0|
|       B| 15.0|
|       A| 20.0|
|       B| 25.0|
+--------+-----+

✅ Aggregated data written to CSV: data/output\aggregated_data.csv


git init
git add .
git commit -m "Initial commit: Fabric + PySpark project"
git branch -M main
git remote add origin https://github.com/hansenguxd/fabric-pyspark-analysis.git
git push -u origin main