In [0]:
# Column names for the `sales_data` sample, listed in the same order as the
# fields of each tuple in `data`.
columns = [
    "sale_id",
    "employee_id",
    "employee_name",
    "region",
    "product",
    "sale_date",
    "quantity",
    "unit_price",
    "total_amount",
]

# Sample rows for `sales_data`: one tuple per sale, fields ordered as in `columns`.
# In every row total_amount equals quantity * unit_price.
data = [
    (1, 101, "Alice Smith", "North", "Laptop", "2023-01-01", 2, 1200.00, 2400.00),
    (2, 102, "Bob Johnson", "South", "Smartphone", "2023-01-02", 5, 800.00, 4000.00),
    (3, 103, "Charlie Brown", "East", "Tablet", "2023-01-03", 3, 500.00, 1500.00),
    (4, 104, "Diana Prince", "West", "Laptop", "2023-01-04", 1, 1200.00, 1200.00),
    (5, 101, "Alice Smith", "North", "Smartphone", "2023-01-05", 4, 800.00, 3200.00),
    (6, 103, "Charlie Brown", "East", "Laptop", "2023-01-06", 2, 1200.00, 2400.00),
    (7, 102, "Bob Johnson", "South", "Tablet", "2023-01-07", 6, 500.00, 3000.00),
    (8, 104, "Diana Prince", "West", "Smartphone", "2023-01-08", 7, 800.00, 5600.00),
    (9, 101, "Alice Smith", "North", "Tablet", "2023-01-09", 2, 500.00, 1000.00),
    (10, 102, "Bob Johnson", "South", "Laptop", "2023-01-10", 1, 1200.00, 1200.00),
    (11, 102, "Bob Johnson1", "South", "Laptop", "2023-01-11", 1, 1100.00, 1100.00),
]

In [0]:
# Build the Spark DataFrame from the in-memory rows, naming the columns from `columns`.
df = spark.createDataFrame(
    data=data,
    schema=columns,
)

In [0]:
from pyspark.sql.window import Window
import pyspark.sql.functions as f

In [0]:
# Number the rows inside each product from highest to lowest unit_price.
# Rows sharing a unit_price still receive distinct, consecutive numbers.
newDf = df.withColumn(
    'row_number',
    f.row_number().over(
        Window.partitionBy('product').orderBy(f.col('unit_price').desc())
    ),
)

In [0]:
# Add a per-product rank by descending unit_price on top of the row_number frame.
# Tied prices share a rank and the rank(s) immediately after a tie are skipped.
newDF1 = newDf.withColumn(
    'product_rank',
    f.rank().over(
        Window.partitionBy('product').orderBy(f.col('unit_price').desc())
    ),
)

In [0]:
# Show ten rows of the ranked frame; take() hands them back as a list of Row objects.
display(
    newDF1.take(num=10)
)

In [0]:
# Within each product (rows ordered by ascending total_amount), sum total_amount
# over the previous row and the current row, then render the result.
# NOTE(review): the column is called TotalRevenueOfProduct, but this frame covers
# at most two rows rather than the whole partition — confirm which is intended.
(
    df.withColumn(
        "TotalRevenueOfProduct",
        f.sum('total_amount').over(
            Window.partitionBy("product")
            .orderBy('total_amount')
            .rowsBetween(-1, Window.currentRow)
        ),
    )
    .display()
)

In [0]:
# --- rangeBetween: the frame is defined by the ORDER BY value ---
# From the start of the product partition up to the current row, including every
# peer row whose unit_price equals the current row's unit_price.
windowSpecRange = (
    Window.partitionBy('product')
    .orderBy('unit_price')
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
df_with_range_sum = df.withColumn(
    'range_sum',
    f.sum('total_amount').over(windowSpecRange),
)

# --- rowsBetween: the frame is defined by physical row position ---
# From the start of the product partition up to and including the current row only.
windowSpecRows = (
    Window.partitionBy('product')
    .orderBy('unit_price')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df_with_rows_sum = df.withColumn(
    'rows_sum',
    f.sum('total_amount').over(windowSpecRows),
)

# Render both results so the two framing modes can be compared.
display(df_with_range_sum)
display(df_with_rows_sum)

In [0]:
# For every row, pull total_amount from the row that follows it within the same
# product (rows ordered by descending unit_price) and render the result.
(
    df.withColumn(
        'leadVal',
        f.lead('total_amount', 1).over(
            Window.partitionBy('product').orderBy(f.col('unit_price').desc())
        ),
    )
    .display()
)

In [0]:
# For every row, pull total_amount from the row that precedes it within the same
# product (rows ordered by descending unit_price) and render the result.
(
    df.withColumn(
        'lagVal',
        f.lag('total_amount', 1).over(
            Window.partitionBy('product').orderBy(f.col('unit_price').desc())
        ),
    )
    .display()
)

In [0]:
# Add a dense rank per product by descending unit_price: tied prices share a rank
# and, unlike rank(), no rank values are skipped after a tie.
# Builds on newDF1 (the frame that already carries row_number and product_rank);
# the previous spelling `newDf1` did not match any defined variable.
newDF2 = newDF1.withColumn(
    'product_dense_rank',
    f.dense_rank().over(Window.partitionBy('product').orderBy(f.desc('unit_price'))),
)

In [0]:
# Window over each product, highest unit_price first.
productWindow = (
    Window.partitionBy('product')
    .orderBy(f.col('unit_price').desc())
)

In [0]:
# Keep product and unit_price, and attach a dense rank computed over productWindow.
productRankDF = df.select(
    'product',
    'unit_price',
    f.dense_rank().over(productWindow).alias('product_rank'),
)

In [0]:
# Render the per-product dense ranking.
display(productRankDF)

In [0]:
# Window over each region, largest total_amount first.
regionWindow = (
    Window.partitionBy("region")
    .orderBy(f.col('total_amount').desc())
)

In [0]:
# 1. Rank Sales by Region
# Available columns on df: sale_id, employee_id, employee_name, region, product,
# sale_date, quantity, unit_price, total_amount.
# Keep the identifying columns and attach a dense rank computed over regionWindow.
regionRankDF = df.select(
    'sale_id',
    'employee_id',
    'region',
    'product',
    'total_amount',
    f.dense_rank().over(regionWindow).alias('region_rank'),
)

display(regionRankDF)


In [0]:
# Window over each employee, largest total_amount first.
empWindow = (
    Window.partitionBy("employee_id")
    .orderBy(f.col('total_amount').desc())
)

In [0]:
# 2. Calculate Running Total by Employee
# A running total should accumulate in sale order, so the window is ordered by
# sale_date (ISO "YYYY-MM-DD" strings sort chronologically) with sale_id as a
# tiebreaker, instead of by descending total_amount.
# The explicit ROWS frame makes each row add exactly its own amount to the sum of
# the rows before it; the implicit RANGE frame would lump tied rows together.
employeeRunningWindow = (
    Window.partitionBy('employee_id')
    .orderBy('sale_date', 'sale_id')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# `df` is rebound (as before) so any later cell that reads
# df['employee_total_amount'] keeps working.
df = df.withColumn(
    'employee_total_amount',
    f.sum('total_amount').over(employeeRunningWindow),
)
df.display()