In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

# Sample purchase records: (customer_id, customer_name, purchase_amount).
data = [
    (1, "Alice", 200.50),
    (2, "Bob", 150.75),
    (3, "Charlie", 300.60),
    (1, "Alice", 100.20),
    (2, "Bob", 50.80),
    (3, "Charlie", 120.30),
    (4, "David", 400.00),
]

# Three nullable columns, declared with the fluent StructType.add API.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_name", StringType(), True)
    .add("purchase_amount", DoubleType(), True)
)

df1 = spark.createDataFrame(data=data, schema=schema)

In [0]:
from pyspark.sql.functions import count,col,sum

In [0]:
# Aggregate total purchase amount per customer.
# One row per customer with the summed purchase amount, biggest spender first.
# NOTE: `sum` here is pyspark.sql.functions.sum (imported above), not the builtin.
agg_df = (
    df1.groupBy("customer_id", "customer_name")
    .agg(sum("purchase_amount").alias("total_amount"))
    .orderBy("total_amount", ascending=False)
)

display(agg_df)

In [0]:
# Rank customers based on the highest total purchase.
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# No partitionBy: all customers compete in a single global ranking. Spark moves
# every row to one partition for such a window, which is acceptable for a small
# per-customer aggregate like this one.
w = Window.orderBy(col('total_amount').desc())

# Keep the computed "rank" column in the projection so the ranking is actually
# shown and remains available to the top-3 filter that follows.
rank_df = (
    agg_df.withColumn("rank", rank().over(w))
    .select('customer_id', 'customer_name', 'total_amount', 'rank')
)

display(rank_df)


In [0]:
# Fetch the top 3 customers.
# rank() gives tied totals the same rank, so ties can yield more than three rows.
top3_df = rank_df.where(col("rank") <= 3)

display(top3_df)

In [0]:
# Sample employee records: (emp_id, emp_name, department, salary).
data = [
    (1, "Alice", "IT", 70000),
    (2, "Bob", "HR", 50000),
    (3, "Charlie", "IT", 80000),
    (4, "David", "Finance", 90000),
    (5, "Eve", "HR", 55000),
    (6, "Frank", "IT", 72000),
]

# Four nullable columns, declared with the fluent StructType.add API.
schema = (
    StructType()
    .add("emp_id", IntegerType(), True)
    .add("emp_name", StringType(), True)
    .add("department", StringType(), True)
    .add("salary", IntegerType(), True)
)

df2 = spark.createDataFrame(data=data, schema=schema)
df2.show()

In [0]:
# Calculate the average salary per department.
from pyspark.sql.functions import avg

# One row per department with the mean of its employees' salaries.
avg_sal_df = (
    df2.groupBy('department')
    .agg(avg('salary').alias('avg_salary'))
)

display(avg_sal_df)

In [0]:
# Identify employees earning above their department's average salary.

# Attach each department's average to its employees, then keep only the
# employees whose salary is strictly greater than that average.
final_df = (
    df2.join(avg_sal_df, on='department', how='inner')
    .where(col('salary') > col('avg_salary'))
    .select('emp_id', 'emp_name', 'department', 'salary', 'avg_salary')
)

display(final_df)

In [0]:
from pyspark.sql.types import DateType, StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import col

# Sample orders: (order_id, order_date, delivery_date), dates given as ISO strings.
data = [
    (101, "2024-01-10", "2024-01-12"),
    (102, "2024-02-15", "2024-02-18"),
    (103, "2024-03-01", "2024-03-03"),
    (104, "2024-01-05", "2024-01-07"),
    (105, "2024-02-20", "2024-02-25"),
]

# The dates are loaded as strings and converted to DateType right after loading.
schema = (
    StructType()
    .add("order_id", IntegerType(), True)
    .add("order_date", StringType(), True)
    .add("delivery_date", StringType(), True)
)

df = (
    spark.createDataFrame(data=data, schema=schema)
    .withColumn("order_date", col("order_date").cast(DateType()))
    .withColumn("delivery_date", col("delivery_date").cast(DateType()))
)
df.show()

In [0]:
# Calculate the delay for each order.
from pyspark.sql.functions import datediff

# datediff(end, start) returns the delay as a plain integer number of days.
# Subtracting the two date columns directly would instead produce an interval
# value, which is awkward to aggregate and whose exact type depends on the
# Spark version.
delay_df = df.withColumn("delay", datediff(col("delivery_date"), col("order_date")))

In [0]:
# Determine the average delay per month.
from pyspark.sql.functions import month, avg, year

# Group by year as well as month so the same calendar month from different
# years is never pooled together. Explicit aliases and an ordering make the
# printed result readable and stable from run to run.
(
    delay_df
    .groupBy(
        year(col("order_date")).alias("order_year"),
        month(col("order_date")).alias("order_month"),
    )
    .agg(avg(col("delay")).alias("avg_delay"))
    .orderBy("order_year", "order_month")
    .show(truncate=False)
)

In [0]:
from pyspark.sql.types import TimestampType,StringType,StructField,StructType,IntegerType,DoubleType
from pyspark.sql.functions import col

# Sample transactions: (customer_id, customer_name, transaction_time, amount).
data = [
    (1, "Alice", "2024-03-20 10:15:00", 150.00),
    (1, "Alice", "2024-03-20 11:45:00", 200.00),
    (1, "Alice", "2024-03-20 14:30:00", 50.00),
    (1, "Alice", "2024-03-20 17:10:00", 300.00),  # Fraudulent (4 transactions)
    (2, "Bob", "2024-03-21 09:00:00", 100.00),
    (2, "Bob", "2024-03-21 12:30:00", 50.00),
    (3, "Charlie", "2024-03-22 08:15:00", 75.00),
    (3, "Charlie", "2024-03-22 18:45:00", 125.00),
]

# transaction_time is loaded as a string and cast to a timestamp after loading.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_name", StringType(), True)
    .add("transaction_time", StringType(), True)
    .add("amount", DoubleType(), True)
)

raw_transactions = spark.createDataFrame(data=data, schema=schema)
emp_df = raw_transactions.withColumn(
    "transaction_time",
    col("transaction_time").cast(TimestampType()),
)

emp_df.show(truncate=False)

In [0]:
from pyspark.sql.functions import to_date, count

# Flag customers with more than 3 transactions on the same calendar day.
# Grouping by the transaction date as well as the customer keeps a customer
# with many purchases spread over a long period from being flagged; only a
# burst within one day counts.
# NOTE(review): the per-day window is inferred from the imported to_date and
# the sample data -- confirm it matches the intended fraud rule.
final_df = (
    emp_df.groupBy(
        'customer_id',
        'customer_name',
        to_date(col('transaction_time')).alias('transaction_date'),
    )
    .agg(count('transaction_time').alias('total_transactions'))
    .filter(col('total_transactions') > 3)
)

display(final_df)

In [0]:
from pyspark.sql.types import DateType, StringType, IntegerType, StructType, StructField

# Sample customers: (customer_id, customer_name, last_purchase_date as ISO string).
data = [
    (1, "Alice", "2023-12-01"),
    (2, "Bob", "2024-02-15"),
    (3, "Charlie", "2023-11-20"),
    (4, "David", "2024-01-10"),
    (5, "Eve", "2023-09-05"),
]

# last_purchase_date is loaded as a string and cast to a date after loading.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_name", StringType(), True)
    .add("last_purchase_date", StringType(), True)
)

emp1_df = spark.createDataFrame(data=data, schema=schema).withColumn(
    "last_purchase_date",
    col("last_purchase_date").cast(DateType()),
)

emp1_df.show()

In [0]:
from pyspark.sql.functions import current_date, months_between

# Months elapsed between each customer's last purchase and today.
# The value depends on the date the cell is run, so the output changes over time.
df_churn = emp1_df.withColumn(
    "months_since_purchase",
    months_between(current_date(), col("last_purchase_date")),
)

# Customers whose last purchase is more than 3 months old.
df_churned_customers = df_churn.where(col("months_since_purchase") > 3)

df_churned_customers.show()

In [0]:
from pyspark.sql.functions import year, month

# Sample sales: (product_id, product_name, sale_date as ISO string, units_sold).
data = [
    (101, "Laptop", "2024-01-15", 5),
    (102, "Phone", "2024-01-20", 8),
    (103, "Tablet", "2024-01-25", 3),
    (104, "Headphones", "2024-02-05", 10),
    (105, "Monitor", "2024-02-10", 6),
    (106, "Laptop", "2024-02-15", 12),
    (107, "Phone", "2024-02-20", 9),
    (108, "Tablet", "2024-03-01", 4),
    (109, "Monitor", "2024-03-05", 7),
    (110, "Phone", "2024-03-10", 15),
]

# sale_date is loaded as a string and cast to a date after loading.
schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("sale_date", StringType(), True)
    .add("units_sold", IntegerType(), True)
)

sales_df = spark.createDataFrame(data=data, schema=schema).withColumn(
    "sale_date",
    col("sale_date").cast(DateType()),
)

sales_df.show()

In [0]:
# Find the top 3 most selling products each month using window functions.

from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, sum

# Total units sold per product per calendar month.
sales_df_agg = (
    sales_df.groupBy('product_name', month('sale_date').alias('month'))
    .agg(sum('units_sold').alias('total_sold'))
)

# Within each month, order products from best to worst seller.
w = Window.partitionBy('month').orderBy(col('total_sold').desc())

# rank() gives tied totals the same rank, so a month may return more than 3 rows.
sales_final_df = (
    sales_df_agg.withColumn('rank', rank().over(w))
    .where(col('rank') <= 3)
)

display(sales_final_df)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Alternative take on the monthly top 3: rank the individual sale rows
# (no per-product aggregation) by units_sold within each month.

# Extract the month of each sale (the year is not extracted).
df_with_month = sales_df.withColumn("month", month(col("sale_date")))

# Per-month window, highest units_sold first.
window_spec = Window.partitionBy("month").orderBy(col("units_sold").desc())

# Rank the sale rows within each month.
df_ranked = df_with_month.withColumn("rank", rank().over(window_spec))

# Keep the rows ranked 1 to 3 in each month.
df_top_3_products = df_ranked.where(col("rank") <= 3)

df_top_3_products.show()

In [0]:
# Sample customers: (customer_id, customer_name, last_purchase_date as ISO string).
data = [
    (1, "Alice", "2023-05-10"),
    (2, "Bob", "2024-02-15"),
    (3, "Charlie", "2023-07-20"),
    (4, "David", "2023-12-25"),
    (5, "Eve", "2024-01-05"),
    (6, "Frank", "2023-08-30"),
]

# last_purchase_date is loaded as a string and cast to a date after loading.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_name", StringType(), True)
    .add("last_purchase_date", StringType(), True)
)

emp2_df = spark.createDataFrame(data=data, schema=schema).withColumn(
    "last_purchase_date",
    col("last_purchase_date").cast(DateType()),
)

emp2_df.show()

In [0]:
# Identify customers who made a purchase last year (2023) but have not made one in the current year (2024).

from pyspark.sql.functions import year

# Tag every row with the calendar year of its purchase date.
yearly_df = emp2_df.withColumn('year', year('last_purchase_date'))

buyers_2023 = yearly_df.where(col('year') == 2023)
buyers_2024 = yearly_df.where(col('year') == 2024)

# Anti join: keep the 2023 rows whose customer_id has no matching 2024 row.
final_df = buyers_2023.join(buyers_2024, on='customer_id', how='leftanti')
final_df.show()

In [0]:
# Sample catalogue: (product_id, product_name, price, demand, inventory).
data = [
    (101, "Laptop", 1200, 500, 50),   # High demand, low stock
    (102, "Phone", 800, 200, 150),    # Normal demand
    (103, "Tablet", 300, 80, 200),    # Low demand, high stock
    (104, "Headphones", 150, 400, 50),  # High demand, low stock
    (105, "Monitor", 400, 100, 120),  # Normal demand
]

# Five nullable columns, declared with the fluent StructType.add API.
schema = (
    StructType()
    .add("product_id", IntegerType(), True)
    .add("product_name", StringType(), True)
    .add("price", IntegerType(), True)
    .add("demand", IntegerType(), True)
    .add("inventory", IntegerType(), True)
)

sell_df = spark.createDataFrame(data=data, schema=schema)
sell_df.show()


In [0]:
from pyspark.sql.functions import when

# Pricing rules, evaluated in order:
#   demand > 300 and inventory < 100  -> raise the price by 10%
#   demand < 100 and inventory > 150  -> cut the price by 10%
#   anything else                     -> keep the price unchanged
high_demand_low_stock = (col('demand') > 300) & (col('inventory') < 100)
low_demand_high_stock = (col('demand') < 100) & (col('inventory') > 150)

final_df = sell_df.withColumn(
    'adjusted_price',
    when(high_demand_low_stock, col('price') * 1.10)
    .when(low_demand_high_stock, col('price') * 0.9)
    .otherwise(col('price')),
)

display(final_df)

In [0]:
# Same pricing rules, written against sell_df's own column references:
# +10% for scarce, in-demand products; -10% for overstocked, slow ones.
scarce_and_popular = (sell_df["demand"] > 300) & (sell_df["inventory"] < 100)
overstocked_and_slow = (sell_df["demand"] < 100) & (sell_df["inventory"] > 150)

adjusted_price = (
    when(scarce_and_popular, sell_df["price"] * 1.1)
    .when(overstocked_and_slow, sell_df["price"] * 0.9)
    .otherwise(sell_df["price"])
)

df_pricing = sell_df.withColumn("adjusted_price", adjusted_price)

df_pricing.show()

In [0]:
from pyspark.sql.types import TimestampType, DoubleType, IntegerType, StringType, StructType, StructField
from pyspark.sql.functions import col

# Sample orders: (customer_id, customer_name, order_time, order_amount, location).
data = [
    (1, "Alice", "2024-03-20 10:15:00", 1500.00, "New York"),
    (1, "Alice", "2024-03-20 10:45:00", 1600.00, "Los Angeles"),  # Suspicious
    (2, "Bob", "2024-03-21 12:30:00", 500.00, "Chicago"),
    (3, "Charlie", "2024-03-22 14:00:00", 3000.00, "Miami"),
    (3, "Charlie", "2024-03-22 14:50:00", 3200.00, "Houston"),  # Suspicious
]

# order_time is loaded as a string and cast to a timestamp after loading.
schema = (
    StructType()
    .add("customer_id", IntegerType(), True)
    .add("customer_name", StringType(), True)
    .add("order_time", StringType(), True)
    .add("order_amount", DoubleType(), True)
    .add("location", StringType(), True)
)

short_df = spark.createDataFrame(data=data, schema=schema).withColumn(
    "order_time",
    col("order_time").cast(TimestampType()),
)

short_df.show()

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import lag, expr, unix_timestamp

# Flag orders placed less than an hour after the same customer's previous
# order, when that previous order came from a different location.

# Each customer's orders in chronological order.
w = Window.partitionBy('customer_id').orderBy(col('order_time'))

# Spark does not allow window functions inside filter()/where(), so the
# previous order's time and location are materialised as columns first.
# 'delay' is the gap to the previous order in seconds; it is null for a
# customer's first order, which the filter below therefore excludes.
new_df = (
    short_df
    .withColumn('prev_order_time', lag(col('order_time')).over(w))
    .withColumn('prev_location', lag(col('location')).over(w))
    .withColumn(
        'delay',
        unix_timestamp(col('order_time')) - unix_timestamp(col('prev_order_time')),
    )
)

# 3600 seconds = 1 hour.
df_suspicious = new_df.filter(
    (col('delay') < 3600) & (col('location') != col('prev_location'))
)

# Show the flagged orders, not the intermediate frame.
display(df_suspicious)
