# **Spark Coding Assessment**
##### by Esaq

### 1. Create a SparkSession

In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, col, max, sum, min, avg

spark = SparkSession.builder.appName("PySpark Transformations and Actions").getOrCreate()

### 2. Load the orders.csv file into a DataFrame

In [2]:
orders_df = spark.read.csv("orders.csv", header=True, inferSchema=True)

### **PERFORMING TRANSFORMATIONS**
### 3. Select specific columns
### We create a new DataFrame that only contains 'cust_fname', 'cust_lname', and 'cust_order'.

In [4]:
orders_df.select("cust_fname", "cust_lname", "cust_order").show()

+----------+----------+----------+
|cust_fname|cust_lname|cust_order|
+----------+----------+----------+
|      john|       doe|         5|
|      jane|     smith|         8|
|   micheal|   jhonson|         3|
|      abhi|   wiliams|         1|
|       ram|     brown|         4|
|     emily|  anderson|         2|
|   william|     jones|        10|
|     susan|     davis|         7|
|     david|    miller|         9|
|      sara|     moore|         2|
|     james|    tailor|         5|
|    olivia|    wilson|         3|
|    robert|     evans|        11|
|      emma|    thomas|        29|
|    mathew|     haris|         5|
|  isabella|     white|         6|
|    joseph|    martin|         4|
|     grace|       lee|         5|
|chrisopher|      basa|         8|
|       ava|    joesph|         3|
+----------+----------+----------+



### 4. Filter rows based on a condition
### We create a new DataFrame with only 'active' customers.

In [6]:
orders_df.filter(orders_df.cust_status == "active").show()

+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      1|      john|       doe|         5|     active|
|      2|      jane|     smith|         8|     active|
|      4|      abhi|   wiliams|         1|     active|
|      6|     emily|  anderson|         2|     active|
|      7|   william|     jones|        10|     active|
|      9|     david|    miller|         9|     active|
|     13|    robert|     evans|        11|     active|
|     14|      emma|    thomas|        29|     active|
|     18|     grace|       lee|         5|     active|
|     20|       ava|    joesph|         3|     active|
+-------+----------+----------+----------+-----------+



### 5. Group by 'cust_status' and count
### Here, groupBy() is a transformation, but we need an aggregation ACTION (like count())
### to trigger the computation and return a result.

In [7]:
orders_df.groupBy("cust_status").count().show()

+-----------+-----+
|cust_status|count|
+-----------+-----+
|     active|   10|
|   inactive|   10|
+-----------+-----+



### 6. Sort the DataFrame
### We sort the DataFrame by 'cust_order' in descending order.

In [10]:
orders_df.orderBy(orders_df.cust_order.asc()).show()

+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      4|      abhi|   wiliams|         1|     active|
|      6|     emily|  anderson|         2|     active|
|     10|      sara|     moore|         2|   inactive|
|      3|   micheal|   jhonson|         3|   inactive|
|     12|    olivia|    wilson|         3|   inactive|
|     20|       ava|    joesph|         3|     active|
|      5|       ram|     brown|         4|   inactive|
|     17|    joseph|    martin|         4|   inactive|
|      1|      john|       doe|         5|     active|
|     11|     james|    tailor|         5|   inactive|
|     15|    mathew|     haris|         5|   inactive|
|     18|     grace|       lee|         5|     active|
|     16|  isabella|     white|         6|   inactive|
|      8|     susan|     davis|         7|   inactive|
|      2|      jane|     smith|         8|     active|
|     19|c

### 7. Limit returns first 'n' no. of rows
### We return the first 5 rows from the orders dataframe

In [11]:
orders_df.limit(5).show()

+-------+----------+----------+----------+-----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|
+-------+----------+----------+----------+-----------+
|      1|      john|       doe|         5|     active|
|      2|      jane|     smith|         8|     active|
|      3|   micheal|   jhonson|         3|   inactive|
|      4|      abhi|   wiliams|         1|     active|
|      5|       ram|     brown|         4|   inactive|
+-------+----------+----------+----------+-----------+



### Loading a new CSV file for join operation in Transformation

In [12]:
order_details_df = spark.read.csv("order_details.csv", header=True, inferSchema=True)
order_details_df.show()

+--------+-------+------------+--------+-----+----------+
|order_id|cust_id|product_name|quantity|price|order_date|
+--------+-------+------------+--------+-----+----------+
|    1000|      1|     Monitor|       5| 2930|2025-06-03|
|    1001|      1|      Laptop|       3| 2211|2025-05-14|
|    1002|      2|       Mouse|       1|   73|2024-08-28|
|    1003|      3|    Keyboard|       2| 1124|2025-05-15|
|    1004|      4|  Headphones|       3|  600|2025-04-26|
|    1005|      4|       Mouse|       3| 2967|2025-02-17|
|    1006|      5|  Headphones|       2| 1316|2024-11-07|
|    1007|      5|     Monitor|       4|  228|2024-09-09|
|    1008|      6|       Mouse|       5| 4980|2025-05-30|
|    1009|      6|  Headphones|       5| 2240|2025-04-01|
|    1010|      6|      Laptop|       3| 2523|2024-12-28|
|    1011|      7|      Tablet|       1|  437|2025-05-24|
|    1012|      7|  Headphones|       4| 2436|2024-12-07|
|    1013|      7|       Mouse|       1|  534|2025-07-23|
|    1014|    

### 8. Joining two dataframes
### We join orders and order details dataframe using a common key "cust_id"

In [36]:
df_joined = orders_df.join(order_details_df, on="cust_id", how="inner")

In [14]:
df_joined.show()

+-------+----------+----------+----------+-----------+--------+------------+--------+-----+----------+
|cust_id|cust_fname|cust_lname|cust_order|cust_status|order_id|product_name|quantity|price|order_date|
+-------+----------+----------+----------+-----------+--------+------------+--------+-----+----------+
|      1|      john|       doe|         5|     active|    1000|     Monitor|       5| 2930|2025-06-03|
|      1|      john|       doe|         5|     active|    1001|      Laptop|       3| 2211|2025-05-14|
|      2|      jane|     smith|         8|     active|    1002|       Mouse|       1|   73|2024-08-28|
|      3|   micheal|   jhonson|         3|   inactive|    1003|    Keyboard|       2| 1124|2025-05-15|
|      4|      abhi|   wiliams|         1|     active|    1004|  Headphones|       3|  600|2025-04-26|
|      4|      abhi|   wiliams|         1|     active|    1005|       Mouse|       3| 2967|2025-02-17|
|      5|       ram|     brown|         4|   inactive|    1006|  Headphon

### **PERFORMING ACTIONS**
### 9. Get the total number of records from the joined DataFrame
### count() returns a single value to the driver.

In [37]:
total_joined_records = df_joined.count()
print("Total records in joined table is ", total_joined_records)

Total records in joined table is  39


### 10. Take the first few rows and print
### take() returns a list of Row objects to the driver.
### This should be used on small datasets or after filtering.

In [16]:
first_two_rows = orders_df.take(2)
print(first_two_rows)

[Row(cust_id=1, cust_fname='john', cust_lname='doe', cust_order=5, cust_status='active'), Row(cust_id=2, cust_fname='jane', cust_lname='smith', cust_order=8, cust_status='active')]


### 11. Collect all active customers by Filtering
### collect() brings all the data to the driver. We should only do this on small datasets after filtering.
### We'll first filter() and then collect().

In [20]:
active_cust_list = orders_df.filter(col("cust_status")=="active").collect()
print("No. of active customers is ", len(active_cust_list))
print("First row of active customer list is ", active_cust_list[0])

No. of active customers is  10
First row of active customer list is  Row(cust_id=1, cust_fname='john', cust_lname='doe', cust_order=5, cust_status='active')


### 12. Getting the max element from the dataframe
### max() function should be imported before use, it can be used while performing groupBy function as well.

In [27]:
orders_df.select(max('cust_order')).show()

+---------------+
|max(cust_order)|
+---------------+
|             29|
+---------------+



### 13. Getting the total amount spent on orders by each customer.
#### We are using groupBy(), agg() aggregation function, sum(), alias() to give temporary column name, orderBy() to sort by Customer ID

In [38]:
order_details_df.groupBy("cust_id").agg(sum("price")).alias("total_spent").orderBy("cust_id").show()

+-------+----------+
|cust_id|sum(price)|
+-------+----------+
|      1|      5141|
|      2|        73|
|      3|      1124|
|      4|      3567|
|      5|      1544|
|      6|      9743|
|      7|      3407|
|      8|      1539|
|      9|      3026|
|     10|      1581|
|     11|      2617|
|     12|      1512|
|     13|      3206|
|     14|      2782|
|     15|      3973|
|     16|      1050|
|     17|       252|
|     18|      1533|
|     19|      2127|
|     20|      6188|
+-------+----------+

