<div style="line-height:1.2;">

<h1 style="color:#0FCBC6; margin-bottom: 0.2em;"> PySpark 0: Dataframes and Queries </h1>
</div>

<div style="margin-top: 10px;">
<span style="display: inline-block;">
    <h3 style="color: lightblue; display: inline; margin-bottom: 0;">Keywords:</h3>  SparkSession + spark.createDataFrame() + globals + IntegerType + withColumn()
</span>
</div>
<br>
<div style="margin-top: -10px;\">
<div style="line-height:1.2\">
<span style="display: inline-block;\">
    <h3 style="color: red; display: inline;\">Notes:</h3> Check "../Tutorials_Machine_Learning/Machine_Learning_guide/PySpark_tutorials/" for other PySpark scripts.
</span>
</div>
</div>

In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, expr, lit, when, udf
from pyspark.sql.functions import collect_list
from pyspark.sql.types import IntegerType

In [2]:
# create SparkSession
spark = SparkSession.builder.appName('store').getOrCreate()

# create store dataframe
store_data = [('store_1', 'New York'), ('store_2', 'Los Angeles'), ('store_3', 'Chicago')]

store_schema = ['store_id', 'location']

store = spark.createDataFrame(store_data, store_schema)
store.show()

23/08/28 11:24:49 WARN Utils: Your hostname, hpmint resolves to a loopback address: 127.0.1.1; using 192.168.1.81 instead (on interface eno1)
23/08/28 11:24:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/28 11:24:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

+--------+-----------+
|store_id|   location|
+--------+-----------+
| store_1|   New York|
| store_2|Los Angeles|
| store_3|    Chicago|
+--------+-----------+



In [3]:
# create inventory dataframe
inventory_data = [('product_1', 'store_1', 10),
                    ('product_2', 'store_1', 5),
                    ('product_1', 'store_2', 20),
                    ('product_2', 'store_2', 15),
                    ('product_3', 'store_3', 8)]

inventory_schema = ['product_id', 'store_id', 'quantity']

inventory = spark.createDataFrame(inventory_data, inventory_schema)
inventory.show()

+----------+--------+--------+
|product_id|store_id|quantity|
+----------+--------+--------+
| product_1| store_1|      10|
| product_2| store_1|       5|
| product_1| store_2|      20|
| product_2| store_2|      15|
| product_3| store_3|       8|
+----------+--------+--------+



In [4]:
# create storage dataframe
storage_data = [('product_1', 'A1'),
                ('product_2', 'A1'),
                ('product_1', 'B1'),
                ('product_2', 'B1'),
                ('product_3', 'B2')]

storage_schema = ['product_id', 'location']

storage = spark.createDataFrame(storage_data, storage_schema)
storage.show()

+----------+--------+
|product_id|location|
+----------+--------+
| product_1|      A1|
| product_2|      A1|
| product_1|      B1|
| product_2|      B1|
| product_3|      B2|
+----------+--------+



In [5]:
# df_list = [store, inventory, storage]
df_list = ["store", "inventory", "storage"]

for d in df_list:
    df = globals()[d]
    df.createOrReplaceTempView(d)
    #print(d)
    print(df)

#df

DataFrame[store_id: string, location: string]
DataFrame[product_id: string, store_id: string, quantity: bigint]
DataFrame[product_id: string, location: string]


<h3 style="color:#0FCBC6"> Queries </h3>

In [6]:
# Get the total quantity of each product in each store
result = spark.sql("SELECT s.location, i.product_id, SUM(i.quantity) as total_quantity FROM store s JOIN inventory i ON s.store_id = i.store_id GROUP BY s.location, i.product_id")
result.show()



+-----------+----------+--------------+
|   location|product_id|total_quantity|
+-----------+----------+--------------+
|Los Angeles| product_1|            20|
|Los Angeles| product_2|            15|
|   New York| product_2|             5|
|    Chicago| product_3|             8|
|   New York| product_1|            10|
+-----------+----------+--------------+



                                                                                

In [7]:
""" Get the total quantity of each product in each store using DataFrame operations. """
result_df = store.join(inventory, store.store_id == inventory.store_id).groupBy(store.location, inventory.product_id).agg(sum(inventory.quantity).alias("total_quantity"))
result_df.show()



+-----------+----------+--------------+
|   location|product_id|total_quantity|
+-----------+----------+--------------+
|Los Angeles| product_1|            20|
|Los Angeles| product_2|            15|
|   New York| product_2|             5|
|    Chicago| product_3|             8|
|   New York| product_1|            10|
+-----------+----------+--------------+



                                                                                

<h3 style="color:#0FCBC6"> Add new columns </h3>

#### => Create a new table

In [23]:
costs_list = [15, 25, 10, 30, 8]

# Create a DataFrame with product_id and cost columns
costs_data = list(zip([item[0] for item in inventory_data], costs_list))
costs_data

[('product_1', 15),
 ('product_2', 25),
 ('product_1', 10),
 ('product_2', 30),
 ('product_3', 8)]

In [21]:
# Create a dictionary of costs for each product, instead of a list
costs = {"product_1": 15, "product_2": 25, "product_3": 10, "product_4": 30, "product_5": 8}

# Define a UDF to map product_id to cost
def map_cost(product_id):
    return costs.get(product_id, None)

# Register the UDF
map_cost_udf = udf(map_cost, IntegerType())

# Add the "cost" column using the UDF
inventory_with_costs = inventory.withColumn("cost", map_cost_udf(col("product_id")))

inventory_with_costs.show()

                                                                                

+----------+--------+--------+----+
|product_id|store_id|quantity|cost|
+----------+--------+--------+----+
| product_1| store_1|      10|  15|
| product_2| store_1|       5|  25|
| product_1| store_2|      20|  15|
| product_2| store_2|      15|  25|
| product_3| store_3|       8|  10|
+----------+--------+--------+----+



                                                                                

In [20]:
""" another solution """
# Add the "cost" column based on the product IDs using 'when'
inventory_with_costs = inventory.withColumn("cost",
    when(col("product_id") == "product_1", costs_list[0])
    .when(col("product_id") == "product_2", costs_list[1])
    .when(col("product_id") == "product_3", costs_list[2])
    .when(col("product_id") == "product_4", costs_list[3])
    .when(col("product_id") == "product_5", costs_list[4])
    .otherwise(None)
)

inventory_with_costs.show()

+----------+--------+--------+----+
|product_id|store_id|quantity|cost|
+----------+--------+--------+----+
| product_1| store_1|      10|  15|
| product_2| store_1|       5|  25|
| product_1| store_2|      20|  15|
| product_2| store_2|      15|  25|
| product_3| store_3|       8|  10|
+----------+--------+--------+----+



#### => Add new column in place

In [12]:
# Create a list of conditions and corresponding costs using the 'when' function
conditions = [col("product_id") == item[0] for item in inventory_data]
cost_expr = [(cond, cost) for cond, cost in zip(conditions, costs_list)]

# Add the "cost" column to the inventory DataFrame
inventory = inventory.withColumn("cost", 
    when(cost_expr[0][0], cost_expr[0][1])
    .when(cost_expr[1][0], cost_expr[1][1])
    .when(cost_expr[2][0], cost_expr[2][1])
    .when(cost_expr[3][0], cost_expr[3][1])
    .when(cost_expr[4][0], cost_expr[4][1])
    .otherwise(None)
)

inventory.show()

+----------+--------+--------+----+
|product_id|store_id|quantity|cost|
+----------+--------+--------+----+
| product_1| store_1|      10|  15|
| product_2| store_1|       5|  25|
| product_1| store_2|      20|  15|
| product_2| store_2|      15|  25|
| product_3| store_3|       8|   8|
+----------+--------+--------+----+



In [25]:
# Add a column to the inventory DataFrame to calculate the value of store related to each product
result_inventory = inventory.withColumn("value", inventory.quantity * inventory.cost)  
result_inventory.show()

+----------+--------+--------+----+-----+
|product_id|store_id|quantity|cost|value|
+----------+--------+--------+----+-----+
| product_1| store_1|      10|  15|  150|
| product_2| store_1|       5|  25|  125|
| product_1| store_2|      20|  15|  300|
| product_2| store_2|      15|  25|  375|
| product_3| store_3|       8|   8|   64|
+----------+--------+--------+----+-----+



In [30]:
# Total asset of each store
inventory_with_value = inventory.withColumn("value", expr("quantity * cost"))  
total_inventory_value = inventory_with_value.groupBy("store_id").agg(expr("sum(value) as total_amount_of_value"))
total_inventory_value.show()

+--------+---------------------+
|store_id|total_amount_of_value|
+--------+---------------------+
| store_1|                  275|
| store_2|                  675|
| store_3|                   64|
+--------+---------------------+



In [15]:
# List Products in Each Storage Location
storage_products = storage.join(inventory, "product_id").groupBy("location").agg(expr("collect_list(product_id) as products"))
storage_products.show()



+--------+--------------------+
|location|            products|
+--------+--------------------+
|      B2|         [product_3]|
|      B1|[product_1, produ...|
|      A1|[product_1, produ...|
+--------+--------------------+



                                                                                

In [16]:
# Find Stores with Low Inventory
low_inventory_stores = inventory.filter(col("quantity") < 10).select("store_id", "product_id", "quantity")
low_inventory_stores.show()

+--------+----------+--------+
|store_id|product_id|quantity|
+--------+----------+--------+
| store_1| product_2|       5|
| store_3| product_3|       8|
+--------+----------+--------+



In [17]:
# Count Products in Each Store
products_per_store = inventory.groupBy("store_id").agg(expr("count(distinct product_id) as num_products"))
products_per_store.show()

+--------+------------+
|store_id|num_products|
+--------+------------+
| store_2|           2|
| store_3|           1|
| store_1|           2|
+--------+------------+



In [39]:
# Calculate Average Inventory Quantity
avg_inventory_per_product = inventory.groupBy("product_id").agg(expr("avg(quantity) as avg_quantity"))
avg_inventory_per_product.show()

+----------+------------+
|product_id|avg_quantity|
+----------+------------+
| product_1|        15.0|
| product_2|        10.0|
| product_3|         8.0|
+----------+------------+



In [38]:
# Storage location
storage_products_info = storage.join(inventory, "product_id") \
    .groupBy("product_id") \
    .agg(collect_list("location").alias("storage_locations"))

storage_products_info.show()

[Stage 72:>                                                         (0 + 4) / 4]

+----------+-----------------+
|product_id|storage_locations|
+----------+-----------------+
| product_1| [A1, A1, B1, B1]|
| product_2| [A1, A1, B1, B1]|
| product_3|             [B2]|
+----------+-----------------+



                                                                                