In [4]:
pip install pyspark


Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=9e66783d05780aafca4cc5e679e1720a14aac011baebe273b9295be6bbe782d9
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [44]:
 import pandas as pd
 from datetime import datetime

   # Sample sales data

data = {
       "TransactionID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
       "CustomerID": [101, 102, 103, 101, 104, 102, 103, 104, 101, 105],
       "ProductID": [501, 502, 501, 503, 504, 502, 503, 504, 501, 505],
       "Quantity": [2, 1, 4, 3, 1, 2, 5, 1, 2, 1],
       "Price": [150.0, 250.0, 150.0, 300.0, 450.0, 250.0, 300.0, 450.0, 150.0, 550.0],
       "Date": [
           datetime(2024, 9, 1),
           datetime(2024, 9, 1),
           datetime(2024, 9, 2),
           datetime(2024, 9, 2),
           datetime(2024, 9, 3),
           datetime(2024, 9, 3),
           datetime(2024, 9, 4),
           datetime(2024, 9, 4),
           datetime(2024, 9, 5),
           datetime(2024, 9, 5)
       ]
   }

     # Create a DataFrame
df = pd.DataFrame(data)

   # Save the DataFrame to a CSV file
df.to_csv('sales_data.csv', index=False)

print("Sample sales dataset has been created and saved as 'sales_data.csv'.")


Sample sales dataset has been created and saved as 'sales_data.csv'.


In [49]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Sales Dataset Analysis") \
    .getOrCreate()

# Load the CSV file into a PySpark DataFrame
df = spark.read.csv('sales_data.csv', header=True)

# Display the first few rows
df.show()


+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
|            6|       102|      502|       2|250.0|2024-09-03|
|            7|       103|      503|       5|300.0|2024-09-04|
|            8|       104|      504|       1|450.0|2024-09-04|
|            9|       101|      501|       2|150.0|2024-09-05|
|           10|       105|      505|       1|550.0|2024-09-05|
+-------------+----------+---------+--------+-----+----------+



In [57]:
# 1. printing schema
df.printSchema()

# 2.First 5 rows
df.show(5)

# 3.Summary statics
df.describe(["Quantity","Price"]).show()



root
 |-- TransactionID: string (nullable = true)
 |-- CustomerID: string (nullable = true)
 |-- ProductID: string (nullable = true)
 |-- Quantity: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Date: string (nullable = true)

+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows

+-------+-----------------+-----------------+
|summary|         Quantity|            Price|
+-------+-----------------+-----------------+
|  count|              

In [63]:
# DATA TRANSFORMATION

# 1 .new column TotalSales
df = df.withColumn('TotalSales', col('Quantity') * col('Price'))
print("Data after adding TotalSales column:")
df.show()

# 2. group by productId and total sales
total_sales_df = df.groupBy('ProductID').sum('TotalSales').withColumnRenamed('sum(TotalSales)', 'TotalSales')
print("Total Sales by Product ID:")
total_sales_df.show()

# 3. Top selling product
top_selling_product = total_sales_df.orderBy(col('TotalSales').desc()).limit(1)
print("Top Selling Product:")
top_selling_product.show()

# 4. Total sales by date
total_sales_by_date = df.groupBy('Date').sum('TotalSales').withColumnRenamed('sum(TotalSales)', 'TotalSales')
print("Total Sales by Date:")
total_sales_by_date.show()

# Filtering total sales above 500
filtered_total_sales = total_sales_df.filter(col('TotalSales') > 500)
print("Total Sales Above 500:")
filtered_total_sales.show()

Data after adding TotalSales column:
+-------------+----------+---------+--------+-----+----------+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|TotalSales|
+-------------+----------+---------+--------+-----+----------+----------+
|            1|       101|      501|       2|150.0|2024-09-01|     300.0|
|            2|       102|      502|       1|250.0|2024-09-01|     250.0|
|            3|       103|      501|       4|150.0|2024-09-02|     600.0|
|            4|       101|      503|       3|300.0|2024-09-02|     900.0|
|            5|       104|      504|       1|450.0|2024-09-03|     450.0|
|            6|       102|      502|       2|250.0|2024-09-03|     500.0|
|            7|       103|      503|       5|300.0|2024-09-04|    1500.0|
|            8|       104|      504|       1|450.0|2024-09-04|     450.0|
|            9|       101|      501|       2|150.0|2024-09-05|     300.0|
|           10|       105|      505|       1|550.0|2024-09-05|     550.0|
+

In [65]:
# Additional Challenge

# Identifying repeat customers
repeat_customers = df.groupBy('CustomerID').count().withColumnRenamed('count',"Total_Count").filter(col('Total_count') > 1)
print("Repeat Customers:")
repeat_customers.show()

# Average sale price per product
avg_sale_price_per_product = df.groupBy('ProductID').avg('Price').withColumnRenamed('avg(Price)', 'AvgPrice')
print("Average Sale Price per Product:")
avg_sale_price_per_product.show

Repeat Customers:
+----------+-----------+
|CustomerID|Total_Count|
+----------+-----------+
|       101|          3|
|       104|          2|
|       102|          2|
|       103|          2|
+----------+-----------+



AnalysisException: "Price" is not a numeric column. Aggregation function can only be applied on a numeric column.