# Creating a SparkSession

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Obtain the SparkSession for this notebook, reusing an existing one if present.
spark = (
    SparkSession.builder
    .appName("Sample Retail Analysis")
    .getOrCreate()
)

# Reading the data into a DataFrame

In [2]:
# Column layout of the tab-separated sales file as (name, Spark type) pairs;
# every column is declared nullable.
column_specs = [
    ("Date", StringType()),
    ("Time", StringType()),
    ("City", StringType()),
    ("Product_Cat", StringType()),
    ("Sale_Value", FloatType()),
    ("Payment_mode", StringType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in column_specs])

# Read the file with the explicit schema instead of relying on inference.
data = spark.read.csv("./data/Retail_Sample_Data_Set.txt", schema=schema, sep="\t")

In [3]:
# Print the loaded DataFrame's schema as a tree (column name, type, nullability).
data.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Time: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product_Cat: string (nullable = true)
 |-- Sale_Value: float (nullable = true)
 |-- Payment_mode: string (nullable = true)



In [4]:
# Parse the string `Date` column into a date column named `Date_mod`,
# then discard the original string column.
parsed_date = to_date(col("Date"), "yyyy-MM-dd")
modDf = data.withColumn("Date_mod", parsed_date).drop("Date")
                                                                    

In [5]:
# Print the schema of the transformed DataFrame (`Date` replaced by `Date_mod`).
modDf.printSchema()

root
 |-- Time: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product_Cat: string (nullable = true)
 |-- Sale_Value: float (nullable = true)
 |-- Payment_mode: string (nullable = true)
 |-- Date_mod: date (nullable = true)



In [6]:
# Preview the first five rows without truncating cell contents.
modDf.show(n=5, truncate=False)

+-----+----------+-------------------+----------+------------+----------+
|Time |City      |Product_Cat        |Sale_Value|Payment_mode|Date_mod  |
+-----+----------+-------------------+----------+------------+----------+
|09:00|San Jose  |Men's Clothing     |214.05    |Amex        |2012-01-01|
|09:00|Fort Worth|Women's Clothing   |153.57    |Visa        |2012-01-01|
|09:00|San Diego |Music              |66.08     |Cash        |2012-01-01|
|09:00|Pittsburgh|Pet Supplies       |493.51    |Discover    |2012-01-01|
|09:00|Omaha     |Children's Clothing|235.63    |MasterCard  |2012-01-01|
+-----+----------+-------------------+----------+------------+----------+
only showing top 5 rows



### Find how many product categories are available

In [7]:
# Number of unique product categories present in the data.
product_categories = modDf.select("Product_Cat").distinct()
product_categories.count()

18

In [8]:
# List each product category once, without truncating long names.
(
    modDf
    .select("Product_Cat")
    .distinct()
    .show(truncate=False)
)

+--------------------+
|Product_Cat         |
+--------------------+
|Children's Clothing |
|Sporting Goods      |
|CDs                 |
|Computers           |
|Consumer Electronics|
|Health and Beauty   |
|Pet Supplies        |
|DVDs                |
|Baby                |
|Crafts              |
|Women's Clothing    |
|Video Games         |
|Books               |
|Music               |
|Men's Clothing      |
|Cameras             |
|Garden              |
|Toys                |
+--------------------+



## Q1. Calculate the sales breakdown by product category across all of the stores

In [9]:
# Total sales per product category, rounded to 2 decimals, highest first.
category_sales = (
    modDf
    .groupBy("Product_Cat")
    .agg(round(sum("Sale_Value"), 2).alias("Total_Sales"))
    .orderBy(col("Total_Sales").desc())
)
category_sales.show(truncate=False)

+--------------------+-----------+
|Product_Cat         |Total_Sales|
+--------------------+-----------+
|Men's Clothing      |4030.89    |
|Women's Clothing    |3736.87    |
|Books               |3492.8     |
|Crafts              |3258.09    |
|Toys                |3188.18    |
|Consumer Electronics|2963.59    |
|DVDs                |2831.0     |
|Children's Clothing |2778.21    |
|Pet Supplies        |2660.83    |
|CDs                 |2644.51    |
|Cameras             |2591.27    |
|Video Games         |2573.38    |
|Health and Beauty   |2467.32    |
|Music               |2396.4     |
|Computers           |2102.66    |
|Baby                |2034.23    |
|Sporting Goods      |1952.89    |
|Garden              |1882.25    |
+--------------------+-----------+



## Q2. Calculate sales breakdown by store across all of the stores. Assume there is one store per city

In [10]:
# Number of distinct cities (the analysis assumes one store per city).
stores = modDf.select("City").distinct()
stores.count()

89

In [11]:
# Total sales per store (one store per city), rounded to 2 decimals, highest first.
city_sales = (modDf
.select("City","Sale_Value")
.groupBy("City")
.agg(sum("Sale_Value").alias("Total_Sales"))
.withColumn("Total_Sales",round("Total_Sales",2))
.orderBy("Total_Sales",ascending=False)
)
# show() prints only 20 rows by default. Size the display from the result's own
# row count so every store is listed regardless of how many cities the input
# contains. Note: count() runs one extra Spark job over the aggregate.
city_sales.show(city_sales.count(),truncate=False)

+---------------+-----------+
|City           |Total_Sales|
+---------------+-----------+
|Omaha          |1811.89    |
|Austin         |1787.88    |
|Chandler       |1648.77    |
|Lubbock        |1517.08    |
|Jacksonville   |1330.18    |
|Pittsburgh     |1271.35    |
|Fresno         |1185.87    |
|Philadelphia   |1166.76    |
|Fort Worth     |1128.14    |
|New Orleans    |1118.08    |
|Boston         |1114.54    |
|Riverside      |1106.01    |
|Houston        |1101.95    |
|Kansas City    |1093.66    |
|Anchorage      |1086.22    |
|Lexington      |1080.47    |
|Albuquerque    |1074.88    |
|Durham         |980.32     |
|Phoenix        |955.31     |
|Seattle        |934.39     |
|Santa Ana      |922.3      |
|Scottsdale     |880.26     |
|Nashville      |765.05     |
|Greensboro     |749.73     |
|Honolulu       |717.58     |
|Lincoln        |712.77     |
|Chesapeake     |676.35     |
|Norfolk        |669.25     |
|Washington     |650.48     |
|Virginia Beach |647.67     |
|Toledo   

## Q3. Find the total sales value across all the stores and the total number of sales.

In [12]:
# Overall figures across all stores: how many sale values were recorded and
# their sum rounded to 2 decimals.
sales_count = count("Sale_Value").alias("Number of Sales")
sales_total = round(sum("Sale_Value"), 2).alias("Total_Sales")
modDf.agg(sales_count, sales_total).show()

+---------------+-----------+
|Number of Sales|Total_Sales|
+---------------+-----------+
|            200|   49585.37|
+---------------+-----------+



In [13]:
# Stop the SparkSession now that the analysis is finished.
spark.stop()