### Creating a Spark session

In [3]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Reuse the active SparkSession if one already exists; otherwise build a new
# one with the application name set below.
spark = (
    SparkSession.builder
    .appName('MnM dataset')
    .getOrCreate()
)

### Read from the CSV and infer the schema

In [4]:
# Location of the M&M CSV file. The original absolute path is kept as the
# fallback so existing runs behave exactly as before; set the MNM_DATASET_PATH
# environment variable to point at the file on a different machine instead of
# editing this cell.
filePath = (
    os.environ.get("MNM_DATASET_PATH")
    or "/home/karthik/SparkCourse/pyspark notebooks/data/mnm_dataset.csv"
)

# header=True takes the column names from the first row of the file;
# inferSchema=True asks Spark to derive each column's type from the data
# rather than reading every column as a string.
data = spark.read.csv(filePath, header=True, inferSchema=True)

# Print the resulting column names and types.
data.printSchema()

root
 |-- State: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Count: integer (nullable = true)



In [5]:
# Preview the first 10 rows without truncating long cell values.
data.show(n=10, truncate=False)

+-----+------+-----+
|State|Color |Count|
+-----+------+-----+
|TX   |Red   |20   |
|NV   |Blue  |66   |
|CO   |Blue  |79   |
|OR   |Blue  |71   |
|WA   |Yellow|93   |
|WY   |Blue  |16   |
|CA   |Yellow|53   |
|WA   |Green |60   |
|OR   |Green |71   |
|TX   |Green |68   |
+-----+------+-----+
only showing top 10 rows



### Aggregate the count of each color per state: group by State and Color, then order by the count in descending order

In [6]:
# Build one row per (State, Color) pair holding the number of records in that
# group, ordered from the largest group to the smallest.
# NOTE(review): count("Count") tallies the non-null rows in each group; it does
# not add up the values stored in the Count column. Confirm that a row count
# (rather than a sum) is what is intended here.
newData = (
    data
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total Count"))
    .sort("Total Count", ascending=False)
)

# Display the 10 largest groups without truncating cell values.
newData.show(n=10, truncate=False)

+-----+------+-----------+
|State|Color |Total Count|
+-----+------+-----------+
|CA   |Yellow|1807       |
|WA   |Green |1779       |
|OR   |Orange|1743       |
|TX   |Green |1737       |
|TX   |Red   |1725       |
|CA   |Green |1723       |
|CO   |Yellow|1721       |
|CA   |Brown |1718       |
|CO   |Green |1713       |
|NV   |Orange|1712       |
+-----+------+-----------+
only showing top 10 rows



### Find the aggregate count for California by filtering on State

In [7]:
# Keep only the rows whose State is "CA", then count the records for each
# color and order the colors from most to fewest records.
calData = (
    data
    .select("State", "Color", "Count")
    .where(col("State") == "CA")
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total Count"))
    .sort("Total Count", ascending=False)
)

# Display up to 10 rows without truncating cell values.
calData.show(n=10, truncate=False)

+-----+------+-----------+
|State|Color |Total Count|
+-----+------+-----------+
|CA   |Yellow|1807       |
|CA   |Green |1723       |
|CA   |Brown |1718       |
|CA   |Orange|1657       |
|CA   |Red   |1656       |
|CA   |Blue  |1603       |
+-----+------+-----------+

