In [21]:
from pyspark.sql import *
from pyspark.sql.functions import *
import os
import sys

# Make PySpark use the same interpreter that is running this kernel, for both
# interpreter variables, so the two cannot end up on different Pythons.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

# Create the session, or reuse one that already exists, with a local[2] master.
spark = (
    SparkSession.builder
    .appName("Hello Spark")
    .master("local[2]")
    .getOrCreate()
)

In [22]:
# Load the CSV, treating its first line as the header and letting Spark infer
# each column's type from the data.
fire_df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("spark_transformations_03.csv")
)


In [23]:
# Print the loaded rows as a text table to check the header and the parsed values.
fire_df.show()

+--------+--------+----+----------+-----+
|     day|    city|temp|wind speed|event|
+--------+--------+----+----------+-----+
|20110014|new york|  23|         6| rain|
|20110014|new york|  24|         7|sunny|
|20110014|new york|  25|         8|rainy|
|20110014|new york|  26|         9| rain|
|20110014|new york|  27|        10| rain|
|20110014|  mumbai|  28|        11|sunny|
|20110014|  mumbai|  29|        12|rainy|
|20110014|  mumbai|  30|        13| snow|
|20110014|  mumbai|  31|        14| snow|
|20110014|  mumbai|  32|        15| rain|
|20110014|  mumbai|  33|        16|sunny|
|20110014|   paris|  34|        17|rainy|
|20110014|   paris|  34|        17|rainy|
|20110014|   paris|  34|        17|rainy|
|20110014|   paris|  34|        17|rainy|
+--------+--------+----+----------+-----+



# Remove the spaces from the column names.
# Notes:
# 1. You can chain Spark transformation methods one after another.
# 2. Each Spark transformation returns a new DataFrame instead of modifying the old one.
# 3. Spark DataFrames are immutable.
# 4. Spark column names are case-insensitive by default.

In [24]:
# Rename "wind speed" (drops the space) and "event"; the result is a new
# DataFrame bound to rename_fire_df, and fire_df keeps its original columns.
rename_fire_df = (
    fire_df
    .withColumnRenamed("wind speed", "windspeed")
    .withColumnRenamed("event", "Events")
)

In [25]:
# Print the rows again to confirm the new column names (windspeed, Events).
rename_fire_df.show()

+--------+--------+----+---------+------+
|     day|    city|temp|windspeed|Events|
+--------+--------+----+---------+------+
|20110014|new york|  23|        6|  rain|
|20110014|new york|  24|        7| sunny|
|20110014|new york|  25|        8| rainy|
|20110014|new york|  26|        9|  rain|
|20110014|new york|  27|       10|  rain|
|20110014|  mumbai|  28|       11| sunny|
|20110014|  mumbai|  29|       12| rainy|
|20110014|  mumbai|  30|       13|  snow|
|20110014|  mumbai|  31|       14|  snow|
|20110014|  mumbai|  32|       15|  rain|
|20110014|  mumbai|  33|       16| sunny|
|20110014|   paris|  34|       17| rainy|
|20110014|   paris|  34|       17| rainy|
|20110014|   paris|  34|       17| rainy|
|20110014|   paris|  34|       17| rainy|
+--------+--------+----+---------+------+



In [26]:
# Print each column's name, inferred type, and nullability.
rename_fire_df.printSchema()

root
 |-- day: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- temp: integer (nullable = true)
 |-- windspeed: integer (nullable = true)
 |-- Events: string (nullable = true)



# Change the data type of a column, e.g. convert the day field to the date type.

In [35]:
# Convert the integer `day` column (8 digits, printed as e.g. 20110014) to a date.
# The pattern must describe the text being parsed: a "yyyy-MM-dd " pattern (hyphens
# plus a trailing space) matches none of these values and turns every row into NULL,
# so the value is cast to a string and parsed as contiguous digits instead.
# NOTE(review): presumably the layout is yyyyMMdd - confirm against the data source.
# The sample values carry "00" in the month position, which is not a valid month,
# so those rows may still come out as NULL until the source data is corrected.
new_df = rename_fire_df.withColumn(
    "day", to_date(col("day").cast("string"), "yyyyMMdd")
)

In [36]:
# Confirm that `day` is now reported with the date type.
new_df.printSchema()

root
 |-- day: date (nullable = true)
 |-- city: string (nullable = true)
 |-- temp: integer (nullable = true)
 |-- windspeed: integer (nullable = true)
 |-- Events: string (nullable = true)



In [37]:
# In this run display() printed only the DataFrame's column names and types;
# the rows themselves are printed by show().
display(new_df)

DataFrame[day: date, city: string, temp: int, windspeed: int, Events: string]

In [39]:
# Print the rows to inspect the converted `day` values.
new_df.show()

+----+--------+----+---------+------+
| day|    city|temp|windspeed|Events|
+----+--------+----+---------+------+
|NULL|new york|  23|        6|  rain|
|NULL|new york|  24|        7| sunny|
|NULL|new york|  25|        8| rainy|
|NULL|new york|  26|        9|  rain|
|NULL|new york|  27|       10|  rain|
|NULL|  mumbai|  28|       11| sunny|
|NULL|  mumbai|  29|       12| rainy|
|NULL|  mumbai|  30|       13|  snow|
|NULL|  mumbai|  31|       14|  snow|
|NULL|  mumbai|  32|       15|  rain|
|NULL|  mumbai|  33|       16| sunny|
|NULL|   paris|  34|       17| rainy|
|NULL|   paris|  34|       17| rainy|
|NULL|   paris|  34|       17| rainy|
|NULL|   paris|  34|       17| rainy|
+----+--------+----+---------+------+



# Querying dataframes : 
# 1. Using SQL - first register the DataFrame as a temporary view, then run your SQL queries on that view. For example:
# Register the DataFrame as a temporary view first: the query below reads from
# "view_name" and cannot be resolved unless a view with that name exists.
new_df.createOrReplaceTempView("view_name")
ql_sql = spark.sql("""
    select distinct(city) from view_name
""")
display(ql_sql)
# 2. DataFrame transformation approach
# DataFrame transformation methods: where(), select(), distinct() (count() is a transformation only on grouped data, e.g. groupBy().count()).
# DataFrame action methods: count(), show()
# DataFrame function methods: expr()

In [40]:
# Distinct non-null cities, exposed under the alias Distinct_City.
ql_df = (
    new_df
    .where("city is not null")
    .select(expr("city as Distinct_City"))
    .distinct()
)
# count() is an action: it runs the query and returns the number of distinct cities.
print(ql_df.count())

3


In [41]:
# Print the distinct cities.
# NOTE(review): the saved output for this cell is headed `city`, not `Distinct_City`,
# so it appears to predate the alias used when building ql_df - re-run to refresh.
ql_df.show()

+--------+
|    city|
+--------+
|new york|
|   paris|
|  mumbai|
+--------+



In [44]:
# Distinct (city, windspeed, temp, events) combinations for rows with a known
# city and a temperature above 30.
# show() only prints and returns None, so it is called as a separate statement;
# chaining it onto the assignment would leave ql_df_1 bound to None.
ql_df_1 = (
    new_df
    .where("city is not null And temp >30")
    .select(expr("city as Distinct_City"), "windspeed", "temp", "events")
    .distinct()
)
ql_df_1.show()



+-------------+---------+----+------+
|Distinct_City|windspeed|temp|events|
+-------------+---------+----+------+
|        paris|       17|  34| rainy|
|       mumbai|       14|  31|  snow|
|       mumbai|       16|  33| sunny|
|       mumbai|       15|  32|  rain|
+-------------+---------+----+------+



In [54]:
# Rows with a known city and temp above 30, hottest first; duplicates are kept
# because distinct() is not applied here.
(
    new_df
    .select(expr("city as Distinct_City"), "windspeed", "temp", "events")
    .where("city is not null And temp >30")
    .orderBy(desc("temp"))
    .show()
)



+-------------+---------+----+------+
|Distinct_City|windspeed|temp|events|
+-------------+---------+----+------+
|        paris|       17|  34| rainy|
|        paris|       17|  34| rainy|
|        paris|       17|  34| rainy|
|        paris|       17|  34| rainy|
|       mumbai|       16|  33| sunny|
|       mumbai|       15|  32|  rain|
|       mumbai|       14|  31|  snow|
+-------------+---------+----+------+

