In [1]:
from pyspark.sql import SparkSession



### Creating APP

In [2]:
spark = SparkSession.builder \
    .master("local") \
    .appName("Bangladesh India Cricket") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

##### Reading dataset

In [140]:
dataset = spark.read.csv('/home/hasan/DATA SET/datasets_33080_43333_car data.csv', header=True)


##### Initializing Dataset

In [141]:
#head of the dataset
dataset.show()


+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|     Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|
+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|         ritz|2014|         3.35|         5.59|     27000|   Petrol|     Dealer|      Manual|    0|
|          sx4|2013|         4.75|         9.54|     43000|   Diesel|     Dealer|      Manual|    0|
|         ciaz|2017|         7.25|         9.85|      6900|   Petrol|     Dealer|      Manual|    0|
|      wagon r|2011|         2.85|         4.15|      5200|   Petrol|     Dealer|      Manual|    0|
|        swift|2014|          4.6|         6.87|     42450|   Diesel|     Dealer|      Manual|    0|
|vitara brezza|2018|         9.25|         9.83|      2071|   Diesel|     Dealer|      Manual|    0|
|         ciaz|2015|         6.75|         8.12|     18796|   Petrol|     Dealer|      Manu

In [142]:
# checking data type and null value of the column
dataset.printSchema()


root
 |-- Car_Name: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Selling_Price: string (nullable = true)
 |-- Present_Price: string (nullable = true)
 |-- Kms_Driven: string (nullable = true)
 |-- Fuel_Type: string (nullable = true)
 |-- Seller_Type: string (nullable = true)
 |-- Transmission: string (nullable = true)
 |-- Owner: string (nullable = true)



##### Checking null value in every column

In [145]:
from pyspark.sql.functions import isnan, when, count, col

dataset.select([count(when(isnan(c), c)).alias(c) for c in dataset.columns]).show()


+--------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|
+--------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|       0|   0|            0|            0|         0|        0|          0|           0|    0|
+--------+----+-------------+-------------+----------+---------+-----------+------------+-----+



### Converting Dataset to SQL Query

In [161]:
# Register the DataFrame as a SQL temporary view
dataset.createOrReplaceTempView("car_dataset")


##### How to select Column in SQL

In [162]:
#selecting all column
spark.sql("select * from car_dataset").show()


+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|     Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|
+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+
|         ritz|2014|         3.35|         5.59|     27000|   Petrol|     Dealer|      Manual|    0|
|          sx4|2013|         4.75|         9.54|     43000|   Diesel|     Dealer|      Manual|    0|
|         ciaz|2017|         7.25|         9.85|      6900|   Petrol|     Dealer|      Manual|    0|
|      wagon r|2011|         2.85|         4.15|      5200|   Petrol|     Dealer|      Manual|    0|
|        swift|2014|          4.6|         6.87|     42450|   Diesel|     Dealer|      Manual|    0|
|vitara brezza|2018|         9.25|         9.83|      2071|   Diesel|     Dealer|      Manual|    0|
|         ciaz|2015|         6.75|         8.12|     18796|   Petrol|     Dealer|      Manu

In [163]:
#selecting a column
spark.sql("select Car_Name from car_dataset").show()


+-------------+
|     Car_Name|
+-------------+
|         ritz|
|          sx4|
|         ciaz|
|      wagon r|
|        swift|
|vitara brezza|
|         ciaz|
|      s cross|
|         ciaz|
|         ciaz|
|     alto 800|
|         ciaz|
|         ciaz|
|       ertiga|
|        dzire|
|       ertiga|
|       ertiga|
|       ertiga|
|      wagon r|
|          sx4|
+-------------+
only showing top 20 rows



In [164]:
#selecting two column
spark.sql("select Selling_Price,Present_Price from car_dataset").show()


+-------------+-------------+
|Selling_Price|Present_Price|
+-------------+-------------+
|         3.35|         5.59|
|         4.75|         9.54|
|         7.25|         9.85|
|         2.85|         4.15|
|          4.6|         6.87|
|         9.25|         9.83|
|         6.75|         8.12|
|          6.5|         8.61|
|         8.75|         8.89|
|         7.45|         8.92|
|         2.85|          3.6|
|         6.85|        10.38|
|          7.5|         9.94|
|          6.1|         7.71|
|         2.25|         7.21|
|         7.75|        10.79|
|         7.25|        10.79|
|         7.75|        10.79|
|         3.25|         5.09|
|         2.65|         7.98|
+-------------+-------------+
only showing top 20 rows



### Use of where in SQL

In [165]:
#selecting a specific value
spark.sql("select Year from car_dataset where Year==2014").show()


+----+
|Year|
+----+
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
|2014|
+----+
only showing top 20 rows



In [166]:
#price in specific year
spark.sql("select Year,Present_Price from car_dataset where Year==2014").show()


+----+-------------+
|Year|Present_Price|
+----+-------------+
|2014|         5.59|
|2014|         6.87|
|2014|         3.46|
|2014|         7.49|
|2014|         9.95|
|2014|         8.06|
|2014|         3.98|
|2014|         8.06|
|2014|        12.04|
|2014|          6.8|
|2014|        35.96|
|2014|        35.96|
|2014|         6.95|
|2014|         6.76|
|2014|         8.93|
|2014|         6.76|
|2014|        16.09|
|2014|         3.45|
|2014|          2.4|
|2014|          1.2|
+----+-------------+
only showing top 20 rows



In [167]:
# advance filtering
spark.sql("select Year,Seller_Type from car_dataset where (Year==2014 AND Seller_Type=='Dealer')").show()


+----+-----------+
|Year|Seller_Type|
+----+-----------+
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
|2014|     Dealer|
+----+-----------+
only showing top 20 rows



In [168]:
# advance filtering
spark.sql("select Year,Seller_Type,Kms_Driven from car_dataset where (Year==2015 AND Seller_Type=='Dealer' AND Kms_Driven>50000)").show()


+----+-----------+----------+
|Year|Seller_Type|Kms_Driven|
+----+-----------+----------+
|2015|     Dealer|     51000|
|2015|     Dealer|     61381|
|2015|     Dealer|     68000|
|2015|     Dealer|     60076|
|2015|     Dealer|     60000|
+----+-----------+----------+



### Statistical SQL

In [169]:
# summation of Present_Price
spark.sql("select SUM(Present_Price) from car_dataset").show()


+----------------------------------+
|sum(CAST(Present_Price AS DOUBLE))|
+----------------------------------+
|                2296.1700000000005|
+----------------------------------+



In [170]:
# Maximum of Present_Price
spark.sql("select MAX(Present_Price) from car_dataset").show()


+------------------+
|max(Present_Price)|
+------------------+
|              92.6|
+------------------+



In [171]:
# average of Present_Price
spark.sql("select AVG(Present_Price) from car_dataset").show()


+----------------------------------+
|avg(CAST(Present_Price AS DOUBLE))|
+----------------------------------+
|                 7.628471760797344|
+----------------------------------+



In [172]:
#printing distinct element of a column
spark.sql("select DISTINCT Fuel_Type from car_dataset ").show()


+---------+
|Fuel_Type|
+---------+
|   Diesel|
|      CNG|
|   Petrol|
+---------+



In [173]:
#printing distinct element of a column
spark.sql(" select SUM(Present_Price) from car_dataset where Year=2015 ").show()


+----------------------------------+
|sum(CAST(Present_Price AS DOUBLE))|
+----------------------------------+
|                 499.1000000000001|
+----------------------------------+



### Some Complex query

In [174]:
#printing price of some specific car
spark.sql(" select SUM(Present_Price) from car_dataset where Car_Name IN ('ritz','sx4','ciaz','wagon r','swift','vitara brezza') ").show()


+----------------------------------+
|sum(CAST(Present_Price AS DOUBLE))|
+----------------------------------+
|                216.89999999999995|
+----------------------------------+



In [175]:
#printing price of some specific car
spark.sql(" select Car_Name, SUM(Present_Price) from car_dataset where Present_Price>3 Group By Car_Name ").show()


+-------------+----------------------------------+
|     Car_Name|sum(CAST(Present_Price AS DOUBLE))|
+-------------+----------------------------------+
|corolla altis|                            281.32|
|      wagon r|                              17.8|
|  etios cross|                              23.9|
|        creta|                              40.8|
|          i10|                             22.83|
| land cruiser|                              92.6|
|       ertiga|                             57.74|
|         brio|                             60.44|
|        amaze|                 47.49999999999999|
|       baleno|                              7.87|
|         ciaz|                             87.81|
|        dzire|                             30.53|
|      s cross|                              8.61|
|      corolla|                             12.35|
|         city|                            306.59|
|          sx4|                              48.4|
|      etios g|                

In [176]:
# multiple column with a condition
spark.sql(" select Car_Name from car_dataset where Year=2014").show()


+--------------+
|      Car_Name|
+--------------+
|          ritz|
|         swift|
|      alto k10|
|         swift|
|        ertiga|
|         dzire|
|      alto k10|
|         dzire|
|          ciaz|
|       etios g|
|      fortuner|
|      fortuner|
|    etios liva|
|    etios liva|
|   etios cross|
|       etios g|
|        innova|
|Hyosung GT250R|
| KTM 390 Duke |
| Honda CBR 150|
+--------------+
only showing top 20 rows



In [191]:
# multiple column with a condition
spark.sql(" select (Present_Price - Selling_Price) as price_difference from car_dataset ").show(5)


+------------------+
|  price_difference|
+------------------+
|2.2399999999999998|
| 4.789999999999999|
|2.5999999999999996|
|1.3000000000000003|
|2.2700000000000005|
+------------------+
only showing top 5 rows



In [190]:
# filterring with column name
spark.sql("select max(Year) as maximum_year, count(Seller_Type) as seller_type_number from car_dataset").show()


+------------+------------------+
|maximum_year|seller_type_number|
+------------+------------------+
|        2018|               301|
+------------+------------------+



In [200]:
# filterring with column name
spark.sql("select max(Year) as maximum_year, count(Seller_Type) as seller_type_number from car_dataset where Fuel_Type=='Diesel' ").show()


+------------+------------------+
|maximum_year|seller_type_number|
+------------+------------------+
|        2018|                60|
+------------+------------------+



### use of concat

In [205]:
# adding two column
spark.sql("select concat(Car_Name,', ',Fuel_Type) as name_and_fuel_type from car_dataset").show()


+--------------------+
|  name_and_fuel_type|
+--------------------+
|        ritz, Petrol|
|         sx4, Diesel|
|        ciaz, Petrol|
|     wagon r, Petrol|
|       swift, Diesel|
|vitara brezza, Di...|
|        ciaz, Petrol|
|     s cross, Diesel|
|        ciaz, Diesel|
|        ciaz, Diesel|
|    alto 800, Petrol|
|        ciaz, Diesel|
|        ciaz, Petrol|
|      ertiga, Petrol|
|       dzire, Petrol|
|      ertiga, Diesel|
|      ertiga, Diesel|
|      ertiga, Diesel|
|        wagon r, CNG|
|         sx4, Petrol|
+--------------------+
only showing top 20 rows



#### finding difference of two column

In [177]:
from pyspark.sql.functions import col

In [178]:
difference_of_price=dataset.select(((col("Present_Price") - col("Selling_Price"))).alias("sub"))
difference_of_price.show()


+-------------------+
|                sub|
+-------------------+
| 2.2399999999999998|
|  4.789999999999999|
| 2.5999999999999996|
| 1.3000000000000003|
| 2.2700000000000005|
| 0.5800000000000001|
| 1.3699999999999992|
| 2.1099999999999994|
|0.14000000000000057|
| 1.4699999999999998|
|               0.75|
|  3.530000000000001|
| 2.4399999999999995|
| 1.6100000000000003|
|               4.96|
|  3.039999999999999|
|  3.539999999999999|
|  3.039999999999999|
| 1.8399999999999999|
|               5.33|
+-------------------+
only showing top 20 rows



#### adding new column with original dataset

In [182]:
dataset_with_new_column=dataset.withColumn("sum", col("Present_Price")-col("Selling_Price"))
dataset_with_new_column.show(7)


+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+------------------+
|     Car_Name|Year|Selling_Price|Present_Price|Kms_Driven|Fuel_Type|Seller_Type|Transmission|Owner|               sum|
+-------------+----+-------------+-------------+----------+---------+-----------+------------+-----+------------------+
|         ritz|2014|         3.35|         5.59|     27000|   Petrol|     Dealer|      Manual|    0|2.2399999999999998|
|          sx4|2013|         4.75|         9.54|     43000|   Diesel|     Dealer|      Manual|    0| 4.789999999999999|
|         ciaz|2017|         7.25|         9.85|      6900|   Petrol|     Dealer|      Manual|    0|2.5999999999999996|
|      wagon r|2011|         2.85|         4.15|      5200|   Petrol|     Dealer|      Manual|    0|1.3000000000000003|
|        swift|2014|          4.6|         6.87|     42450|   Diesel|     Dealer|      Manual|    0|2.2700000000000005|
|vitara brezza|2018|         9.25|      