# Initializing

In [None]:
!pip3 install -q pyspark

[K     |████████████████████████████████| 281.4 MB 36 kB/s 
[K     |████████████████████████████████| 198 kB 57.2 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [None]:
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session that runs in local mode; "local[*]" asks for
# as many worker threads as there are logical cores on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and show it as the cell output.
sc = spark.sparkContext
sc

# Loading Data

In [None]:
# Load the bike listings CSV into a DataFrame: the first line is treated as the
# header row and column types are inferred from the data.
bike = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("/content/BIKE DETAILS.csv")
)

In [None]:
# Print the column names, inferred types, and nullability as a tree.
bike.printSchema()

root
 |-- name: string (nullable = true)
 |-- selling_price: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- seller_type: string (nullable = true)
 |-- owner: string (nullable = true)
 |-- km_driven: integer (nullable = true)
 |-- ex_showroom_price: integer (nullable = true)



In [None]:
# Peek at the first five rows; head(n) returns them as a list of Row objects.
bike.head(5)

[Row(name='Royal Enfield Classic 350', selling_price=175000, year=2019, seller_type='Individual', owner='1st owner', km_driven=350, ex_showroom_price=None),
 Row(name='Honda Dio', selling_price=45000, year=2017, seller_type='Individual', owner='1st owner', km_driven=5650, ex_showroom_price=None),
 Row(name='Royal Enfield Classic Gunmetal Grey', selling_price=150000, year=2018, seller_type='Individual', owner='1st owner', km_driven=12000, ex_showroom_price=148114),
 Row(name='Yamaha Fazer FI V 2.0 [2016-2018]', selling_price=65000, year=2015, seller_type='Individual', owner='1st owner', km_driven=23000, ex_showroom_price=89643),
 Row(name='Yamaha SZ [2013-2014]', selling_price=20000, year=2011, seller_type='Individual', owner='2nd owner', km_driven=21000, ex_showroom_price=None)]

# Data Manipulations

In [None]:
# Render the leading rows as a text table (long string values are shortened with "...").
bike.show()

+--------------------+-------------+----+-----------+---------+---------+-----------------+
|                name|selling_price|year|seller_type|    owner|km_driven|ex_showroom_price|
+--------------------+-------------+----+-----------+---------+---------+-----------------+
|Royal Enfield Cla...|       175000|2019| Individual|1st owner|      350|             null|
|           Honda Dio|        45000|2017| Individual|1st owner|     5650|             null|
|Royal Enfield Cla...|       150000|2018| Individual|1st owner|    12000|           148114|
|Yamaha Fazer FI V...|        65000|2015| Individual|1st owner|    23000|            89643|
|Yamaha SZ [2013-2...|        20000|2011| Individual|2nd owner|    21000|             null|
|    Honda CB Twister|        18000|2010| Individual|1st owner|    60000|            53857|
|Honda CB Hornet 160R|        78500|2018| Individual|1st owner|    17000|            87719|
|Royal Enfield Bul...|       180000|2008| Individual|2nd owner|    39000|       

In [None]:
# Total number of rows in the dataset.
bike.count()

1061

In [None]:
# Number of columns in the dataset.
len(bike.columns)

7

In [None]:
# Column names as a plain Python list.
bike.columns

['name',
 'selling_price',
 'year',
 'seller_type',
 'owner',
 'km_driven',
 'ex_showroom_price']

In [None]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# String columns report null for mean/stddev, and a count below the row total
# reveals missing values in that column.
bike.describe().show()

+-------+---------+------------------+------------------+-----------+---------+------------------+-----------------+
|summary|     name|     selling_price|              year|seller_type|    owner|         km_driven|ex_showroom_price|
+-------+---------+------------------+------------------+-----------+---------+------------------+-----------------+
|  count|     1061|              1061|              1061|       1061|     1061|              1061|              626|
|   mean|     null|59638.151743638075|2013.8671065032988|       null|     null| 34359.83317624882|87958.71405750798|
| stddev|     null| 56304.29197302415| 4.301191368192697|       null|     null|51623.152701596315|77496.58718945317|
|    min|Activa 3g|              5000|              1988|     Dealer|1st owner|               350|            30490|
|    max| Yo Style|            760000|              2020| Individual|4th owner|            880000|          1278000|
+-------+---------+------------------+------------------+-------

In [None]:
# Summary statistics restricted to the selling_price column.
bike.describe('selling_price').show()

+-------+------------------+
|summary|     selling_price|
+-------+------------------+
|  count|              1061|
|   mean|59638.151743638075|
| stddev| 56304.29197302415|
|    min|              5000|
|    max|            760000|
+-------+------------------+



In [None]:
# Summary statistics restricted to the km_driven column.
bike.describe('km_driven').show()

+-------+------------------+
|summary|         km_driven|
+-------+------------------+
|  count|              1061|
|   mean| 34359.83317624882|
| stddev|51623.152701596315|
|    min|               350|
|    max|            880000|
+-------+------------------+



In [None]:
# Summary statistics restricted to the ex_showroom_price column; its count
# covers only the rows where the value is present.
bike.describe('ex_showroom_price').show()

+-------+-----------------+
|summary|ex_showroom_price|
+-------+-----------------+
|  count|              626|
|   mean|87958.71405750798|
| stddev|77496.58718945317|
|    min|            30490|
|    max|          1278000|
+-------+-----------------+



In [None]:
# Number of distinct values in the name column.
bike.select('name').distinct().count()

279

In [None]:
# Number of distinct values in the seller_type column.
bike.select('seller_type').distinct().count()

2

In [None]:
# Number of distinct values in the owner column.
bike.select('owner').distinct().count()

4

In [None]:
# Contingency table of name versus owner: one row per bike name and one count
# column per owner value.
bike.crosstab('name', 'owner').show()

+--------------------+---------+---------+---------+---------+
|          name_owner|1st owner|2nd owner|3rd owner|4th owner|
+--------------------+---------+---------+---------+---------+
|   Yamaha YZF R15 V3|        2|        0|        0|        0|
|     Honda Activa 4G|        3|        0|        0|        0|
|   Bajaj Avenger 220|        3|        0|        0|        0|
|       Honda Karizma|        2|        0|        0|        0|
|Bajaj Avenger Cru...|        4|        0|        0|        0|
|           TVS Sport|        3|        0|        0|        0|
|   Hero Karizma 2014|        4|        1|        0|        0|
|      Mahindra Rodeo|        1|        0|        0|        0|
|          TVS Victor|        3|        0|        0|        0|
|     Hero Honda Hunk|        7|        0|        0|        0|
|Hero Splendor iSmart|        2|        1|        0|        0|
|Hero Honda Passio...|        1|        0|        0|        0|
|    Honda Activa 125|        4|        0|        0|   

In [None]:
# List the unique (name, owner) combinations that occur in the data.
(
    bike
    .select('name', 'owner')
    .dropDuplicates()
    .show()
)

+--------------------+---------+
|                name|    owner|
+--------------------+---------+
|  Hero Splendor Plus|1st owner|
|  Hero Honda Passion|2nd owner|
|Honda CB Unicorn 150|1st owner|
|Hero Karizma [200...|1st owner|
|Bajaj Avenger 150...|1st owner|
|       Jawa Standard|1st owner|
|Kawasaki Ninja 65...|2nd owner|
|Bajaj Avenger Cru...|1st owner|
|Hero Honda Splend...|1st owner|
|    Hero Glamour 125|1st owner|
|Honda Activa 125 ...|1st owner|
|          KTM RC 390|1st owner|
|  Hero Xtreme Sports|1st owner|
|KTM 390 Duke ABS ...|1st owner|
|Royal Enfield Con...|1st owner|
|         Bajaj Boxer|1st owner|
|  Bajaj Discover 135|1st owner|
|    Yamaha Fazer Dlx|1st owner|
|      Mahindra Gusto|2nd owner|
|   Hero Achiever 150|1st owner|
+--------------------+---------+
only showing top 20 rows



In [None]:
# Number of rows that remain after dropping rows containing null values.
# The result is not assigned, so `bike` itself is unchanged.
bike.dropna().count()

626

In [None]:
# Preview the first 10 rows with nulls replaced by 0. This is display-only:
# the filled DataFrame is not assigned, so `bike` still contains the nulls.
bike.fillna(0).show(10)

+--------------------+-------------+----+-----------+---------+---------+-----------------+
|                name|selling_price|year|seller_type|    owner|km_driven|ex_showroom_price|
+--------------------+-------------+----+-----------+---------+---------+-----------------+
|Royal Enfield Cla...|       175000|2019| Individual|1st owner|      350|                0|
|           Honda Dio|        45000|2017| Individual|1st owner|     5650|                0|
|Royal Enfield Cla...|       150000|2018| Individual|1st owner|    12000|           148114|
|Yamaha Fazer FI V...|        65000|2015| Individual|1st owner|    23000|            89643|
|Yamaha SZ [2013-2...|        20000|2011| Individual|2nd owner|    21000|                0|
|    Honda CB Twister|        18000|2010| Individual|1st owner|    60000|            53857|
|Honda CB Hornet 160R|        78500|2018| Individual|1st owner|    17000|            87719|
|Royal Enfield Bul...|       180000|2008| Individual|2nd owner|    39000|       

In [None]:
# Count the rows whose selling_price is strictly greater than 100000.
bike.filter(bike.selling_price>100000).count()

146

In [None]:
# Average selling_price for each bike name.
(
    bike
    .groupby('name')
    .agg({'selling_price': 'mean'})
    .show()
)

+--------------------+------------------+
|                name|avg(selling_price)|
+--------------------+------------------+
|           Hero Hunk|           31000.0|
|            TVS Wego|           25000.0|
|     Honda Activa 5G|55666.666666666664|
|Bajaj Avenger Cru...|           81750.0|
|         TVS Max DLX|           26000.0|
|         Honda Shine|33399.933333333334|
|   Yamaha FZ S V 2.0|           65562.5|
|Hero Splendor iSmart|43333.333333333336|
|Royal Enfield Thu...|          130000.0|
|Honda Activa i [2...|           35000.0|
|Royal Enfield Cla...|          160000.0|
|Royal Enfield Cla...|          140000.0|
|      TVS Victor GLX|           22500.0|
|  UM Renegade Mojave|          170000.0|
|        Vespa LX 125|           50000.0|
|           Activa 4g|           40000.0|
|          TVS Victor|           16000.0|
|        Yamaha RX135|           65000.0|
|Royal Enfield Thu...|102626.31578947368|
|      Hyosung GT250R|          135000.0|
+--------------------+------------

In [None]:
# Average km_driven for each bike name.
(
    bike
    .groupby('name')
    .agg({'km_driven': 'mean'})
    .show()
)

+--------------------+------------------+
|                name|    avg(km_driven)|
+--------------------+------------------+
|           Hero Hunk|           41400.0|
|            TVS Wego|           29250.0|
|     Honda Activa 5G|10935.583333333334|
|Bajaj Avenger Cru...|            7687.5|
|         TVS Max DLX|           28000.0|
|         Honda Shine|           34223.2|
|   Yamaha FZ S V 2.0|           27475.0|
|Hero Splendor iSmart|13018.333333333334|
|Royal Enfield Thu...| 7466.666666666667|
|Honda Activa i [2...|           20000.0|
|Royal Enfield Cla...|            7500.0|
|Royal Enfield Cla...|           25000.0|
|      TVS Victor GLX|           30000.0|
|  UM Renegade Mojave|            1400.0|
|        Vespa LX 125|            3909.0|
|           Activa 4g|            1300.0|
|          TVS Victor|50566.666666666664|
|        Yamaha RX135|           16500.0|
|Royal Enfield Thu...|19506.684210526317|
|      Hyosung GT250R|           16500.0|
+--------------------+------------

In [None]:
# Number of rows for each bike name.
bike.groupby('name').count().show()

+--------------------+-----+
|                name|count|
+--------------------+-----+
|           Hero Hunk|    5|
|            TVS Wego|    4|
|     Honda Activa 5G|   12|
|Bajaj Avenger Cru...|    4|
|         TVS Max DLX|    1|
|         Honda Shine|   15|
|   Yamaha FZ S V 2.0|   16|
|Hero Splendor iSmart|    3|
|Royal Enfield Thu...|    3|
|Honda Activa i [2...|    3|
|Royal Enfield Cla...|    1|
|Royal Enfield Cla...|    1|
|      TVS Victor GLX|    2|
|  UM Renegade Mojave|    1|
|        Vespa LX 125|    1|
|           Activa 4g|    1|
|          TVS Victor|    3|
|        Yamaha RX135|    1|
|Royal Enfield Thu...|   19|
|      Hyosung GT250R|    1|
+--------------------+-----+
only showing top 20 rows



In [None]:
# Number of rows for each year. No ordering is requested, so the groups are
# displayed in whatever order Spark returns them.
bike.groupby('year').count().show()

+----+-----+
|year|count|
+----+-----+
|2003|    1|
|2007|   29|
|2018|  131|
|2015|  100|
|2006|   20|
|2013|   73|
|1997|    2|
|1988|    1|
|2014|   91|
|2019|   86|
|2004|    5|
|1991|    1|
|1998|    3|
|2020|    3|
|2012|   70|
|2009|   28|
|2016|  107|
|1995|    1|
|2001|    2|
|2005|   14|
+----+-----+
only showing top 20 rows



In [None]:
# Number of rows for each owner category.
bike.groupby('owner').count().show()

+---------+-----+
|    owner|count|
+---------+-----+
|2nd owner|  123|
|4th owner|    3|
|3rd owner|   11|
|1st owner|  924|
+---------+-----+

