# Exploratory Data Analysis
### 02/17/2020
### Group 6: Lukas Hering, Manothay Tommy Luangrath, Ian Luck, Andrea Simenstad

### Prepare Environment

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
import findspark
findspark.init() 

### Configure a SparkSession

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
	.master("local[*]") \
	.appName("Learning_Spark") \
	.getOrCreate() 

### Load used cars database from csv
Source: https://www.kaggle.com/orgesleka/used-cars-database


In [0]:
from google.colab import files
files.upload()

Saving autos.csv to autos.csv


In [0]:
data = spark.read.csv('autos.csv',inferSchema=True, header=True)

# Data Exploration

In [0]:
number_of_rows = data.count()
number_of_fields = len(data.columns)
"There are {} data entries. Each entry has {} fields".format(number_of_rows, number_of_fields)

'There are 371824 data entries. Each entry has 20 fields'

In [0]:
data.show(5)

+-------------------+--------------------+------+---------+-----+------+-----------+------------------+---------+-------+-----+---------+-------------------+--------+----------+-----------------+-------------------+------------+----------+-------------------+
|        dateCrawled|                name|seller|offerType|price|abtest|vehicleType|yearOfRegistration|  gearbox|powerPS|model|kilometer|monthOfRegistration|fuelType|     brand|notRepairedDamage|        dateCreated|nrOfPictures|postalCode|           lastSeen|
+-------------------+--------------------+------+---------+-----+------+-----------+------------------+---------+-------+-----+---------+-------------------+--------+----------+-----------------+-------------------+------------+----------+-------------------+
|2016-03-24 11:52:17|          Golf_3_1.6|privat|  Angebot|  480|  test|       null|              1993|  manuell|      0| golf|   150000|                  0|  benzin|volkswagen|             null|2016-03-24 00:00:00|     

In [0]:
data.printSchema()

root
 |-- dateCrawled: timestamp (nullable = true)
 |-- name: string (nullable = true)
 |-- seller: string (nullable = true)
 |-- offerType: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- abtest: string (nullable = true)
 |-- vehicleType: string (nullable = true)
 |-- yearOfRegistration: integer (nullable = true)
 |-- gearbox: string (nullable = true)
 |-- powerPS: integer (nullable = true)
 |-- model: string (nullable = true)
 |-- kilometer: integer (nullable = true)
 |-- monthOfRegistration: integer (nullable = true)
 |-- fuelType: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- notRepairedDamage: string (nullable = true)
 |-- dateCreated: timestamp (nullable = true)
 |-- nrOfPictures: integer (nullable = true)
 |-- postalCode: integer (nullable = true)
 |-- lastSeen: timestamp (nullable = true)



In [0]:
data.select("name","dateCreated","lastSeen","price").show(15, truncate=False)

+-----------------------------------------------------------------+-------------------+-------------------+-----+
|name                                                             |dateCreated        |lastSeen           |price|
+-----------------------------------------------------------------+-------------------+-------------------+-----+
|Golf_3_1.6                                                       |2016-03-24 00:00:00|2016-04-07 03:16:57|480  |
|A5_Sportback_2.7_Tdi                                             |2016-03-24 00:00:00|2016-04-07 01:46:50|18300|
|Jeep_Grand_Cherokee_"Overland"                                   |2016-03-14 00:00:00|2016-04-05 12:47:46|9800 |
|GOLF_4_1_4__3T�RER                                               |2016-03-17 00:00:00|2016-03-17 17:40:17|1500 |
|Skoda_Fabia_1.4_TDI_PD_Classic                                   |2016-03-31 00:00:00|2016-04-06 10:17:21|3600 |
|BMW_316i___e36_Limousine___Bastlerfahrzeug__Export               |2016-04-04 00:00:00|2

### Statistical Description of Dataset

In [0]:
data.describe(["price","kilometer"]).show()

+-------+------------------+------------------+
|summary|             price|         kilometer|
+-------+------------------+------------------+
|  count|            371823|            371823|
|   mean|17286.338865535483|125618.56044408226|
| stddev|3586530.1840677676| 40111.62016494434|
|    min|                 0|              5000|
|    max|        2147483647|            150000|
+-------+------------------+------------------+



In [0]:
group_by_gearbox = data.groupby('gearbox')
group_by_brand = data.groupby('brand')
group_by_notRepairedDamage = data.groupby('notRepairedDamage')

In [0]:
group_by_gearbox.agg({'price': 'avg'}).show()

+---------+------------------+
|  gearbox|        avg(price)|
+---------+------------------+
|     null|43990.346585570886|
|automatik|15145.544156332206|
|  manuell|15920.486752589904|
+---------+------------------+



In [0]:
group_by_brand.agg({'price': 'avg'}).show()

+-------------+------------------+
|        brand|        avg(price)|
+-------------+------------------+
|       jaguar|14228.083735909822|
|     daihatsu|1691.6815365551424|
|   mitsubishi|3274.6647078028077|
|         null|              null|
|         lada|3037.4444444444443|
|       toyota| 5235.104468085106|
|         seat| 4356.156703672075|
|         saab| 3808.686090225564|
|   land_rover|16698.858625162127|
|      peugeot| 3168.667391698387|
|     chrysler| 3912.255158184319|
|      citroen| 8875.202082529888|
|         audi| 15863.35696264097|
|mercedes_benz|17244.502687715725|
|          bmw|14838.656460137465|
|         jeep|11998.898514851486|
|       lancia| 3165.754132231405|
|        skoda| 6413.099698955198|
|        rover| 1508.022448979592|
|      hyundai| 5417.859139490271|
+-------------+------------------+
only showing top 20 rows



In [0]:
group_by_brand.agg({'price': 'count'}).show()

+-------------+------------+
|        brand|count(price)|
+-------------+------------+
|       jaguar|         621|
|     daihatsu|         807|
|   mitsubishi|        3063|
|         null|           0|
|         lada|         225|
|       toyota|        4700|
|         seat|        7026|
|         saab|         532|
|   land_rover|         771|
|      peugeot|       11034|
|     chrysler|        1454|
|      citroen|        5186|
|         audi|       32897|
|mercedes_benz|       35346|
|          bmw|       40301|
|         jeep|         808|
|       lancia|         484|
|        skoda|        5647|
|        rover|         490|
|      hyundai|        3649|
+-------------+------------+
only showing top 20 rows



In [0]:
group_by_notRepairedDamage.agg({'price': 'avg'}).show()

+-----------------+-----------------+
|notRepairedDamage|       avg(price)|
+-----------------+-----------------+
|             nein|9128.952507688218|
|             null| 22760.0665252416|
|               ja|65586.87130267144|
+-----------------+-----------------+



In [0]:
group_by_notRepairedDamage.agg({'price': 'count'}).show()

+-----------------+------------+
|notRepairedDamage|count(price)|
+-----------------+------------+
|             nein|      263390|
|             null|       72123|
|               ja|       36310|
+-----------------+------------+

