In [1]:
# Import Libraries
try:
    # PYSPARK
    from pyspark.sql import SparkSession
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    import pyspark.sql.types as tp
    import pyspark.sql.functions as F
    
    # OTHER LIBRARIES
    import pandas as pd
    import numpy as np
    import glob
    from functools import reduce
    import seaborn as sns
    import matplotlib.pyplot as plt
    from urllib.request import urlopen
    import datetime
    from pathlib import Path
    
    print('[SUCCESS]')

    #CATCH ERROR IMPORTING A LIBRARY
except ImportError as ie:
    raise ImportError(f'[Error importing]: {ie}')

[SUCCESS]


In [2]:
# INITIALIZING OUR SPARK SESSION AND APP
# - this will be the name for the spark session when running our individual notebooks
spark = SparkSession.builder.master('local').config("spark.executor.memory", "1g").config("spark.driver.memory", "2g").appName('UsedCar_Project').getOrCreate()

In [3]:
def load_data():
   
     # CHECKS TO SEE IF FILE EXIST
    path = Path('car_prices.csv') 
    
    # IF FILE DOES NOT EXIST
    if not path.is_file():
        !wget https://storage.googleapis.com/iamangelsh-public-datasets/car_prices.csv 
    
    
    
    # CREATE SCHEMA TO KEEP DATA TYPES
    schema = tp.StructType([tp.StructField('year', tp.IntegerType(), True),
                           tp.StructField('make', tp.StringType(), True),
                           tp.StructField('model', tp.StringType(), True),
                           tp.StructField('trim', tp.StringType(), True),
                           tp.StructField('body', tp.StringType(), True),
                           tp.StructField('transmission', tp.StringType(), True),
                           tp.StructField('vin', tp.StringType(), True),
                           tp.StructField('state', tp.StringType(), True),
                           tp.StructField('condition', tp.DoubleType(), True),
                           tp.StructField('odometer', tp.DoubleType(), True),
                           tp.StructField('color', tp.StringType(), True),
                           tp.StructField('interior', tp.StringType(), True),
                           tp.StructField('seller', tp.StringType(), True),
                           tp.StructField('mmr', tp.IntegerType(), True),
                           tp.StructField('sellingprice', tp.IntegerType(), True),
                           tp.StructField('saledate', tp.StringType(), True)])
    
    
    # LOAD IN DATA WITH SCHEMA
    df = spark.read.csv("car_prices.csv", header = True, sep=",", schema=schema)
    
    # FILTER OUT VIN NUMBERS FROM STATE COLUMN
    df = df.where(F.length(F.col("state")) <= 2)
    
    # DROP ROWS THAT CONTAIN NULL VALUES
    df = df.na.drop('any')
    
    # CREATE THRESHOLD FOR CONDITION COLUMN
    new_df = df.withColumn(
        'condition', 
        F.when(df.condition > 3.75, 'Great'
        ).when((df.condition >= 2) & (df.condition <= 3.75), 'Average'
        ).when(df.condition < 2, 'Bad'))
    
    # DROP COLUMNS THAT WON'T BE USED
    cols = ('trim', 'vin', 'interior', 'seller')
    new_df = new_df.drop(*cols)
    
    
    # USE MM DD YYYY FOR SALEDATE COLUMN
    new_df = df.withColumn(
        'saledate', F.substring('saledate', 5,11)
        ).withColumn(
        'saledate_year', F.substring('saledate', 7,5)
        ).withColumn(
        'saledate_month', F.substring('saledate', 1,3))
    new_df = new_df.withColumn(
        'saledate_year', F.col('saledate_year').cast(tp.IntegerType())
        )
    
    # RETURN NEW DATAFRAME
    return new_df


# LOAD THE DATA
df = load_data()

# SHOW DATA
df.show(5)

# SHOW NUMBER OF COLUMNS AND ROWS
print(f'Number of columns: {len(df.columns)} \nNumber of Rows: {df.count()}')
print()

# SHOW SCHEMA - DATATYPES
df.printSchema()

+----+-----+-------------------+----------+-----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-----+------------+-----------+-------------+--------------+
|year| make|              model|      trim| body|transmission|              vin|state|condition|odometer|color|interior|              seller|  mmr|sellingprice|   saledate|saledate_year|saledate_month|
+----+-----+-------------------+----------+-----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-----+------------+-----------+-------------+--------------+
|2015|  Kia|            Sorento|        LX|  SUV|   automatic|5xyktca69fg566472|   ca|      5.0| 16639.0|white|   black|kia motors americ...|20500|       21500|Dec 16 2014|         2014|           Dec|
|2015|  Kia|            Sorento|        LX|  SUV|   automatic|5xyktca69fg561319|   ca|      5.0|  9393.0|white|   beige|kia motors americ...|20800|       21500|Dec 16 2014|         2014|      

# Analysis Below

In [4]:
df.head()

Row(year=2015, make='Kia', model='Sorento', trim='LX', body='SUV', transmission='automatic', vin='5xyktca69fg566472', state='ca', condition=5.0, odometer=16639.0, color='white', interior='black', seller='kia motors america, inc', mmr=20500, sellingprice=21500, saledate='Dec 16 2014', saledate_year=2014, saledate_month='Dec')

In [5]:
df.toPandas()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate,saledate_year,saledate_month
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,"kia motors america, inc",20500,21500,Dec 16 2014,2014,Dec
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,"kia motors america, inc",20800,21500,Dec 16 2014,2014,Dec
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,4.5,1331.0,gray,black,financial services remarketing (lease),31900,30000,Jan 15 2015,2015,Jan
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,4.1,14282.0,white,black,volvo na rep/world omni,27500,27750,Jan 29 2015,2015,Jan
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,4.3,2641.0,gray,black,financial services remarketing (lease),66000,67000,Dec 18 2014,2014,Dec
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
472331,2011,BMW,5 Series,528i,Sedan,automatic,wbafr1c53bc744672,fl,3.9,66403.0,white,brown,lauderdale imports ltd bmw pembrok pines,20300,22800,Jul 07 2015,2015,Jul
472332,2012,Ram,2500,Power Wagon,Crew Cab,automatic,3c6td5et6cg112407,wa,5.0,54393.0,white,black,i -5 uhlmann rv,30200,30800,Jul 08 2015,2015,Jul
472333,2012,BMW,X5,xDrive35d,SUV,automatic,5uxzw0c58cl668465,ca,4.8,50561.0,black,black,financial services remarketing (lease),29800,34000,Jul 08 2015,2015,Jul
472334,2015,Nissan,Altima,2.5 S,sedan,automatic,1n4al3ap0fc216050,ga,3.8,16658.0,white,black,enterprise vehicle exchange / tra / rental / t...,15100,11100,Jul 09 2015,2015,Jul


In [7]:
df.describe().toPandas().head()

Unnamed: 0,summary,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate,saledate_year,saledate_month
0,count,472336.0,472336,472336,472336,472336,472336,472336,472336,472336.0,472336.0,472336,472336,472336,472336.0,472336.0,472336,472336.0,472336
1,mean,2010.2110446800584,,794.6975040463412,366.8299129085117,,,,,3.426575996745779,66701.07000313337,,,,13836.999773466348,13690.403670268624,,2014.922112648623,
2,stddev,3.822131225057812,,808.9891449223677,843.6602013432683,,,,,0.9436589017283668,51939.18342988344,,,,9532.13175274882,9612.962279185413,,0.2679945221970164,
3,min,1990.0,Acura,1 Series,!,Access Cab,automatic,137za84341e193591,al,1.0,1.0,beige,beige,"""enterprise vehicle exchange inc. ""dallas""""",25.0,1.0,Apr 01 2015,2014.0,Apr
4,max,2015.0,smart,xD,xDrive50i,xtracab,manual,zhwgu22t97la05738,wi,5.0,999999.0,—,—,zygi auto corp,182000.0,230000.0,May 31 2015,2015.0,May


In [15]:
df.createOrReplaceTempView("df")

In [44]:
spark.sql("select make, count(sellingprice) from df group by make order by count(sellingprice) desc").show(10)

+---------+-------------------+
|     make|count(sellingprice)|
+---------+-------------------+
|     Ford|              81014|
|Chevrolet|              54150|
|   Nissan|              44043|
|   Toyota|              35313|
|    Dodge|              27183|
|    Honda|              24781|
|  Hyundai|              18663|
|      BMW|              17509|
|      Kia|              15830|
| Chrysler|              15135|
+---------+-------------------+
only showing top 10 rows



In [26]:
spark.sql("select make, avg(sellingprice) from df group by make order by avg(sellingprice) desc").show()

+-------------+------------------+
|         make| avg(sellingprice)|
+-------------+------------------+
|  Rolls-Royce|         153456.25|
|      Ferrari|128852.94117647059|
|  Lamborghini|          111500.0|
|      Bentley| 72713.33333333333|
|        Tesla| 67054.34782608696|
| Aston Martin|           55500.0|
|       Fisker| 46461.11111111111|
|     Maserati| 43729.81651376147|
|        Lotus|           40800.0|
|      Porsche|38932.109766637856|
|   Land Rover| 33225.28744326778|
|          Ram|25257.458209693374|
|Mercedes-Benz|21314.872215876643|
|          BMW| 21290.04386315609|
|     Infiniti| 20562.79630290486|
|        Lexus|20284.201652416177|
|         Audi|20010.167638483967|
|       Jaguar|19429.853619729514|
|      Lincoln| 17547.19143321153|
|          GMC| 16769.46783118151|
+-------------+------------------+
only showing top 20 rows



In [46]:
spark.sql("select model, avg(sellingprice) from df group by model order by avg(sellingprice) desc").show(5)

+----------+------------------+
|     model| avg(sellingprice)|
+----------+------------------+
|458 Italia|          183000.0|
|SLS AMG GT|          156500.0|
|        i8|154222.22222222222|
|     Ghost|         153456.25|
|California|131846.15384615384|
+----------+------------------+
only showing top 5 rows



In [39]:
spark.sql("select state, avg(sellingprice) from df group by state order by avg(sellingprice) desc").show()

+-----+------------------+
|state| avg(sellingprice)|
+-----+------------------+
|   tn| 17285.15289579315|
|   co|15745.498405103668|
|   nv|15576.319380243109|
|   il|15402.065854339131|
|   mi|15182.068946069792|
|   ca|14895.258650114027|
|   mo|14694.777770508963|
|   oh|14459.130974672316|
|   fl|14331.417952500566|
|   wa|14327.867798060468|
|   mn| 14313.54584444187|
|   wi|14105.263810352328|
|   pa|14007.944769249907|
|   nj|13621.127896361344|
|   tx|13574.509302158101|
|   ne|13242.496607869742|
|   ga|12840.065257442064|
|   ny|12589.252684775742|
|   ut|12491.601981351982|
|   hi|12366.514522821577|
+-----+------------------+
only showing top 20 rows



In [40]:
spark.sql("select saledate_month, avg(sellingprice) from df group by saledate_month order by avg(sellingprice) desc").show()

+--------------+------------------+
|saledate_month| avg(sellingprice)|
+--------------+------------------+
|           Jul|16850.777677767775|
|           Jun|14806.526600477684|
|           May|13970.633011755539|
|           Feb|13613.058065603946|
|           Mar|13477.563981276176|
|           Jan|13272.918895542796|
|           Dec|12697.170355191256|
|           Apr|  10169.2455858748|
+--------------+------------------+



In [41]:
spark.sql("select color, avg(sellingprice) from df group by color order by avg(sellingprice) desc").show()

+---------+------------------+
|    color| avg(sellingprice)|
+---------+------------------+
|        —|17824.400217509516|
| charcoal|16256.062937062938|
|    brown|15531.780288204945|
|    black|15474.054737519438|
|off-white|15374.599678456592|
|    white|14863.357243713299|
|     gray|13956.201591098907|
|   orange|13102.091888825866|
|      red|12897.722466252411|
|   yellow|12781.816496756255|
|     lime|           12737.5|
|     pink|12414.102564102564|
|   purple|12278.033858267716|
| burgundy|12117.694411571334|
|   silver| 11846.75045961687|
|     blue|11549.820111472363|
|    beige|  9523.64226222461|
|turquoise| 8816.326315789474|
|    green| 8800.623889971843|
|     gold| 8621.803513541776|
+---------+------------------+



In [42]:
spark.sql("select transmission, avg(sellingprice) from df group by transmission order by avg(sellingprice) desc").show()

+------------+------------------+
|transmission| avg(sellingprice)|
+------------+------------------+
|   automatic|13774.395838359205|
|      manual| 11349.72130546388|
+------------+------------------+



In [30]:
df.corr('odometer','sellingprice')
#as odometer increases, the selling price goes down

-0.5773849819198499

In [32]:
df.corr('year','sellingprice')

0.578918172049782

In [38]:
df.corr('saledate_year','sellingprice')

0.029529080806824714

In [35]:
df.corr('condition','sellingprice')

0.5359901118534247