In [1]:
# IMPORT LIBRARIES
try:
    # PYSPARK
    from pyspark.sql import SparkSession
    from pyspark import SparkContext
    from pyspark.sql import SQLContext
    from pyspark.sql import DataFrame
    import pyspark.sql.types as tp
    import pyspark.sql.functions as F
    
    #Py Spark ML Libraries
    from pyspark.ml.classification import LogisticRegression
    from pyspark.ml.regression import LinearRegression
    from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler, StandardScaler
    from pyspark.ml import Pipeline
    from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
    
    # OTHER LIBRARIES
    import pandas as pd
    import numpy as np
    import glob
    from functools import reduce
    import seaborn as sns
    import matplotlib.pyplot as plt
    from urllib.request import urlopen
    import datetime
    from pathlib import Path
    
    print('[SUCCESS]')

    #CATCH ERROR IMPORTING A LIBRARY
except ImportError as ie:
    raise ImportError(f'[Error importing]: {ie}')

[SUCCESS]


In [2]:
'''
INITIALIZING SPARK SESSION
- NAME IS SET FOR SPARK SESSION WHEN RUNNING ON LOCAL HOST
'''
spark = SparkSession.builder.master('local').config("spark.executor.memory", "1g").config("spark.driver.memory", "2g").appName('UsedCar_Project').getOrCreate()
print('[SUCCESSFULLY RUNNING SPARK SESSION]')

[SUCCESSFULLY RUNNING SPARK SESSION]


In [3]:
def load_data():
    '''
    car_prices.csv is uploaded to a google bucket for public use. Since file is too large to push to GitHub for others to use from repo
    this function will load the data from the google bucket.
    
    You can run this function each time and it will not download a new dataset each time since the first time you run it, it will download locally to your directory.
    --- unless you delete it each time
    
    Function will check to make sure file is in the directory
    - if it is, load it to a spark dataframe
    - if it is not, download it, then load it to a spark dataframe
    
    SCHEMA:
    - Created a schema to make sure the data types for the file being read is kept
    
    
    Drop Randome Values in state columns
    
    
    WARNING: TO USE THIS FUNCTION, YOU HAVE TO BE RUNNING JUPYTER NOTEBOOK ON A LINUX SERVER (USE DOCKER)
    
    NOTES:
    option("header",True).option("inferSchema", True)
    '''
    
    # CHECKS TO SEE IF FILE EXIST
    path = Path('car_prices.csv') 
    
    # IF FILE DOES NOT EXIST
    if not path.is_file():
        !wget https://storage.googleapis.com/iamangelsh-public-datasets/car_prices.csv 
    
    
    
    # CREATE SCHEMA TO KEEP DATA TYPES
    schema = tp.StructType([tp.StructField('year', tp.IntegerType(), True),
                           tp.StructField('make', tp.StringType(), True),
                           tp.StructField('model', tp.StringType(), True),
                           tp.StructField('trim', tp.StringType(), True),
                           tp.StructField('body', tp.StringType(), True),
                           tp.StructField('transmission', tp.StringType(), True),
                           tp.StructField('vin', tp.StringType(), True),
                           tp.StructField('state', tp.StringType(), True),
                           tp.StructField('condition', tp.DoubleType(), True),
                           tp.StructField('odometer', tp.DoubleType(), True),
                           tp.StructField('color', tp.StringType(), True),
                           tp.StructField('interior', tp.StringType(), True),
                           tp.StructField('seller', tp.StringType(), True),
                           tp.StructField('mmr', tp.IntegerType(), True),
                           tp.StructField('sellingprice', tp.IntegerType(), True),
                           tp.StructField('saledate', tp.StringType(), True)])
    
    
    # LOAD IN DATA WITH SCHEMA
    df = spark.read.csv("car_prices.csv", header = True, sep=",", schema=schema)
    
    # FILTER OUT VIN NUMBERS FROM STATE COLUMN
    df = df.where(F.length(F.col("state")) <= 2)
    
    # DROP ROWS THAT CONTAIN NULL VALUES
    df = df.na.drop('any')
    
    # CREATE THRESHOLD FOR CONDITION COLUMN
    new_df = df.withColumn(
        'condition', 
        F.when(df.condition > 3.75, 'Great'
        ).when((df.condition >= 2) & (df.condition <= 3.75), 'Average'
        ).when(df.condition < 2, 'Bad'))
    
    # DROP COLUMNS THAT WON'T BE USED
    cols = ('trim', 'vin', 'interior', 'seller')
    new_df = new_df.drop(*cols)
    
    
    # USE MM DD YYYY FOR SALEDATE COLUMN
    new_df = df.withColumn(
        'saledate', F.substring('saledate', 5,11)
        ).withColumn(
        'saledate_year', F.substring('saledate', 7,5)
        ).withColumn(
        'saledate_month', F.substring('saledate', 1,3))
    new_df = new_df.withColumn(
        'saledate_year', F.col('saledate_year').cast(tp.IntegerType())
        )
    
    # RETURN NEW DATAFRAME
    return new_df


# LOAD THE DATA
df = load_data()

# SHOW DATA
df.show(5)

# SHOW NUMBER OF COLUMNS AND ROWS
print(f'Number of columns: {len(df.columns)} \nNumber of Rows: {df.count()}')
print()

# SHOW SCHEMA - DATATYPES
df.printSchema()

--2022-04-14 02:38:24--  https://storage.googleapis.com/iamangelsh-public-datasets/car_prices.csv
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.190.80, 142.250.191.112, 142.250.190.112, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.190.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 89088424 (85M) [application/vnd.ms-excel]
Saving to: ‘car_prices.csv’


2022-04-14 02:38:38 (6.36 MB/s) - ‘car_prices.csv’ saved [89088424/89088424]

+----+-----+-------------------+----------+-----+------------+-----------------+-----+---------+--------+-----+--------+--------------------+-----+------------+-----------+-------------+--------------+
|year| make|              model|      trim| body|transmission|              vin|state|condition|odometer|color|interior|              seller|  mmr|sellingprice|   saledate|saledate_year|saledate_month|
+----+-----+-------------------+----------+-----+------------+-----------------+--

In [47]:
####################################################
# Regression using gradient trees regressor
####################################################

# Try to predict selling price using gradient trees regressor
import pyspark.ml.feature as ft

features = ['year','condition']

In [48]:
# Collate the features together and use the ChiSqSelector to select only the top 2 most important features
featuresCreator = ft.VectorAssembler(inputCols=[col for col in features[1:]], outputCol='features')
selector = ft.ChiSqSelector(numTopFeatures=2, outputCol="selectedFeatures", labelCol='sellingprice')

In [49]:
# In order to predict the selling price, we will use the gradient boosted trees generator
import pyspark.ml.regression as reg
regressor = reg.GBTRegressor(maxIter=15, maxDepth=3, labelCol='sellingprice')

In [50]:
# Split data in train and test 
df_train, df_test = df.randomSplit([0.7,0.3], seed=200)

In [31]:
# We put it all together into a Pipeline
pipeline = Pipeline(stages=[featuresCreator, selector, regressor])
selling_price = pipeline.fit(df_train)

In [32]:
# Check if mour model performs well on testing data
import pyspark.ml.evaluation as ev

evaluator = ev.RegressionEvaluator(predictionCol="prediction", labelCol='sellingprice')

print(evaluator.evaluate(selling_price.transform(df_test), {evaluator.metricName: 'r2'}))
print(evaluator.evaluate(selling_price.transform(df_test), {evaluator.metricName: 'rmse'}))

0.2919344116563881
8092.992894809114


In [33]:
##########################################################
# Spark SQL
##########################################################

# Create temporary table
df.createOrReplaceTempView("usedCarsAuction")

In [38]:
# Display top 5 rows
spark.sql("select * from usedCarsAuction limit 5").collect()

[Row(year=2015, make='Kia', model='Sorento', trim='LX', body='SUV', transmission='automatic', vin='5xyktca69fg566472', state='ca', condition=5.0, odometer=16639.0, color='white', interior='black', seller='kia motors america, inc', mmr=20500, sellingprice=21500, saledate='Dec 16 2014', saledate_year=2014, saledate_month='Dec'),
 Row(year=2015, make='Kia', model='Sorento', trim='LX', body='SUV', transmission='automatic', vin='5xyktca69fg561319', state='ca', condition=5.0, odometer=9393.0, color='white', interior='beige', seller='kia motors america, inc', mmr=20800, sellingprice=21500, saledate='Dec 16 2014', saledate_year=2014, saledate_month='Dec'),
 Row(year=2014, make='BMW', model='3 Series', trim='328i SULEV', body='Sedan', transmission='automatic', vin='wba3c1c51ek116351', state='ca', condition=4.5, odometer=1331.0, color='gray', interior='black', seller='financial services remarketing (lease)', mmr=31900, sellingprice=30000, saledate='Jan 15 2015', saledate_year=2015, saledate_mont

In [43]:
# Display top 5 most expnisve cars
spark.sql("Select * from usedCarsAuction order by sellingprice desc limit 5").collect()

[Row(year=2014, make='Ford', model='Escape', trim='Titanium', body='SUV', transmission='automatic', vin='1fmcu9j98eua23833', state='mo', condition=4.3, odometer=27802.0, color='green', interior='tan', seller='ford-lincoln dealer program', mmr=22800, sellingprice=230000, saledate='Feb 25 2015', saledate_year=2015, saledate_month='Feb'),
 Row(year=2011, make='Ferrari', model='458 Italia', trim='Base', body='coupe', transmission='automatic', vin='zff67nfa1b0178698', state='fl', condition=4.6, odometer=12116.0, color='red', interior='black', seller='platinum motor cars', mmr=182000, sellingprice=183000, saledate='Jun 17 2015', saledate_year=2015, saledate_month='Jun'),
 Row(year=2015, make='Mercedes-Benz', model='S-Class', trim='S65 AMG', body='Sedan', transmission='automatic', vin='wddug7kb2fa102347', state='ca', condition=4.1, odometer=5277.0, color='white', interior='white', seller='mercedes-benz usa', mmr=170000, sellingprice=173000, saledate='May 21 2015', saledate_year=2015, saledate

In [44]:
# Display average selling price
spark.sql("Select avg(sellingprice) from usedCarsAuction").collect()

[Row(avg(sellingprice)=13690.403670268623)]

In [45]:
# Groups car by make
spark.sql("Select make from usedCarsAuction group by make").collect()

[Row(make='Volkswagen'),
 Row(make='Oldsmobile'),
 Row(make='Infiniti'),
 Row(make='Lexus'),
 Row(make='Jaguar'),
 Row(make='Saturn'),
 Row(make='FIAT'),
 Row(make='Maserati'),
 Row(make='Scion'),
 Row(make='Rolls-Royce'),
 Row(make='Jeep'),
 Row(make='Mitsubishi'),
 Row(make='Kia'),
 Row(make='Chevrolet'),
 Row(make='Volvo'),
 Row(make='Hyundai'),
 Row(make='Saab'),
 Row(make='Honda'),
 Row(make='MINI'),
 Row(make='Audi'),
 Row(make='Lamborghini'),
 Row(make='Ram'),
 Row(make='Cadillac'),
 Row(make='Isuzu'),
 Row(make='Plymouth'),
 Row(make='Pontiac'),
 Row(make='Geo'),
 Row(make='Land Rover'),
 Row(make='Mercedes-Benz'),
 Row(make='Mercury'),
 Row(make='Daewoo'),
 Row(make='Lincoln'),
 Row(make='Chrysler'),
 Row(make='Tesla'),
 Row(make='BMW'),
 Row(make='Suzuki'),
 Row(make='Acura'),
 Row(make='HUMMER'),
 Row(make='Ferrari'),
 Row(make='GMC'),
 Row(make='Buick'),
 Row(make='Porsche'),
 Row(make='Lotus'),
 Row(make='Fisker'),
 Row(make='smart'),
 Row(make='Bentley'),
 Row(make='Toyot

In [46]:
# Display average odometer reading
spark.sql("Select avg(odometer) from usedCarsAuction").collect()

[Row(avg(odometer)=66701.07000313337)]

In [52]:
# Display oldest vehicles
spark.sql("Select * from usedCarsAuction order by year limit 5").collect()

[Row(year=1990, make='Honda', model='Accord', trim='EX', body='Sedan', transmission='automatic', vin='jhmcb7661lc036504', state='tx', condition=2.0, odometer=19279.0, color='gray', interior='tan', seller='automotive remarketing inc', mmr=875, sellingprice=350, saledate='Dec 23 2014', saledate_year=2014, saledate_month='Dec'),
 Row(year=1990, make='Mercedes-Benz', model='300-Class', trim='300E', body='Sedan', transmission='automatic', vin='wdbea30d6lb200847', state='nv', condition=2.0, odometer=141799.0, color='white', interior='—', seller='automotive remarketing inc', mmr=425, sellingprice=300, saledate='Jan 22 2015', saledate_year=2015, saledate_month='Jan'),
 Row(year=1990, make='Toyota', model='Camry', trim='Deluxe', body='Sedan', transmission='automatic', vin='4t1sv21e0lu227097', state='ca', condition=2.0, odometer=122877.0, color='blue', interior='—', seller='charitable auto resource', mmr=700, sellingprice=400, saledate='Dec 30 2014', saledate_year=2014, saledate_month='Dec'),
 R