In [0]:
# Load the CO2 emissions CSV, taking column names from the header row and
# letting Spark infer each column's type.
co2 = (
    spark.read.format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/FileStore/datasets/co2.csv')
)

# Render the loaded DataFrame in the notebook.
display(co2)

Make,Model,Vehicle Class,Engine Size,Cylinders,Transmission,Fuel Type,Fuel Consumption City,Fuel Consumption Hwy,Fuel Consumption Comb (L/100 km),Fuel Consumption Comb (mpg),CO2 Emissions(g/km)
ACURA,ILX,COMPACT,2.0,4,AS5,Z,9.9,6.7,8.5,33,196
ACURA,ILX,COMPACT,2.4,4,M6,Z,11.2,7.7,9.6,29,221
ACURA,ILX HYBRID,COMPACT,1.5,4,AV7,Z,6.0,5.8,5.9,48,136
ACURA,MDX 4WD,SUV - SMALL,3.5,6,AS6,Z,12.7,9.1,11.1,25,255
ACURA,RDX AWD,SUV - SMALL,3.5,6,AS6,Z,12.1,8.7,10.6,27,244
ACURA,RLX,MID-SIZE,3.5,6,AS6,Z,11.9,7.7,10.0,28,230
ACURA,TL,MID-SIZE,3.5,6,AS6,Z,11.8,8.1,10.1,28,232
ACURA,TL AWD,MID-SIZE,3.7,6,AS6,Z,12.8,9.0,11.1,25,255
ACURA,TL AWD,MID-SIZE,3.7,6,M6,Z,13.4,9.5,11.6,24,267
ACURA,TSX,COMPACT,2.4,4,AS5,Z,10.6,7.5,9.2,31,212


In [0]:
# Row count of the freshly loaded DataFrame, before any cleaning.
co2.count()

In [0]:
# Discard every row that has a null in any column (note: this rebinds `co2`),
# then count again so the effect of the cleaning step is visible.
co2 = co2.dropna()

co2.count()

In [0]:
# Show only the CO2 emissions column (g/km), which is used as the label later.
display(co2.select('CO2 Emissions(g/km)'))

CO2 Emissions(g/km)
196
221
136
255
244
230
232
255
267
212


Output can only be rendered in Databricks

In [0]:
# Vehicle class next to CO2 emissions, for eyeballing how they relate.
display(co2.select("Vehicle Class", "CO2 Emissions(g/km)"))

Vehicle Class,CO2 Emissions(g/km)
COMPACT,196
COMPACT,221
COMPACT,136
SUV - SMALL,255
SUV - SMALL,244
MID-SIZE,230
MID-SIZE,232
MID-SIZE,255
MID-SIZE,267
COMPACT,212


Output can only be rendered in Databricks

In [0]:
# Fuel type next to CO2 emissions, for eyeballing how they relate.
display(co2.select("Fuel Type", "CO2 Emissions(g/km)"))

Fuel Type,CO2 Emissions(g/km)
Z,196
Z,221
Z,136
Z,255
Z,244
Z,230
Z,232
Z,255
Z,267
Z,212


Output can only be rendered in Databricks

In [0]:
# Engine size next to CO2 emissions, for eyeballing how they relate.
display(co2.select("Engine Size", "CO2 Emissions(g/km)"))

Engine Size,CO2 Emissions(g/km)
2.0,196
2.4,221
1.5,136
3.5,255
3.5,244
3.5,230
3.5,232
3.7,255
3.7,267
2.4,212


Output can only be rendered in Databricks

##### Feature Engineering

In [0]:
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder

In [0]:
# Ordered list of pipeline stages; the following cells append to it.
stages = []

# Columns holding string categories; each one is indexed and one-hot encoded
# before being assembled into the feature vector.
categoricalColumns = [
    'Make',
    'Model',
    'Vehicle Class',
    'Transmission',
    'Fuel Type',
]

In [0]:
# Give every categorical column a StringIndexer followed by a OneHotEncoder.
# NOTE: the stages are appended to the existing `stages` list in place, so
# running this cell again without first resetting `stages` adds them twice.
for categoricalCol in categoricalColumns:
    # Map the raw string values to numeric indices in `<column>Index`.
    stringIndexer = StringIndexer(
        inputCol=categoricalCol,
        outputCol=categoricalCol + 'Index',
    )

    # One-hot encode the index column into `<column>classVec`.
    encoder = OneHotEncoder(
        inputCols=[stringIndexer.getOutputCol()],
        outputCols=[categoricalCol + "classVec"],
    )

    stages.extend((stringIndexer, encoder))

In [0]:
# Numeric columns that enter the feature vector as-is.
numericCols = ['Engine Size', 'Cylinders', 'Fuel Consumption City', 'Fuel Consumption Hwy']

# Feature order: the one-hot vectors of the categorical columns first,
# followed by the numeric columns.
assemblerInputs = [name + "classVec" for name in categoricalColumns]
assemblerInputs.extend(numericCols)

# Concatenate all inputs into a single vector column named "features".
assembler = VectorAssembler(outputCol="features", inputCols=assemblerInputs)

# The assembler runs after the indexers/encoders already in `stages`.
stages.append(assembler)

In [0]:
# Display the assembled list of pipeline stages as a sanity check.
stages

In [0]:
from pyspark.ml import Pipeline

# Chain the indexers, encoders and assembler into one pipeline.
pipeline = Pipeline(stages = stages)

# Fit the pipeline on `co2`.
# NOTE(review): this fit happens before the train/test split below, so the
# fitted indexers/encoders have seen the rows that later land in the test set.
pipelineModel = pipeline.fit(co2)

In [0]:
# Run the fitted pipeline over the data to add the "features" vector column.
co2_transformed = pipelineModel.transform(co2)

# Show the assembled feature vector next to the label column.
display(co2_transformed.select('features', 'CO2 Emissions(g/km)'))

features,CO2 Emissions(g/km)
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2130, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 9.9, 6.7))",196
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2110, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 11.2, 7.7))",221
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 1305, 2095, 2120, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 1.5, 4.0, 6.0, 5.8))",136
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 1903, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.7, 9.1))",255
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.1, 8.7))",244
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 931, 2094, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.9, 7.7))",230
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 2019, 2094, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.8, 8.1))",232
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 1519, 2094, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.7, 6.0, 12.8, 9.0))",255
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 1519, 2094, 2110, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.7, 6.0, 13.4, 9.5))",267
"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 960, 2095, 2130, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 10.6, 7.5))",212


In [0]:
# Randomly split the transformed rows roughly 70/30 into training and test
# sets; the fixed seed is passed so the split can be repeated.
co2_train, co2_test = co2_transformed.randomSplit([0.7, 0.3], seed = 0)

In [0]:
# Count both splits, then report their sizes.
train_rows = co2_train.count()
test_rows = co2_test.count()

print("Training Dataset Count: " + str(train_rows))

print("Test Dataset Count: " + str(test_rows))

In [0]:
from pyspark.ml.regression import RandomForestRegressor

# Random-forest regressor for the CO2 emissions label: 5 trees, each trained
# on an 80% subsample of the training rows. It reads the default "features"
# column produced by the pipeline.
# The seed is pinned so the row subsampling (and therefore the fitted forest
# and its R2) does not depend on the library's default seed, matching the
# seeded train/test split.
rf = RandomForestRegressor(
    labelCol="CO2 Emissions(g/km)",
    subsamplingRate=0.8,
    numTrees=5,
    seed=0,
)

In [0]:
# Train the random forest on the training split.
rfModel = rf.fit(co2_train)

In [0]:
# Report the total node count across all trees of the fitted forest.
print("Total numNodes = ", rfModel.totalNumNodes)

In [0]:
# Number of trees in the fitted forest.
len(rfModel.trees)

In [0]:
# Predict emissions for the held-out test rows with the fitted forest.
predictions = rfModel.transform(co2_test)

# Show each prediction next to the true label and the input feature vector.
display(predictions.select("prediction", "CO2 Emissions(g/km)", "features"))

prediction,CO2 Emissions(g/km),features
194.8664174848257,191,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2130, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 9.7, 6.7))"
194.8664174848257,192,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2126, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 9.4, 6.8))"
215.5474616186993,221,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2110, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 11.2, 7.7))"
261.60487297056113,259,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 373, 2093, 2124, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.6, 9.0))"
261.60487297056113,254,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 373, 2093, 2124, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.7, 9.1))"
244.0885681136373,261,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 910, 2100, 2133, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.1, 10.8))"
244.0885681136373,261,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 910, 2100, 2133, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.1, 10.8))"
234.3379044951624,232,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2118, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 11.0, 8.6))"
253.70634105981827,244,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.1, 8.7))"
253.70634105981827,250,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.4, 8.7))"


In [0]:
# Score the current `predictions` DataFrame (the random-forest output when the
# cells run in order) against the true emissions column.

from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="CO2 Emissions(g/km)",
)

# The param map overrides the evaluator's metric for this call so that it
# reports R^2.
r2 = evaluator.evaluate(predictions, params={evaluator.metricName: "r2"})

In [0]:
# Print the R2 score computed in the previous cell.
print("R2 on test data = %g" % r2)

##### Gradient-Boosted Tree Regressor

In [0]:
from pyspark.ml.regression import GBTRegressor

# Gradient-boosted tree regressor on the same label, limited to 50 boosting
# iterations.
gbt = GBTRegressor(
    maxIter=50,
    labelCol="CO2 Emissions(g/km)",
)

In [0]:
# Train the gradient-boosted trees on the training split.
gbtModel = gbt.fit(co2_train)

In [0]:
# Predict emissions for the test rows with the boosted model. This rebinds
# `predictions`, replacing the random-forest output.
predictions = gbtModel.transform(co2_test)

# Show each prediction next to the true label and the input feature vector.
display(predictions.select("prediction", "CO2 Emissions(g/km)", "features"))

prediction,CO2 Emissions(g/km),features
192.68923317084864,191,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2130, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 9.7, 6.7))"
191.9591163782553,192,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2126, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 9.4, 6.8))"
220.74437267897605,221,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 198, 2095, 2110, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.4, 4.0, 11.2, 7.7))"
256.8393339620507,259,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 373, 2093, 2124, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.6, 9.0))"
257.522254538986,254,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 373, 2093, 2124, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.7, 9.1))"
253.0167917280285,261,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 910, 2100, 2133, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.1, 10.8))"
253.0167917280285,261,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 910, 2100, 2133, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 11.1, 10.8))"
229.84147179516145,232,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2118, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 4.0, 11.0, 8.6))"
248.8740379565824,244,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.1, 8.7))"
248.8740379565824,250,"Map(vectorType -> sparse, length -> 2142, indices -> List(30, 296, 2093, 2108, 2135, 2138, 2139, 2140, 2141), values -> List(1.0, 1.0, 1.0, 1.0, 1.0, 3.5, 6.0, 12.4, 8.7))"


In [0]:
# Score the current `predictions` DataFrame (the boosted-tree output when the
# cells run in order) against the true emissions column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="CO2 Emissions(g/km)",
)

# The param map overrides the evaluator's metric for this call so that it
# reports R^2.
r2 = evaluator.evaluate(predictions, params={evaluator.metricName: "r2"})

print("R2 on test data = %g" % r2)

In [0]:
# Inspect the residuals of the current `predictions` (the boosted-tree output
# when the cells run in order).
import pyspark.sql.functions as F

# residual = actual emissions - predicted emissions, one value per test row.
predictions_with_residuals = predictions.withColumn(
    "residual",
    F.col("CO2 Emissions(g/km)") - F.col("prediction"),
)

# Mean residual over all test rows.
mean_residual = predictions_with_residuals.agg({'residual': 'mean'})
display(mean_residual)

avg(residual)
-0.1860782076269928


In [0]:
# Show each row's residual together with the vehicle make.
display(predictions_with_residuals.select("Make", "residual"))

Make,residual
ACURA,-1.689233170848638
ACURA,0.0408836217447401
ACURA,0.255627321023951
ACURA,2.160666037949284
ACURA,-3.522254538985976
ACURA,7.983208271971478
ACURA,7.983208271971478
ACURA,2.1585282048385466
ACURA,-4.874037956582356
ACURA,1.125962043417644


Output can only be rendered in Databricks

In [0]:
# Show the residual column on its own.
display(predictions_with_residuals.select("residual"))

residual
-1.689233170848638
0.0408836217447401
0.255627321023951
2.160666037949284
-3.522254538985976
7.983208271971478
7.983208271971478
2.1585282048385466
-4.874037956582356
1.125962043417644


Output can only be rendered in Databricks