# Spark DataFrames
- look like pandas datframes
- share some of the same methods and syntax
- but they are 2 seperate types of objects

Create spark session

In [4]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

Create dataframes

convert pandas dataframe into a spark dataframe

create pandas dataframe

In [7]:
import pandas as pd 
import numpy as np

pd_df = pd.DataFrame([['r1ci', 'r1c2'],
                     ['r2c1', 'r2c2'],
                     ['r3c1', 'r3c2']],
                    index = [1,2,3],
                    columns = ['col1', 'col2'])

np.random.seed(456)

create spark dataframe from pandas dataframe

In [10]:
sp_df = spark.createDataFrame(pd_df)

sp_df

DataFrame[col1: string, col2: string]

In [11]:
pd_df

Unnamed: 0,col1,col2
1,r1ci,r1c2
2,r2c1,r2c2
3,r3c1,r3c2


- we do see the column names, bit we dont see the data why?
- because spark is lazy, in that it wont show us values until it has to
- to peek, use. show
- .show defaults to 20

In [12]:
sp_df.show(2)

+----+----+
|col1|col2|
+----+----+
|r1ci|r1c2|
|r2c1|r2c2|
+----+----+
only showing top 2 rows



let's use a dataset with more realistic looking data to explore

In [13]:
from pydataset import data

mpg_pd = data('mpg')
mpg_pd.head(5)

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [15]:
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



# Columns
- the followung will create a series from a pandas datafrme, but a column object from a spark dataframe
- A column object represents s vertical slice of a dataframe , but doe snot contsin the data itself
- you will use it to perform function on and referrence that column

In [16]:
mpg_pd.year

1      1999
2      1999
3      2008
4      2008
5      1999
       ... 
230    2008
231    2008
232    1999
233    1999
234    2008
Name: year, Length: 234, dtype: int64

In [18]:
# a column object is a slice of a dataframe but doesnt have the data itself
# like a bookmark for the data
mpg.year

Column<b'year'>

Select column

In [22]:
mpg.select(mpg.hwy, mpg.cty, mpg.model).show()

+---+---+------------------+
|hwy|cty|             model|
+---+---+------------------+
| 29| 18|                a4|
| 29| 21|                a4|
| 31| 20|                a4|
| 30| 21|                a4|
| 26| 16|                a4|
| 26| 18|                a4|
| 27| 18|                a4|
| 26| 18|        a4 quattro|
| 25| 16|        a4 quattro|
| 28| 20|        a4 quattro|
| 27| 19|        a4 quattro|
| 25| 15|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 17|        a4 quattro|
| 25| 15|        a4 quattro|
| 24| 15|        a6 quattro|
| 25| 17|        a6 quattro|
| 23| 16|        a6 quattro|
| 20| 14|c1500 suburban 2wd|
| 15| 11|c1500 suburban 2wd|
+---+---+------------------+
only showing top 20 rows



In [23]:
mpg.hwy + 1

Column<b'(hwy + 1)'>

In [24]:
mpg.select(mpg.hwy, mpg.hwy + 1).show(2)

+---+---------+
|hwy|(hwy + 1)|
+---+---------+
| 29|       30|
| 29|       30|
+---+---------+
only showing top 2 rows



Once we have a column object, we can use the .alias method to rename it

In [29]:
mpg.select(mpg.hwy.alias('highway_mileage'), 
           (mpg.hwy + 1).alias('hwy_mileage_plus1')).show(2)

+---------------+-----------------+
|highway_mileage|hwy_mileage_plus1|
+---------------+-----------------+
|             29|               30|
|             29|               30|
+---------------+-----------------+
only showing top 2 rows



We can alsdo store column object in variables and referrence them

In [30]:
col1 = mpg.hwy.alias("highway_mileage")
col2 = (mpg.hwy / 2).alias("highway_mileage_halved")
mpg.select(col1, col2).show(5)

+---------------+----------------------+
|highway_mileage|highway_mileage_halved|
+---------------+----------------------+
|             29|                  14.5|
|             29|                  14.5|
|             31|                  15.5|
|             30|                  15.0|
|             26|                  13.0|
+---------------+----------------------+
only showing top 5 rows



In addition to the syntax we've seen above, we can create columns with the col and expr functions from pyspark.sql.functions module

**col**

In [32]:
from pyspark.sql.functions import col, expr
col("hwy")

Column<b'hwy'>

In [34]:
col("class")

Column<b'class'>

The column object produced by the col function is the same as the previous column object we saw.

In [35]:
avg_column = (col('hwy') + col('cty')) / 2
avg_column

Column<b'((hwy + cty) / 2)'>

In [36]:
mpg.select(
col("hwy").alias("highway_mileage"),
mpg.cty.alias('city_mileage'),
avg_column.alias("avg_mileage"),
).show(5)

+---------------+------------+-----------+
|highway_mileage|city_mileage|avg_mileage|
+---------------+------------+-----------+
|             29|          18|       23.5|
|             29|          21|       25.0|
|             31|          20|       25.5|
|             30|          21|       25.5|
|             26|          16|       21.0|
+---------------+------------+-----------+
only showing top 5 rows



avg mileage is created by using the col function to produce pyspark Column objects and using the arithmetic operators to combine them.

**expr**

- Does everything col does and more
- Returns the same type of column object
- But alos allows us to express manipulations to the columns within the string that defines the column

In [37]:
mpg.select(
expr("hwy"),
expr("hwy + 1"),
expr("hwy AS highway_mileage"),
expr("hwy + 1 AS highway_incremented"),
).show(5)

+---+---------+---------------+-------------------+
|hwy|(hwy + 1)|highway_mileage|highway_incremented|
+---+---------+---------------+-------------------+
| 29|       30|             29|                 30|
| 29|       30|             29|                 30|
| 31|       32|             31|                 32|
| 30|       31|             30|                 31|
| 26|       27|             26|                 27|
+---+---------+---------------+-------------------+
only showing top 5 rows



Note that all the columns created below  are identical, and whoch syntax to use is merely a style choice

In [None]:
mpg.select(
mpg.hwy.alias("highway"),
col('hwy').aslias('highway'),
    expr('hwy').alias('highway'),
    expr('hw')

)

# Spark SQL
- spark sql allows us to write SQL queries against ourt spark dataframes
- we'll first *register* the tablke with spark with sp_df.createOrReplaceTempView('sp_df')

In [38]:
mpg.createOrReplaceTempView("mpg")

- Now we can write sql query against the mpg table

In [39]:
spark.sql(
"""
SELECT hwy, cty, (hwy + cty) / 2 AS avg
FROM mpg
"""
)

DataFrame[hwy: bigint, cty: bigint, avg: double]

- the resulting value is another dataframe
- to see the values, we have to...

In [40]:
spark.sql(
"""
SELECT hwy, cty, (hwy + cty) / 2 AS avg
FROM mpg
"""
).show()

+---+---+----+
|hwy|cty| avg|
+---+---+----+
| 29| 18|23.5|
| 29| 21|25.0|
| 31| 20|25.5|
| 30| 21|25.5|
| 26| 16|21.0|
| 26| 18|22.0|
| 27| 18|22.5|
| 26| 18|22.0|
| 25| 16|20.5|
| 28| 20|24.0|
| 27| 19|23.0|
| 25| 15|20.0|
| 25| 17|21.0|
| 25| 17|21.0|
| 25| 15|20.0|
| 24| 15|19.5|
| 25| 17|21.0|
| 23| 16|19.5|
| 20| 14|17.0|
| 15| 11|13.0|
+---+---+----+
only showing top 20 rows



# Type Casting
view column datatypes:

In [41]:
mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

In [42]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



to convert from one type to another use the ```.cast``` method on a column

In [43]:
mpg.select(mpg.hwy.cast("string")).printSchema()

root
 |-- hwy: string (nullable = true)



If value is not able to be converted, ti will be replace with null:

In [46]:
mpg.select((mpg.model), mpg.model.cast("int")).show(5)

+-----+-----+
|model|model|
+-----+-----+
|   a4| null|
|   a4| null|
|   a4| null|
|   a4| null|
|   a4| null|
+-----+-----+
only showing top 5 rows



# Basic built-in function
There are many other functions beyond cola dn expr within the pyspark.sql.functions module for operating on pyspark dataframe columns.
- ```concat```: to concatenate astring
- ```sum```: toi suma. group
- ```avg```: to find the average of a group
- ```min``` to find the minnimum
- ```max```: to find the maximum

**Note**: importing tyhe sium,min and max functions directly will override the buitl in sum, min and max functions. This means you will get an error if you try to suma. list of numbers, becaue sum will referrance pyspark function, which works with pyspark dataframe columns, while the relative built-in function works with lists of numbers

In [47]:
# Note pyspark avg and mean functions are aliases of each other

from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

Its very common to see something like

```import pyspark.sql.functions as F```

which will imprt all functions from the pyspark.sql.functions module`

In [49]:
mpg.select(
    (sum(mpg.hwy) / count(mpg.hwy)).aslias('average_1'),
    avg(mpg.hwy).alias("average_2"),
    min(mpg.hwy),
    max(mpg.hwy)

)

TypeError: 'Column' object is not callable

In [48]:
mpg.select(concat(mpg.manufacturer, mpg.model)).show(5)

+---------------------------+
|concat(manufacturer, model)|
+---------------------------+
|                     audia4|
|                     audia4|
|                     audia4|
|                     audia4|
|                     audia4|
+---------------------------+
only showing top 5 rows



In [51]:
from pyspark.sql.functions import lit
mpg.select(concat(mpg.cyl, lit("cyclinders"))).show(5)

+-----------------------+
|concat(cyl, cyclinders)|
+-----------------------+
|            4cyclinders|
|            4cyclinders|
|            4cyclinders|
|            4cyclinders|
|            6cyclinders|
+-----------------------+
only showing top 5 rows



In [52]:
mpg.filter(mpg.cyl == 4).where(mpg['class'] == "subcompact").show()

+------------+-----------+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|      model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+-----------+-----+----+---+----------+---+---+---+---+----------+
|       honda|      civic|  1.6|1999|  4|manual(m5)|  f| 28| 33|  r|subcompact|
|       honda|      civic|  1.6|1999|  4|  auto(l4)|  f| 24| 32|  r|subcompact|
|       honda|      civic|  1.6|1999|  4|manual(m5)|  f| 25| 32|  r|subcompact|
|       honda|      civic|  1.6|1999|  4|manual(m5)|  f| 23| 29|  p|subcompact|
|       honda|      civic|  1.6|1999|  4|  auto(l4)|  f| 24| 32|  r|subcompact|
|       honda|      civic|  1.8|2008|  4|manual(m5)|  f| 26| 34|  r|subcompact|
|       honda|      civic|  1.8|2008|  4|  auto(l5)|  f| 25| 36|  r|subcompact|
|       honda|      civic|  1.8|2008|  4|  auto(l5)|  f| 24| 36|  c|subcompact|
|       honda|      civic|  2.0|2008|  4|manual(m6)|  f| 21| 29|  p|subcompact|
|     hyundai|    tiburon|  2.0|1999|  4

# conditional assigning of values

- specify a condition, and a value to prpduce if that condition is true

In [53]:
from pyspark.sql.functions import when

mpg.select(mpg.hwy, when(mpg.hwy > 25, "good_mileage").alias("mpg_desc")).show(12)

+---+------------+
|hwy|    mpg_desc|
+---+------------+
| 29|good_mileage|
| 29|good_mileage|
| 31|good_mileage|
| 30|good_mileage|
| 26|good_mileage|
| 26|good_mileage|
| 27|good_mileage|
| 26|good_mileage|
| 25|        null|
| 28|good_mileage|
| 27|good_mileage|
| 25|        null|
+---+------------+
only showing top 12 rows



- if the condition we specified is false, null will be rpoduced
- Use the ```.otherwise``` to fill the null value

In [55]:
mpg.select(mpg.hwy, when(mpg.hwy > 25, "good_mileage")
           .otherwise("bad_mileage")
           .alias("mpg_desc")).show(12)

+---+------------+
|hwy|    mpg_desc|
+---+------------+
| 29|good_mileage|
| 29|good_mileage|
| 31|good_mileage|
| 30|good_mileage|
| 26|good_mileage|
| 26|good_mileage|
| 27|good_mileage|
| 26|good_mileage|
| 25| bad_mileage|
| 28|good_mileage|
| 27|good_mileage|
| 25| bad_mileage|
+---+------------+
only showing top 12 rows



- to specify miltiple confitions , wecan chain ```.when``` calls.
- the first condition that is met will be the value that is used
- if none. of the conditions are met the value specided in the .otherwise will be used (or null if you dont provide a .otherwise)

In [56]:
mpg.select(
mpg.displ,
(when(mpg.displ < 2, "small")
.when(mpg.displ < 3, "medium")
.otherwise("large")
.alias("engine_size")
),
).show(10)

+-----+-----------+
|displ|engine_size|
+-----+-----------+
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
|  2.0|     medium|
|  2.8|     medium|
|  2.8|     medium|
|  3.1|      large|
|  1.8|      small|
|  1.8|      small|
|  2.0|     medium|
+-----+-----------+
only showing top 10 rows



# sorting & ordering


In [57]:
mpg.sort(mpg.hwy).show(8)

+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|manufacturer|              model|displ|year|cyl|     trans|drv|cty|hwy| fl| class|
+------------+-------------------+-----+----+---+----------+---+---+---+---+------+
|        jeep| grand cherokee 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|       dodge|        durango 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|   suv|
|       dodge|ram 1500 pickup 4wd|  4.7|2008|  8|manual(m6)|  4|  9| 12|  e|pickup|
|       dodge|  dakota pickup 4wd|  4.7|2008|  8|  auto(l5)|  4|  9| 12|  e|pickup|
|        jeep| grand cherokee 4wd|  6.1|2008|  8|  auto(l5)|  4| 11| 14|  p|   suv|
|   chevrolet|    k1500 tahoe 4wd|  5.3|2008|  8|  auto(l4)|  4| 11| 14|  e|   suv|
|   chevrolet|    k1500 tahoe 4wd|  5.7|1999|  8|  auto(l4)|  4| 11| 15|  r|   suv|
+------------+-------------------+-----+----+---+----------+---+---+---+---+

In [59]:
from pyspark.sql.functions import asc, desc

mpg.sort(mpg.hwy.desc())
# is the same as
mpg.sort(col('hwy').desc())
# is the same as
mpg.sort(desc('hwy')).show(5)

+------------+----------+-----+----+---+----------+---+---+---+---+----------+
|manufacturer|     model|displ|year|cyl|     trans|drv|cty|hwy| fl|     class|
+------------+----------+-----+----+---+----------+---+---+---+---+----------+
|  volkswagen|new beetle|  1.9|1999|  4|manual(m5)|  f| 35| 44|  d|subcompact|
|  volkswagen|     jetta|  1.9|1999|  4|manual(m5)|  f| 33| 44|  d|   compact|
|  volkswagen|new beetle|  1.9|1999|  4|  auto(l4)|  f| 29| 41|  d|subcompact|
|      toyota|   corolla|  1.8|2008|  4|manual(m5)|  f| 28| 37|  r|   compact|
|       honda|     civic|  1.8|2008|  4|  auto(l5)|  f| 24| 36|  c|subcompact|
+------------+----------+-----+----+---+----------+---+---+---+---+----------+
only showing top 5 rows



In [60]:
mpg.sort(desc("class"), mpg.cyl.asc(), col("hwy").desc()).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-----+
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 20| 27|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 20| 26|  r|  suv|
|      subaru|      forester awd|  2.5|1999|  4|manual(m5)|  4| 18| 25|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|manual(m5)|  4| 19| 25|  p|  suv|
|      subaru|      forester awd|  2.5|1999|  4|  auto(l4)|  4| 18| 24|  r|  suv|
|      subaru|      forester awd|  2.5|2008|  4|  auto(l4)|  4| 18| 23|  p|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|manual(m5)|  4| 15| 20|  r|  suv|
|      toyota|       4runner 4wd|  2.7|1999|  4|  auto(l4)|  4| 16| 20|  r|  suv|
|        jeep|grand cherokee 4wd|  3.0|2008|  6|  auto(l5)|  4| 17| 22|  d|  suv|
|      toyota|  

In [61]:
mpg.groupBy(mpg.cyl)
mpg.groupBy(col("cyl"))
mpg.groupBy("cyl")

<pyspark.sql.group.GroupedData at 0x117baa410>

In [62]:
mpg.groupBy(mpg.cyl).agg(avg(mpg.cty), avg(mpg.hwy)).show()

+---+------------------+-----------------+
|cyl|          avg(cty)|         avg(hwy)|
+---+------------------+-----------------+
|  4|21.012345679012345|28.80246913580247|
|  8|12.571428571428571|17.62857142857143|
|  5|              20.5|            28.75|
|  6| 16.21518987341772|22.82278481012658|
+---+------------------+-----------------+



In [63]:
mpg.groupBy('cyl', 'class').agg(avg(mpg.cty), avg(mpg.hwy)).show()

+---+----------+------------------+------------------+
|cyl|     class|          avg(cty)|          avg(hwy)|
+---+----------+------------------+------------------+
|  6|   compact|16.923076923076923|25.307692307692307|
|  4|subcompact|22.857142857142858| 30.80952380952381|
|  4|       suv|              18.0|             23.75|
|  8|    pickup|              11.8|              15.8|
|  6|subcompact|              17.0|24.714285714285715|
|  6|   minivan|              15.6|              22.2|
|  6|       suv|              14.5|              18.5|
|  8|   midsize|              16.0|              24.0|
|  8|       suv|12.131578947368421|16.789473684210527|
|  6|    pickup|              14.5|              17.9|
|  4|   midsize|              20.5|           29.1875|
|  4|   minivan|              18.0|              24.0|
|  8|subcompact|              14.8|              21.6|
|  4|    pickup|              16.0|20.666666666666668|
|  8|   2seater|              15.4|              24.8|
|  4|   co

In [64]:
mpg.rollup("cyl").count().sort("cyl").show()

+----+-----+
| cyl|count|
+----+-----+
|null|  234|
|   4|   81|
|   5|    4|
|   6|   79|
|   8|   70|
+----+-----+



In [65]:
mpg.rollup('cyl').agg(expr("avg(hwy)")).sort("cyl").show()

mpg.rollup("cyl").agg(avg(mpg.hwy)).sort("cyl").show()

+----+-----------------+
| cyl|         avg(hwy)|
+----+-----------------+
|null|23.44017094017094|
|   4|28.80246913580247|
|   5|            28.75|
|   6|22.82278481012658|
|   8|17.62857142857143|
+----+-----------------+

+----+-----------------+
| cyl|         avg(hwy)|
+----+-----------------+
|null|23.44017094017094|
|   4|28.80246913580247|
|   5|            28.75|
|   6|22.82278481012658|
|   8|17.62857142857143|
+----+-----------------+



In [66]:
mpg.rollup("cyl", "class").mean('hwy').sort(col("cyl"), col("class")).show()

+----+----------+------------------+
| cyl|     class|          avg(hwy)|
+----+----------+------------------+
|null|      null| 23.44017094017094|
|   4|      null| 28.80246913580247|
|   4|   compact|          29.46875|
|   4|   midsize|           29.1875|
|   4|   minivan|              24.0|
|   4|    pickup|20.666666666666668|
|   4|subcompact| 30.80952380952381|
|   4|       suv|             23.75|
|   5|      null|             28.75|
|   5|   compact|              29.0|
|   5|subcompact|              28.5|
|   6|      null| 22.82278481012658|
|   6|   compact|25.307692307692307|
|   6|   midsize| 26.26086956521739|
|   6|   minivan|              22.2|
|   6|    pickup|              17.9|
|   6|subcompact|24.714285714285715|
|   6|       suv|              18.5|
|   8|      null| 17.62857142857143|
|   8|   2seater|              24.8|
+----+----------+------------------+
only showing top 20 rows



In [67]:
mpg.crosstab("class", "cyl").show()

+----------+---+---+---+---+
| class_cyl|  4|  5|  6|  8|
+----------+---+---+---+---+
|   midsize| 16|  0| 23|  2|
|subcompact| 21|  2|  7|  5|
|   2seater|  0|  0|  0|  5|
|    pickup|  3|  0| 10| 20|
|   minivan|  1|  0| 10|  0|
|       suv|  8|  0| 16| 38|
|   compact| 32|  2| 13|  0|
+----------+---+---+---+---+



In [68]:
mpg.groupBy("class").pivot("cyl").mean("hwy").sort(col("class")).show()

+----------+------------------+----+------------------+------------------+
|     class|                 4|   5|                 6|                 8|
+----------+------------------+----+------------------+------------------+
|   2seater|              null|null|              null|              24.8|
|   compact|          29.46875|29.0|25.307692307692307|              null|
|   midsize|           29.1875|null| 26.26086956521739|              24.0|
|   minivan|              24.0|null|              22.2|              null|
|    pickup|20.666666666666668|null|              17.9|              15.8|
|subcompact| 30.80952380952381|28.5|24.714285714285715|              21.6|
|       suv|             23.75|null|              18.5|16.789473684210527|
+----------+------------------+----+------------------+------------------+



In [69]:
mpg.groupBy("class", "cyl").mean("hwy").sort(col("class"), col("cyl")).show()

+----------+---+------------------+
|     class|cyl|          avg(hwy)|
+----------+---+------------------+
|   2seater|  8|              24.8|
|   compact|  4|          29.46875|
|   compact|  5|              29.0|
|   compact|  6|25.307692307692307|
|   midsize|  4|           29.1875|
|   midsize|  6| 26.26086956521739|
|   midsize|  8|              24.0|
|   minivan|  4|              24.0|
|   minivan|  6|              22.2|
|    pickup|  4|20.666666666666668|
|    pickup|  6|              17.9|
|    pickup|  8|              15.8|
|subcompact|  4| 30.80952380952381|
|subcompact|  5|              28.5|
|subcompact|  6|24.714285714285715|
|subcompact|  8|              21.6|
|       suv|  4|             23.75|
|       suv|  6|              18.5|
|       suv|  8|16.789473684210527|
+----------+---+------------------+

