# In [3]:
from datetime import datetime
from pyspark.sql.functions import desc, explode, col
from pyspark import SparkConf, SparkContext
from pytz import timezone

# Load every JSON document under the 2017 prefix of the bucket and mark the
# resulting DataFrame as cached.
ALL_MODELS = spark.read.json("s3a://edmundsvehicle/2017/*/*/*/*")
ALL_MODELS.cache()


def _lift_struct_fields(frame, renames):
    """Copy fields of the exploded ``col`` struct into top-level columns.

    ``renames`` is a sequence of ``(new_column, struct_field)`` pairs; they are
    applied in order and the extended DataFrame is returned.
    """
    for new_column, struct_field in renames:
        frame = frame.withColumn(new_column, frame['col'].getField(struct_field))
    return frame


# One row per (model id, element of that model's ``years`` array).
ALL_YEARS = _lift_struct_fields(
    ALL_MODELS.select(ALL_MODELS['id'], explode(ALL_MODELS['years'])),
    [
        ("year_id", "id"),
        ("year", "year"),
        ("styles", "styles"),
    ],
)

# One row per (model id, element of a year's ``styles`` array).
ALL_STYLES = _lift_struct_fields(
    ALL_YEARS.select(ALL_YEARS['id'], explode(ALL_YEARS['styles'])),
    [
        ("trim_id", "id"),
        ("name", "name"),
        ("submodel", "submodel"),
        ("trim", "trim"),
    ],
)

# Expose the three DataFrames to Spark SQL under their Python names.
for _frame, _table_name in (
    (ALL_MODELS, "ALL_MODELS"),
    (ALL_YEARS, "ALL_YEARS"),
    (ALL_STYLES, "ALL_STYLES"),
):
    sqlCtx.registerDataFrameAsTable(_frame, _table_name)

APP_NAME = "Top 20 Vehicle Models by Years"

# Joins each model row to its exploded year rows on the model id.
_YEARS_SQL = """SELECT m.id,
                     m.name,
                     y.year,
                     y.year_id
                    from ALL_MODELS m
                    INNER JOIN ALL_YEARS y on y.id = m.id
                    """

# Joins each model row to its exploded style rows on the model id and pulls
# the body / modelName fields out of the style's ``submodel`` struct.
_STYLES_SQL = """SELECT m.id,
                     s.trim,
                     s.name as style_name,
                     submodel.body,
                    submodel.modelName as model_body_name
                    from ALL_MODELS m
                    INNER JOIN ALL_STYLES s on s.id = m.id
                    """

years_query = sqlCtx.sql(_YEARS_SQL)
styles_query = sqlCtx.sql(_STYLES_SQL)


def main():
    """Write body-style and per-model row counts to ``topVehicles.html``.

    Groups the module-level ``styles_query`` by ``body`` and ``years_query`` by
    ``id``, counts the rows in each group, sorts by count descending, converts
    both results to pandas and renders them as HTML tables. The file starts
    with a "Last Updated" timestamp in US/Pacific time and a short intro.

    Any exception is caught and printed instead of being propagated, so this
    function never raises to its caller.
    """
    # Local import: io.open accepts ``encoding`` on both Python 2 and 3.
    import io

    try:
        # Number of year rows per model id, largest first.
        top_models = (
            years_query.groupBy(years_query['id'])
            .count()
            .sort(col("count").desc())
            .toPandas()
        )

        # Number of style rows per body style, largest first.
        body_style_counts = (
            styles_query.groupBy(styles_query['body'])
            .count()
            .sort(col("count").desc())
            .toPandas()
        )

        # Timestamp shown on the page, expressed in Pacific time.
        pacific_time = datetime.now(timezone('US/Pacific'))
        top_html = body_style_counts.to_html() + top_models.to_html()

        # Keep the markup as text. Formatting ``top_html.encode('utf-8')``
        # would embed the bytes repr (b'...') in the page on Python 3; the
        # UTF-8 encoding is applied once, by the file object, on write.
        html = u"<!DOCTYPE html><html><body>{}</body></html>".format(top_html)

        # ``with`` guarantees the handle is closed even if a write fails.
        # NOTE(review): the timestamp and intro are written before the
        # <!DOCTYPE html> line, exactly as before; consider moving them inside
        # <body> if strictly valid HTML is required.
        with io.open("topVehicles.html", 'w', encoding='utf-8') as results:
            results.write(
                u"Last Updated: "
                + pacific_time.strftime('%A, %B, %d %Y %H:%M:%S')
                + u" Pacific Time")
            results.write(
                u"<hr>Hi!  This webpage shows counts of body styles in the database and the number of years for each "
                u"model.<br>It serves as the front end of my Edmunds car data pipeline.  - Justin J. Wang<hr>")
            results.write(html)

    except Exception as e:
        # Best-effort job: report the failure and return normally.
        print(e)

        
if __name__ == "__main__":
    # Configure spark
    # NOTE(review): this SparkConf is built but never handed to a SparkContext
    # in the visible code (the ``spark`` session is already in use above), so
    # it currently has no effect. It is kept so the ``conf`` name still exists.
    conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")

    # Run the report only when executed as a script / notebook cell, not when
    # this module is imported.
    main()