In [2]:
import os
from getpass import getpass

import pyspark
from pyspark.sql import SparkSession

# NOTE(review): nothing visible in this notebook reads `bucket`; the CSV loaded
# further down comes from a different bucket name — confirm whether it is still needed.
bucket = "e-drill"

# Credentials are taken from the environment so the secret is never stored in
# the notebook. MINIO_ACCESS_KEY falls back to the previous access key value;
# MINIO_SECRET_KEY has no default and is prompted for (without echo) when unset.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY") or getpass("MinIO secret key: ")

# Local Spark session whose s3a:// filesystem is pointed at the MinIO endpoint.
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    # Pulls in the S3A connector; presumably must match the Hadoop build Spark uses — verify.
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.1.2")
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.fast.upload", True)
    .config("spark.hadoop.fs.s3a.path.style.access", True)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("ERROR") # Keeps the noise down!!!

In [15]:
# Read the Syracuse weather CSV from object storage (first row is the header,
# column types are inferred) and register it as the temp view `weather`.
(
    spark.read
    .csv("s3a://labe/syracuse-ny.csv", header=True, inferSchema=True)
    .createOrReplaceTempView("weather")
)

# Print the inferred schema of the view.
spark.sql("select * from weather").printSchema()

root
 |-- EST: string (nullable = true)
 |-- Max TemperatureF: integer (nullable = true)
 |-- Mean TemperatureF: integer (nullable = true)
 |-- Min TemperatureF: integer (nullable = true)
 |-- Max Dew PointF: integer (nullable = true)
 |-- MeanDew PointF: integer (nullable = true)
 |-- Min DewpointF: integer (nullable = true)
 |-- Max Humidity: integer (nullable = true)
 |-- Mean Humidity: integer (nullable = true)
 |-- Min Humidity: integer (nullable = true)
 |-- Max Sea Level PressureIn: double (nullable = true)
 |-- Mean Sea Level PressureIn: double (nullable = true)
 |-- Min Sea Level PressureIn: double (nullable = true)
 |-- Max VisibilityMiles: integer (nullable = true)
 |-- Mean VisibilityMiles: integer (nullable = true)
 |-- Min VisibilityMiles: integer (nullable = true)
 |-- Max Wind SpeedMPH: integer (nullable = true)
 |-- Mean Wind SpeedMPH: integer (nullable = true)
 |-- Max Gust SpeedMPH: integer (nullable = true)
 |-- PrecipitationIn: string (nullable = true)
 |-- CloudCo

In [13]:
# Average the Min/Max TemperatureF columns of the `weather` view per (year, month).
# The `source` CTE derives year and month by splitting EST on '-' and casting the
# first two pieces to int (presumably EST is a year-month-day string — confirm
# against the CSV), and renames the two temperature columns to mintemp / maxtemp.
# The outer select groups by year and month and orders the result the same way.
query = '''
with source as(
    select 
    cast(split(EST,'-')[0] as int) as year,
    cast(split(EST,'-')[1] as int)as month,
    `Min TemperatureF` as mintemp,
    `Max TemperatureF` as maxtemp
    from weather
    )
select 
    year, month, avg(mintemp) as avgmin, 
    avg(maxtemp) as avgmax
    from source
    group by year, month
    order by year, month
'''
# Run the query and print the resulting rows.
spark.sql(query).show()



+----+-----+------------------+------------------+
|year|month|            avgmin|            avgmax|
+----+-----+------------------+------------------+
|1997|    1|15.774193548387096| 31.64516129032258|
|1997|    2|22.607142857142858|37.785714285714285|
|1997|    3|25.032258064516128| 41.12903225806452|
|1997|    4| 34.43333333333333|              54.1|
|1997|    5|43.096774193548384| 61.58064516129032|
|1997|    6|              57.8|              78.4|
|1997|    7| 59.87096774193548| 80.19354838709677|
|1997|    8| 58.70967741935484| 78.38709677419355|
|1997|    9| 51.06666666666667|              69.4|
|1997|   10|38.935483870967744| 59.16129032258065|
|1997|   11|31.466666666666665|42.666666666666664|
|1997|   12|25.032258064516128|35.516129032258064|
|1998|    1|23.258064516129032| 35.41935483870968|
|1998|    2|24.142857142857142|              38.0|
|1998|    3|30.129032258064516|45.064516129032256|
|1998|    4|              38.0| 58.46666666666667|
|1998|    5| 52.45161290322581|

                                                                                

In [20]:
# Same per-(year, month) averaging query as the previous cell, repeated here so
# this cell can be run on its own. The `source` CTE splits EST on '-' to get
# year and month as ints and renames the min/max temperature columns.
query = '''
with source as(
    select 
    cast(split(EST,'-')[0] as int) as year,
    cast(split(EST,'-')[1] as int)as month,
    `Min TemperatureF` as mintemp,
    `Max TemperatureF` as maxtemp
    from weather
    )
select 
    year, month, avg(mintemp) as avgmin, 
    avg(maxtemp) as avgmax
    from source
    group by year, month
    order by year, month
'''
# Register the aggregated result as a temp view so later cells can filter it by month.
spark.sql(query).createOrReplaceTempView("monthly_syracuse_weather_averages")

# Sanity check: show the July rows. Plain string literal — there is nothing to
# interpolate, so no f-prefix is needed.
spark.sql("select * from monthly_syracuse_weather_averages where month =7").show()



+----+-----+------------------+-----------------+
|year|month|            avgmin|           avgmax|
+----+-----+------------------+-----------------+
|1997|    7| 59.87096774193548|80.19354838709677|
|1998|    7| 61.29032258064516|79.03225806451613|
|1999|    7|  64.3225806451613|85.74193548387096|
|2000|    7|57.774193548387096|76.51612903225806|
|2001|    7|              59.0|79.87096774193549|
|2002|    7| 63.61290322580645|84.16129032258064|
|2003|    7|61.483870967741936|81.29032258064517|
|2004|    7|60.903225806451616|78.16129032258064|
|2005|    7| 64.19354838709677|85.12903225806451|
|2006|    7|              65.0|83.12903225806451|
|2007|    7|59.483870967741936|80.19354838709677|
|2008|    7| 61.41935483870968|81.19354838709677|
|2009|    7|58.935483870967744|77.45161290322581|
|2010|    7| 64.38709677419355|84.64516129032258|
|2011|    7| 64.64516129032258|86.93548387096774|
|2012|    7| 64.54838709677419|87.90322580645162|
|2013|    7| 65.06451612903226|83.83870967741936|


                                                                                

In [18]:
# Ask for a month number and show that month's averages for every year.
# The typed text is converted to int and range-checked before it is placed in
# the SQL string, so arbitrary input can never reach the query; int() raises
# ValueError on non-numeric text and tolerates surrounding whitespace.
month = int(input("Enter Month:"))
if not 1 <= month <= 12:
    raise ValueError(f"Month must be between 1 and 12, got {month}")
spark.sql(f"select * from monthly_syracuse_weather_averages where month = {month}").show()

Enter Month: 9




+----+-----+------------------+-----------------+
|year|month|            avgmin|           avgmax|
+----+-----+------------------+-----------------+
|1997|    9| 51.06666666666667|             69.4|
|1998|    9|              54.4|             73.8|
|1999|    9|              54.8|75.76666666666667|
|2000|    9| 50.43333333333333|             71.2|
|2001|    9|52.166666666666664|             73.0|
|2002|    9| 56.06666666666667|             78.2|
|2003|    9|              53.6|             73.0|
|2004|    9| 55.43333333333333|74.96666666666667|
|2005|    9|              54.5|76.36666666666666|
|2006|    9|              52.0|             69.5|
|2007|    9|              53.8|76.86666666666666|
|2008|    9| 52.53333333333333|72.83333333333333|
|2009|    9|50.833333333333336|             71.8|
|2010|    9|53.766666666666666|             72.4|
|2011|    9|56.766666666666666|             75.5|
|2012|    9|52.733333333333334|74.53333333333333|
|2013|    9|              50.2|72.36666666666666|


                                                                                

In [26]:
from IPython.display import display,HTML
from ipywidgets import interact_manual
# Heading rendered above the widget; the <h1> element is closed with </h1>.
display(HTML("<h1>Pick a Month</h1>"))


# NOTE: a later cell defines another function named `doit`, which replaces this
# one in the kernel namespace once that cell runs.
@interact_manual(Month=(1,12))
def doit(Month):
    """Echo the month chosen in the widget.

    ``interact_manual`` builds an integer slider for ``Month`` (1 through 12)
    plus a run button, and calls this function with the slider value.

    Args:
        Month: Month number selected on the slider.
    """
    print(f"You selected: {Month}")

interactive(children=(IntSlider(value=6, description='Month', max=12, min=1), Button(description='Run Interact…

In [30]:
from IPython.display import display,HTML
from ipywidgets import interact_manual
import matplotlib.pyplot as plt
# Heading rendered above the widget; the <h1> element is closed with </h1>.
display(HTML("<h1>Syracuse Weather</h1>"))


@interact_manual(Month=(1,12))
def doit(Month):
    """Show and plot the yearly average min/max temperatures for one month.

    ``interact_manual`` builds an integer slider for ``Month`` (1 through 12)
    plus a run button, and calls this function with the slider value. The rows
    for that month are read from the ``monthly_syracuse_weather_averages`` temp
    view, displayed as a table, and drawn as two scatter series against year.

    Args:
        Month: Month number selected on the slider. It comes from an integer
            slider, so interpolating it into the SQL text is safe.
    """
    df = spark.sql(f"select * from monthly_syracuse_weather_averages where month = {Month}").toPandas()
    display(df)

    # "year" stays a regular column: it is used as the x values below.
    fig, ax = plt.subplots()
    ax.scatter(df["year"], df["avgmin"], label='monthly avg min')
    ax.scatter(df["year"], df["avgmax"], label='monthly avg max')
    ax.set_xlabel("year")
    ax.set_ylabel("average temperature (F)")
    ax.legend(title=f"Temps for Month {Month}")
    plt.show()
    

interactive(children=(IntSlider(value=6, description='Month', max=12, min=1), Button(description='Run Interact…