# Q1 - Which states have more/less monitors? (Rank states!)

In [1]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('epa_hap_daily_summary-small.csv')
    l2 = lines.filter(lambda line: len(line) > 0)
    header = l2.first() #extract header
    #filtrar o header
    non_empty_lines = l2.filter(lambda row : row != header) 
    l3 = non_empty_lines.map(lambda line: line.split(','))
    logRows = l3.map(lambda arr: Row(state=arr[0],county_code=arr[1],site_num=arr[2], coord=arr[5]+arr[6]))
    logRowsDF = spark.createDataFrame(logRows)
    # logRowsDF.printSchema()
    #https://stackoverflow.com/questions/46421677/how-to-count-unique-id-after-groupby-in-pyspark
    #logRows2DF = logRowsDF.select("state","site_num").groupBy("state").agg(countDistinct("site_num").alias("count")).orderBy("count",ascending=False)
    logRows2DF = logRowsDF.select("state","coord").groupBy("state").agg(countDistinct("coord").alias("count")).orderBy("count",ascending=False)
    logRows2DF.show(100)

    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

+-----+-----+
|state|count|
+-----+-----+
|   06|  170|
|   48|  133|
|   27|   94|
|   26|   92|
|   39|   91|
|   36|   67|
|   45|   64|
|   30|   62|
|   42|   61|
|   12|   55|
|   18|   52|
|   08|   51|
|   37|   50|
|   17|   49|
|   53|   43|
|   22|   41|
|   04|   38|
|   20|   37|
|   13|   35|
|   21|   34|
|   41|   32|
|   01|   31|
|   47|   29|
|   55|   26|
|   34|   24|
|   50|   23|
|   40|   22|
|   28|   21|
|   23|   21|
|   25|   19|
|   51|   19|
|   29|   18|
|   35|   18|
|   19|   18|
|   80|   18|
|   16|   17|
|   33|   17|
|   24|   17|
|   09|   15|
|   44|   13|
|   02|   12|
|   49|   12|
|   05|   11|
|   54|   10|
|   32|    9|
|   56|    9|
|   38|    7|
|   46|    7|
|   72|    6|
|   31|    6|
|   10|    6|
|   78|    6|
|   11|    5|
|   15|    5|
+-----+-----+



# Q2 - Which counties have the best/worst air quality? (Rank counties considering pollutants’ level!)

In [2]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('epa_hap_daily_summary-small.csv')
    l2 = lines.filter(lambda line: len(line) > 0)
    header = l2.first() #extract header
    #filtrar o header
    non_empty_lines = l2.filter(lambda row : row != header) 
    l3 = non_empty_lines.map(lambda line: line.split(','))
    #logRows = l3.map(lambda arr: Row(StateCounty=arr[0]+ "|" + arr[1],site_num=arr[2], polutionAvg = float(arr[16])))
    logRows = l3.map(lambda arr: Row(State = arr[0], County=arr[1],site_num=arr[2], polutionAvg = float(arr[16])))
    logRowsDF = spark.createDataFrame(logRows)
    # logRowsDF.printSchema()
    logRows2DF = logRowsDF.select("State", "County","polutionAvg").groupBy("State", "County").avg("polutionAvg").orderBy("avg(polutionAvg)",ascending=False).select(col("State"),col("County"), col("avg(polutionAvg)").alias("Average Polution"))
    logRows2DF.show(1000)

    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

+-----+------+--------------------+
|State|County|    Average Polution|
+-----+------+--------------------+
|   47|   167|              2556.0|
|   36|   059|                19.0|
|   39|   029|   7.385690735785953|
|   30|   067|   5.611212121212121|
|   80|   006|           4.5121875|
|   37|   027|   4.116666666666667|
|   06|   031|  3.9843770491803276|
|   06|   039|              3.7393|
|   37|   069|  3.3499999999999996|
|   08|   059|                3.07|
|   26|   125|   2.888877848101266|
|   17|   097|   2.879328647058823|
|   12|   031|  2.7794603978494625|
|   25|   017|  2.6500000000000004|
|   20|   093|  2.3753333333333333|
|   42|   017|  2.3674999999999997|
|   06|   079|  2.3333333333333335|
|   37|   065|               2.325|
|   20|   145|  2.2941176470588234|
|   36|   119|            2.239375|
|   37|   101|               2.225|
|   09|   003|  2.0787055896226416|
|   37|   077|  2.0285714285714285|
|   53|   003|               2.025|
|   37|   061|              

# Q3 - Which states have the best/worst air quality in each year? (Rank states per year considering pollutants' levels!)

In [6]:
%%time
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext


try:
    epa_daily = sc.textFile('epa_hap_daily_summary-small.csv')

    non_empty_lines_epa_daily = epa_daily.filter( lambda line : len(line) > 0 )

    header_epa_daily = non_empty_lines_epa_daily.first() #extract header
    non_empty_lines_epa_daily = non_empty_lines_epa_daily.filter(lambda row : row != header_epa_daily)  

    epa_daily = non_empty_lines_epa_daily.map(lambda line: line.split(','))
    
    #cria tuplos (ano,(estado,ar))
    epa_daily=epa_daily.map(lambda coord: Row(year = coord[11].split('-')[0], state = coord[24], airq = float(coord[16])))
    
    epa_dailyDF = spark.createDataFrame(epa_daily)
    
    epa_dailyDF = epa_dailyDF.groupBy('year','state').agg(round(avg(col('airq')),6).alias('AveragePolution'))
    epa_dailyDF = epa_dailyDF.orderBy('year','AveragePolution')
    
    epa_dailyDF.show(100)
    
    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

+----+--------------------+---------------+
|year|               state|AveragePolution|
+----+--------------------+---------------+
|1990|           Wisconsin|            0.0|
|1990|      Virgin Islands|            0.0|
|1990|            Oklahoma|            0.0|
|1990|       West Virginia|            0.0|
|1990|              Hawaii|        1.97E-4|
|1990|              Nevada|        4.21E-4|
|1990|              Alaska|        4.42E-4|
|1990|        South Dakota|        5.71E-4|
|1990|          Washington|        5.97E-4|
|1990|             Wyoming|        6.05E-4|
|1990|                Utah|        7.97E-4|
|1990|          New Mexico|        8.22E-4|
|1990|              Oregon|         8.6E-4|
|1990|             Arizona|        8.62E-4|
|1990|               Maine|        9.79E-4|
|1990|            Colorado|       0.002162|
|1990|         Mississippi|       0.002667|
|1990|            Missouri|         0.0056|
|1990|            Michigan|        0.00656|
|1990|         Connecticut|     

# Q4 - For each state, what is the average distance (in km) of the monitors in that state to the state center? For simplicity, assume that 1 degree of latitude or logitude equals to 111 km. (Monitor dispersion per state!)

In [4]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
import math

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext
try:
    epa_daily = sc.textFile('epa_hap_daily_summary-small.csv')
    usa_state = sc.textFile('usa_states.csv')

    non_empty_lines_epa_daily = epa_daily.filter( lambda line : len(line) > 0 )
    non_empty_lines_usa_state = usa_state.filter( lambda line : len(line) > 0 )

    header_epa_daily = non_empty_lines_epa_daily.first() #extract header
    header_usa_state = non_empty_lines_usa_state.first() #extract header

    non_empty_lines_epa_daily = non_empty_lines_epa_daily.filter(lambda row : row != header_epa_daily)  
    non_empty_lines_usa_state = non_empty_lines_usa_state.filter(lambda row : row != header_usa_state)  

    epa_daily = non_empty_lines_epa_daily.map(lambda line: line.split(','))
    usa_state = non_empty_lines_usa_state.map(lambda line: line.split(','))
    
    table_epa_daily=epa_daily.map(lambda arr: Row(state=arr[24],lat=float(arr[5])*111, long=float(arr[6])*111))
    table_usa_state=usa_state.map(lambda arr: Row(state=arr[1],lat_center=(float(arr[2])*111+float(arr[3])*111)/2, long_center=(float(arr[4])*111+float(arr[5])*111)/2))
    
    table_epa_daily = spark.createDataFrame(table_epa_daily)
    table_usa_state = spark.createDataFrame(table_usa_state)
    
    table_epa_daily = table_epa_daily.distinct()
    
    table = table_epa_daily.join(table_usa_state,['state'])
    
    distance_monitor_state=table.select(col('state'),(((col('lat')-col('lat_center'))**2)+((col('long')-col('long_center'))**2)).alias('distancia'))

    avg_dist =distance_monitor_state.groupBy('state').agg(avg(sqrt(col("distancia"))).alias("avgDistance"))
    avg_dist=avg_dist.sort(col("avgDistance").desc(),col("state"))
    avg_dist.show()

    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

+------------+------------------+
|       state|       avgDistance|
+------------+------------------+
|    Virginia| 715.4329730260437|
|      Alaska| 603.6996422410685|
|       Texas| 512.1839891630145|
|     Vermont| 504.0632354850801|
|    Illinois| 440.8540143212859|
|South Dakota|365.84513743232594|
|     Florida| 336.5449146570827|
|  California| 328.2263813155371|
|    Michigan| 326.4116064851137|
|      Nevada|326.28118071973887|
|    Nebraska| 307.1411826055286|
|      Kansas|292.07968412967074|
|       Idaho| 289.6350732756307|
|     Montana| 286.8383352756939|
|    New York| 283.7273398637171|
|     Wyoming|283.64058633747493|
|      Oregon| 268.8538079232643|
|Pennsylvania| 251.4151763405716|
|North Dakota| 248.4219307326271|
|    Oklahoma| 236.8825743729829|
+------------+------------------+
only showing top 20 rows



# Q5 - How many sensors there are per quadrant (NW, NE, SE, SW) in each state? To answer this question, you should approximate each state’s area to a rectangle as defined in the file “usa_satates.csv”, and divide that area in 4 quadrants (NW, NE, SE, SW). (Count monitors per sate qudrant!)

In [8]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:

    ########## Ter USA States com latitude e longitude medias ########## 

    usaStates = sc.textFile('usa_states.csv')
    
    non_empty_linesStates = usaStates.filter( lambda line : len(line) > 0 )    
    header = non_empty_linesStates.first() #extract header
    #filtrar o header
    non_empty_linesStates = non_empty_linesStates.filter(lambda row : row != header)  
    wordsUSA = non_empty_linesStates.map( lambda line : line.split(','))

    linhasUSA = wordsUSA.map(lambda arr: Row(stateNameUSA=arr[1] ,avgLat = (float(arr[2])+float(arr[3]))/2 , avgLong = (float(arr[4])+float(arr[5]))/2))
    linhasUSADF = spark.createDataFrame(linhasUSA)
    linhasUSA2DF = linhasUSADF.select("stateNameUSA","avgLat", "avgLong")
    #linhasUSA2DF.show(1000)
    linhasUSA2DF.printSchema()

    ########## Ficheiro EPA ########## 

    epaFile = sc.textFile('epa_hap_daily_summary-small.csv')
    
    non_empty_linesEPA = epaFile.filter( lambda line : len(line) > 0 )    
    header = non_empty_linesEPA.first() #extract header
    #filtrar o header
    non_empty_linesEPA = non_empty_linesEPA.filter(lambda row : row != header)  
    wordsEPA = non_empty_linesEPA.map( lambda line : line.split(','))

    linhasEPA = wordsEPA.map(lambda arr: Row(stateName=arr[24] , lat = float(arr[5]) , long = float(arr[6])))
    linhasEPADF = spark.createDataFrame(linhasEPA)
    linhasEPADF.printSchema()

    linhasEPA2DF = linhasEPADF.distinct().join(linhasUSA2DF,linhasEPADF.stateName ==  linhasUSA2DF.stateNameUSA, "inner").orderBy("stateName").withColumn("Quadrant",\
                                                                                                                               when((col("lat") <= col("avgLat")) & (col("long") <= col("avgLong")),"NW").\
                                                                                                                                 when((col("lat") <= col("avgLat")) & (col("long") > col("avgLong")),"SW").\
                                                                                                                                   when((col("lat") > col("avgLat")) & (col("long") <= col("avgLong")),"NE").\
                                                                                                                                     when((col("lat") > col("avgLat")) & (col("long") > col("avgLong")),"SE").\
                                                                                                                                       otherwise("Unknown")).\
                                                                                                                                         drop("lat", "long", "stateNameUSA", "avgLat", "avgLong").\
                                                                                                                                           orderBy("stateName", "Quadrant").\
                                                                                                                                             select("stateName", "Quadrant").\
                                                                                                                                               groupBy("stateName", "Quadrant").\
                                                                                                                                                 count().\
                                                                                                                                                   orderBy("count", ascending=False)
    linhasEPA2DF.show(10000)

    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

root
 |-- stateNameUSA: string (nullable = true)
 |-- avgLat: double (nullable = true)
 |-- avgLong: double (nullable = true)

root
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- stateName: string (nullable = true)

+--------------+--------+-----+
|     stateName|Quadrant|count|
+--------------+--------+-----+
|    California|      NE|   84|
|      Michigan|      SW|   74|
|         Texas|      SW|   72|
|    California|      SW|   68|
|     Minnesota|      SW|   50|
|      New York|      SW|   43|
|          Ohio|      NW|   36|
|  Pennsylvania|      SW|   36|
|       Montana|      NW|   36|
|     Louisiana|      SW|   35|
|         Texas|      SE|   34|
|South Carolina|      NE|   33|
|      Illinois|      SE|   32|
|          Ohio|      SE|   30|
|       Florida|      SE|   27|
|North Carolina|      SE|   26|
|      Colorado|      SE|   25|
|         Texas|      NE|   24|
|       Florida|      SW|   23|
|       Georgia|      NE|   21|
|     Minnesota|   