# Final Project Spark SQL Notebook

### Small Data

### Make all scripts executable

In [1]:
!!chmod a+x ./*/*.py

["chmod: changing permissions of './P1/spark_sql_p1.py': Operation not permitted",
 "chmod: changing permissions of './P2/spark_sql_p2.py': Operation not permitted",
 "chmod: changing permissions of './P3/spark_sql_p3.py': Operation not permitted",
 "chmod: changing permissions of './P4/spark_sql_p4.py': Operation not permitted",
 "chmod: changing permissions of './P5/spark_sql_p5.py': Operation not permitted"]

### Remove all Results

In [2]:
!rm -rf ./*/*.result

### P1

In [None]:
# %load ./P1/spark_sql_p1.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr : Row( state_name = arr[24], site_num = arr[2], state_code = arr[0], county_code = arr[1]))
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")

    query = query = "SELECT state_name, COUNT(*) AS number_monitors FROM (SELECT DISTINCT * FROM log) GROUP BY state_name ORDER BY number_monitors DESC"

    final = spark.sql(query)

    final.show(30)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()


In [4]:
!!time python ./P1/spark_sql_p1.py > ./P1/p1.result

['21/12/23 23:13:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m16.963s',
 'user\t0m1.172s',
 'sys\t0m0.907s']

In [5]:
!cat ./P1/p1.result

+--------------+---------------+
|    state_name|number_monitors|
+--------------+---------------+
|    California|            162|
|         Texas|            132|
|     Minnesota|             94|
|          Ohio|             89|
|      Michigan|             84|
|      New York|             66|
|South Carolina|             64|
|  Pennsylvania|             60|
|       Montana|             60|
|       Indiana|             52|
|      Colorado|             51|
|       Florida|             50|
|      Illinois|             50|
|North Carolina|             49|
|    Washington|             42|
|     Louisiana|             40|
|       Arizona|             38|
|        Kansas|             37|
|       Georgia|             34|
|        Oregon|             31|
|      Kentucky|             30|
|       Alabama|             28|
|     Tennessee|             27|
|    New Jersey|             24|
|     Wisconsin|             24|
|       Vermont|             22|
|   Mississipp

### P2

In [None]:
# %load ./P2/spark_sql_p2.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr : Row( State = arr[24], County = arr[25], countyCode = arr[1], Arithmetic_mean = float(arr[16])))
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")

    stateRanksDF = spark.sql("SELECT County, \
        AVG(Arithmetic_mean) AS Pollutant_levels \
        FROM log GROUP BY State, countyCode, County \
        ORDER BY Pollutant_levels DESC")
    stateRanksDF.show(100)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()


In [7]:
!!time python ./P2/spark_sql_p2.py > ./P2/p2.result

['21/12/23 23:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m16.632s',
 'user\t0m1.307s',
 'sys\t0m1.010s']

In [8]:
!cat ./P2/p2.result

+--------------------+------------------+
|              County|  Pollutant_levels|
+--------------------+------------------+
|              Tipton|            2556.0|
|              Nassau|              19.0|
|          Columbiana| 7.385690735785953|
|                Park| 5.611212121212121|
|     CHIHUAHUA STATE|         4.5121875|
|            Caldwell| 4.116666666666667|
|               Kings|3.9843770491803276|
|              Madera|            3.7393|
|            Franklin|3.3499999999999996|
|           Jefferson|              3.07|
|             Oakland| 2.888877848101266|
|                Lake| 2.879328647058823|
|               Duval|2.7794603978494625|
|           Middlesex|2.6500000000000004|
|              Kearny|2.3753333333333333|
|               Bucks|2.3674999999999997|
|     San Luis Obispo|2.3333333333333335|
|           Edgecombe|             2.325|
|              Pawnee|2.2941176470588234|
|         Westchester|          2.239375|
|          

### P3

In [None]:
# %load ./P3/spark_sql_p3.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr : Row( Year = arr[11][:4], State = arr[24], Arithmetic_mean = float(arr[16])))
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")

    stateRanksDF = spark.sql("SELECT Year, \
        State, \
        AVG(Arithmetic_mean) AS Pollutant_levels \
        FROM log \
        GROUP BY State, Year \
        ORDER BY Year, Pollutant_levels")
    stateRanksDF.show(100)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()

In [10]:
!!time python ./P3/spark_sql_p3.py > ./P3/p3.result

['21/12/23 23:14:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m17.521s',
 'user\t0m1.313s',
 'sys\t0m0.904s']

In [11]:
!cat ./P3/p3.result

+----+--------------------+--------------------+
|Year|               State|    Pollutant_levels|
+----+--------------------+--------------------+
|1990|           Wisconsin|                 0.0|
|1990|            Oklahoma|                 0.0|
|1990|      Virgin Islands|                 0.0|
|1990|       West Virginia|                 0.0|
|1990|              Hawaii|1.970370370370370...|
|1990|              Nevada|4.208000000000000...|
|1990|              Alaska|4.420833333333333...|
|1990|        South Dakota|            5.705E-4|
|1990|          Washington|5.974999999999999E-4|
|1990|             Wyoming|6.045454545454545E-4|
|1990|                Utah|7.970588235294118E-4|
|1990|          New Mexico|8.222222222222222E-4|
|1990|              Oregon|8.596296296296297E-4|
|1990|             Arizona|8.620134228187919E-4|
|1990|               Maine|9.789285714285713E-4|
|1990|            Colorado|0.002162374100719...|
|1990|         Mississippi|0.002666666666666...|


### P4

In [None]:
# %load ./P4/spark_sql_p4.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    # files
    lines_states = sc.textFile('../data/usa_states.csv')
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    
    # file mapping
    logRows_states = lines_states.filter( lambda line : len(line) > 0)    \
                    .zipWithIndex() \
                    .filter( lambda x: x[1] > 0) \
                    .map(lambda x: x[0]) \
                    .map( lambda line: line.split(',')) \
                    .map( lambda arr : Row( name = arr[1], centerLat = (float(arr[2])+float(arr[3]))/2, \
                                            centerLon = (float(arr[4])+float(arr[5]))/2))
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr: Row( stateName = arr[24], siteNum = arr[2], countyCode = arr[1], lat = float("{:.3f}".format(float(arr[5]))), lon = float("{:.3f}".format(float(arr[6]))) ) )  
    
    # Creates the dataframes and views
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")

    logRows_statesDF = spark.createDataFrame( logRows_states )
    logRows_statesDF.createOrReplaceTempView("log_states")

    query = "SELECT stateName as State, AVG(Dist_Monitor_Center) as Avg_Dist_Monitor_Center \
    FROM ( SELECT stateName, \
        sqrt( pow( (AVG(lat-centerLat))*111, 2) + pow( (AVG(lon-centerLon))*111, 2) ) as Dist_Monitor_Center \
        FROM log JOIN log_states ON stateName = name\
        GROUP BY stateName, countyCode, siteNum )\
    GROUP BY State"

    finalDF = spark.sql(query)

    
    finalDF.show(100)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()


In [13]:
!!time python ./P4/spark_sql_p4.py > ./P4/p4.result

['21/12/23 23:14:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 2) / 2]',
 '                                                                                ',
 '',
 '[Stage 1:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 4:>                  (0 + 4) / 4][Stage 5:>                  (0 + 2) / 2]',
 '[Stage 4:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '              

In [14]:
!cat ./P4/p4.result

+--------------+-----------------------+
|         State|Avg_Dist_Monitor_Center|
+--------------+-----------------------+
|          Utah|     184.91876919510136|
|        Hawaii|     155.72848584906387|
|     Minnesota|     195.06726533715846|
|          Ohio|     175.74265339396345|
|      Arkansas|     151.13713852810923|
|        Oregon|      270.4530697644639|
|         Texas|      512.0338063321824|
|  North Dakota|     248.43249153972548|
|  Pennsylvania|     250.65578227993026|
|   Connecticut|      49.99224954412992|
|      Nebraska|     307.13572198974225|
|       Vermont|      521.9872635614976|
|        Nevada|     325.85331502151854|
|   Puerto Rico|     32.733405599511656|
|    Washington|     223.06324162734512|
|      Illinois|     435.24508277080764|
|      Oklahoma|     236.71168497753382|
|Virgin Islands|      78.42453456606204|
|      Delaware|      51.57775481518227|
|        Alaska|      603.7055510312542|
|    New Mexico|     183.104816608

### P5

In [None]:
# %load ./P5/spark_sql_p5.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    # files
    lines_states = sc.textFile('../data/usa_states.csv')
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    
    # file mapping
    logRows_states = lines_states.filter( lambda line : len(line) > 0)    \
                    .zipWithIndex() \
                    .filter( lambda x: x[1] > 0) \
                    .map(lambda x: x[0]) \
                    .map( lambda line: line.split(',')) \
                    .map( lambda arr : Row( state = arr[0], name = arr[1], centerLat = (float(arr[2]) + float(arr[3]))/2, \
                                            centerLon = (float(arr[4]) + float(arr[5]))/2))
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr: Row( name = arr[24], countyCode = arr[1], stateNUM = arr[2], lat = float("{:.3f}".format(float(arr[5]))), lon = float("{:.3f}".format(float(arr[6]))) ) )  
    
    # Creates the dataframes and views
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")

    logRows_statesDF = spark.createDataFrame( logRows_states )
    logRows_statesDF.createOrReplaceTempView("log_states")
    
    # Atribui a cada monitor único o seu quadrante
    MonitorDF = spark.sql("SELECT log.name, log.countyCode, log.stateNum, \
     CASE \
         WHEN log.lat < log_states.centerLat AND log.lon > log_states.centerLon THEN 'NE' \
         WHEN log.lat > log_states.centerLat AND log.lon > log_states.centerLon THEN 'SE' \
         WHEN log.lat > log_states.centerLat AND log.lon < log_states.centerLon THEN 'SW' \
         WHEN log.lat < log_states.centerLat AND log.lon < log_states.centerLon THEN 'NW' \
         ELSE 'Center or Borders' \
     END AS Quadrant \
     FROM log JOIN log_states ON log.name=log_states.name GROUP BY log.name, log.countyCode, log.stateNum, Quadrant")
    MonitorDF.createOrReplaceTempView("Monitor")

    # Conta o Nr. de monitores em cada quadrante por estado
    finalDF = spark.sql("SELECT name AS State, Quadrant, count(*) AS Num_Monitors  FROM Monitor GROUP BY name, Quadrant")
    
    finalDF.show(100)

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()

In [16]:
!!time python ./P5/spark_sql_p5.py > ./P5/p5.result

['21/12/23 23:15:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 2) / 2]',
 '                                                                                ',
 '',
 '[Stage 1:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 4:>                  (0 + 2) / 2][Stage 5:>                  (0 + 4) / 4]',
 '[Stage 5:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '              

In [17]:
!cat ./P5/p5.result

+--------------+--------+------------+
|         State|Quadrant|Num_Monitors|
+--------------+--------+------------+
|          Utah|      SW|           6|
|          Utah|      NE|           3|
|          Utah|      NW|           3|
|        Hawaii|      SW|           2|
|        Hawaii|      SE|           2|
|        Hawaii|      NE|           1|
|     Minnesota|      NW|          21|
|     Minnesota|      SE|          11|
|     Minnesota|      NE|          50|
|     Minnesota|      SW|          12|
|          Ohio|      SE|          30|
|          Ohio|      NE|          10|
|          Ohio|      SW|          15|
|          Ohio|      NW|          34|
|      Arkansas|      SW|           3|
|      Arkansas|      NW|           4|
|      Arkansas|      SE|           2|
|      Arkansas|      NE|           1|
|        Oregon|      NW|          12|
|        Oregon|      SE|           3|
|        Oregon|      SW|          15|
|        Oregon|      NE|           1|
