# Final Project Spark DataFrames Notebook

### Small Data

### Make all scripts executable

In [1]:
!!chmod a+x ./*/*.py

["chmod: changing permissions of './P1/spark_df_p1.py': Operation not permitted",
 "chmod: changing permissions of './P2/spark_df_p2.py': Operation not permitted",
 "chmod: changing permissions of './P3/spark_df_p3.py': Operation not permitted",
 "chmod: changing permissions of './P4/spark_df_p4.py': Operation not permitted",
 "chmod: changing permissions of './P5/spark_df_p5.py': Operation not permitted"]

### Remove all Results

In [2]:
!rm -rf ./*/*.result

### P1

In [3]:
# %load ./P1/spark_df_p1.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    # files
    lines_states = sc.textFile('../data/epa_hap_daily_summary-small.csv')
    #lines = sc.textFile('log.csv')

    # file mapping
    logRows = lines_states.filter( lambda line : len(line) > 0)    \
                    .zipWithIndex() \
                    .filter( lambda x: x[1] > 0) \
                    .map(lambda x: x[0]) \
                    .map( lambda line: line.split(',')) \
                    .map( lambda arr : Row( state = arr[24], countyCode = arr[1], site_num = arr[2]))    
    
    # Dataframe creation
    logRowsDF = spark.createDataFrame( logRows )
    #logRowsDF = spark.createDataFrame( logRows )
    #logRowsDF = logRowsDF.distinct() # Makes sure we are using different monitors
    
    logRows2DF = logRowsDF.select('state','countyCode','site_num').distinct().groupBy('state')\
                                                                         .agg(count('site_num').alias('Nr of Monitors'))\
                                                                         .sort('Nr of Monitors', ascending = False)   
    logRows2DF.show(100,truncate=50)

    sc.stop()
except Exception as e:
    print(e)
    sc.stop()


+--------------------+--------------+
|               state|Nr of Monitors|
+--------------------+--------------+
|          California|           162|
|               Texas|           132|
|           Minnesota|            94|
|                Ohio|            89|
|            Michigan|            84|
|            New York|            66|
|      South Carolina|            64|
|        Pennsylvania|            60|
|             Montana|            60|
|             Indiana|            52|
|            Colorado|            51|
|            Illinois|            50|
|             Florida|            50|
|      North Carolina|            49|
|          Washington|            42|
|           Louisiana|            40|
|             Arizona|            38|
|              Kansas|            37|
|             Georgia|            34|
|              Oregon|            31|
|            Kentucky|            30|
|             Alabama|            28|
|           Tennessee|            27|
|          N

In [4]:
!!time python ./P1/spark_df_p1.py > ./P1/p1.result

['21/12/22 17:20:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '[Stage 3:===>                                                    (13 + 9) / 200]',
 '                                                                                ',
 'real\t0m21.589s',
 'user\t0m1.296s',
 'sys\t0m1.014s']

In [5]:
!cat ./P1/p1.result

+--------------------+--------------+
|               state|Nr of Monitors|
+--------------------+--------------+
|          California|           162|
|               Texas|           132|
|           Minnesota|            94|
|                Ohio|            89|
|            Michigan|            84|
|            New York|            66|
|      South Carolina|            64|
|        Pennsylvania|            60|
|             Montana|            60|
|             Indiana|            52|
|            Colorado|            51|
|            Illinois|            50|
|             Florida|            50|
|      North Carolina|            49|
|          Washington|            42|
|           Louisiana|            40|
|             Arizona|            38|
|              Kansas|            37|
|             Georgia|            34|
|              Oregon|            31|
|            Kentucky|            30|
|             Alabama|            28|
|           Tennessee|   

### P2

In [None]:
# %load ./P2/spark_df_p2.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr : Row( county_name = arr[25], state_code = arr[0], county_code = arr[1], arithmetic_mean = float(arr[16])))
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF.createOrReplaceTempView("log")
    
    # Necessary computations to solve the problem
    finalDF = logRowsDF.withColumn('county', (col('state_code')+col('county_code'))) \
                    .drop('state_code') \
                    .drop('county_code') \
                    .groupBy('county','county_name').sum('arithmetic_mean') \
                    .withColumnRenamed('sum(arithmetic_mean)', 'pollutant_levels') \
                    .orderBy(col('pollutant_levels').desc()) \
                    .drop('county') \

    finalDF.show(20)
    sc.stop()
    
except Exception as err:
    print(err)
    sc.stop()

In [7]:
!!time python ./P2/spark_df_p2.py > ./P2/p2.result

['21/12/22 17:20:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m17.805s',
 'user\t0m1.240s',
 'sys\t0m0.938s']

In [8]:
!cat ./P2/p2.result

+--------------+------------------+
|   county_name|  pollutant_levels|
+--------------+------------------+
|        Harris| 16540.68687000023|
|         Wayne|11747.949338000006|
|   Los Angeles| 8833.365818000002|
|        Tipton|            5112.0|
|       El Paso| 4798.531819999982|
|          Lake|       4751.710437|
|          Cook| 3990.635898999992|
|     Jefferson| 3612.966027999982|
|  Philadelphia|       3073.943222|
|          Kern| 2715.872453999995|
|        Fresno|2287.9766509999945|
|    Columbiana|        2208.32153|
|    Providence|2192.0918579999993|
|     Riverside| 2142.845549999996|
|St. Louis City|2068.8462899999986|
|        Dallas|2013.5995819999957|
|     San Diego|1955.4988399999966|
|     Multnomah|1891.2379399999995|
|        DeKalb|1758.4259430000002|
|      Hennepin|1687.0018299999977|
+--------------+------------------+
only showing top 20 rows



### P3

In [9]:
# %load ./P3/spark_df_p3.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr : Row( Year = arr[11][:4], State = arr[24], Arithmetic_mean = float(arr[16])))
    
    logRowsDF = spark.createDataFrame( logRows )
    
    logRows2DF = logRowsDF.select('State','Year','Arithmetic_mean')\
                                                           .groupBy('Year','State')\
                                                           .agg((sum('Arithmetic_mean')/count('Arithmetic_mean')).alias('Avg Pollutants'))\
                                                           .sort('Avg Pollutants', ascending = False)
                                                                    
    
    logRows2DF.show(200,truncate=50)
    
    sc.stop()
except Exception as e:
    print(e)
    sc.stop()

+----+--------------------+------------------+
|Year|               State|    Avg Pollutants|
+----+--------------------+------------------+
|1990|           Tennessee|170.40093066666665|
|1995|   Country Of Mexico|              8.46|
|2001|            Michigan| 4.506138716367713|
|1993|       Massachusetts| 4.305833285714285|
|2017|            Colorado|4.2250000000000005|
|1990|             Indiana| 4.098978378378379|
|1992|            Illinois| 3.911825163398692|
|1994|       Massachusetts|3.4609906122448977|
|1995|           Louisiana| 3.364348865853659|
|1994|        Rhode Island|3.3635714000000005|
|1996|             Alabama| 3.226314057971015|
|1993|         Connecticut|3.0975461538461535|
|1990|       Massachusetts|3.0246823529411766|
|1994|           Wisconsin|2.9504833333333336|
|1993|             Indiana|2.8972258064516128|
|1995|        Rhode Island|2.7313043478260868|
|1993|            Delaware|          2.723077|
|1992|             Indiana|2.6606363636363635|
|1993|       

In [10]:
!!time python ./P3/spark_df_p3.py > ./P3/p3.result

['21/12/22 17:21:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 2:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m19.249s',
 'user\t0m1.215s',
 'sys\t0m0.970s']

In [11]:
!cat ./P3/p3.result

+----+--------------------+------------------+
|Year|               State|    Avg Pollutants|
+----+--------------------+------------------+
|1990|           Tennessee|170.40093066666665|
|1995|   Country Of Mexico|              8.46|
|2001|            Michigan| 4.506138716367713|
|1993|       Massachusetts| 4.305833285714285|
|2017|            Colorado|4.2250000000000005|
|1990|             Indiana| 4.098978378378379|
|1992|            Illinois| 3.911825163398692|
|1994|       Massachusetts|3.4609906122448977|
|1995|           Louisiana| 3.364348865853659|
|1994|        Rhode Island|3.3635714000000005|
|1996|             Alabama| 3.226314057971015|
|1993|         Connecticut|3.0975461538461535|
|1990|       Massachusetts|3.0246823529411766|
|1994|           Wisconsin|2.9504833333333336|
|1993|             Indiana|2.8972258064516128|
|1995|        Rhode Island|2.7313043478260868|
|1993|            Delaware|          2.723077|
|1992|             Indiana|2.66063636363

### P4

In [None]:
# %load ./P4/spark_df_p4.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    # files
    lines_states = sc.textFile('../data/usa_states.csv')
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv')

    # file mapping
    logRows_states = lines_states.filter( lambda line : len(line) > 0)    \
                    .zipWithIndex() \
                    .filter( lambda x: x[1] > 0) \
                    .map(lambda x: x[0]) \
                    .map( lambda line: line.split(',')) \
                    .map( lambda arr : Row( state = arr[0], name = arr[1], minLat = float(arr[2]), \
                                            maxLat = float(arr[3]), minLon = float(arr[4]), \
                                            maxLon = float(arr[5])))
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr: Row( name = arr[24], Lat = float(arr[5]), Lon = float(arr[6]) ) )    
    
    # Dataframe creation
    logRowsStatesDF = spark.createDataFrame( logRows_states )
    logRowsDF = spark.createDataFrame( logRows )
    logRowsDF = logRowsDF.distinct() # Makes sure we are using different monitors

    # Necessary computations to solve the problem
    finalDF = logRowsStatesDF.withColumn('center_Lat', (col('minLat')+col('maxLat'))/2 ) \
                        .withColumn('center_Lon', (col('minLon')+col('maxLon'))/2 ) \
                        .drop('minLon') \
                        .drop('maxLon') \
                        .drop('minLat') \
                        .drop('maxLat') \
                        .drop('state') \
                        .join(logRowsDF, 'name') \
                        .withColumn('distance', sqrt( pow((col('Lat')-col('center_Lat'))*111,2) + pow((col('Lon')-col('center_Lon'))*111,2) ) ) \
                        .groupBy('name').avg('distance')
    
    finalDF.show(54)

    sc.stop()
    
except Exception as err:
    print(err)
    sc.stop()

In [13]:
!!time python ./P4/spark_df_p4.py > ./P4/p4.result

['21/12/22 17:21:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 2) / 2]',
 '                                                                                ',
 '',
 '[Stage 1:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 4:>                  (0 + 4) / 4][Stage 6:>                  (0 + 2) / 2]',
 '[Stage 4:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '              

In [14]:
!cat ./P4/p4.result

+--------------+------------------+
|          name|     avg(distance)|
+--------------+------------------+
|          Utah| 184.9134304071158|
|        Hawaii|155.73279515048372|
|     Minnesota|195.06827082824813|
|          Ohio|176.18296482931532|
|      Arkansas|157.85010472330006|
|        Oregon|268.85380792326424|
|         Texas| 512.1839891630138|
|  North Dakota|248.42193073262698|
|  Pennsylvania|251.41517634057172|
|   Connecticut|  49.9897454877856|
|      Nebraska| 307.1411826055287|
|       Vermont|504.06323548508084|
|        Nevada|326.28118071973915|
|   Puerto Rico| 32.73151627576544|
|    Washington|219.98044806799825|
|      Illinois| 440.8540143212859|
|      Oklahoma|236.88257437298162|
|Virgin Islands| 73.45806087692814|
|      Delaware| 51.57977048023351|
|        Alaska| 603.6996422410687|
|    New Mexico| 183.1891212852377|
| West Virginia|144.49363881493753|
|      Missouri|234.32953412794035|
|  Rhode Island|22.192520606570366|
|

### P5

In [14]:
# %load ./P5/spark_df_p5.py
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from math import floor

spark = SparkSession.builder.master('local[*]').appName('words').getOrCreate()
sc = spark.sparkContext

try:
    # files
    lines_states = sc.textFile('../data/usa_states.csv')
    lines = sc.textFile('../data/epa_hap_daily_summary-small.csv') # Change the name of the file to what you have it named here
    
    # file mapping
    logRows_states = lines_states.filter( lambda line : len(line) > 0)    \
                    .zipWithIndex() \
                    .filter( lambda x: x[1] > 0) \
                    .map(lambda x: x[0]) \
                    .map( lambda line: line.split(',')) \
                    .map( lambda arr : Row( state = arr[0], name = arr[1], centerLat = (float(arr[2]) + float(arr[3]))/2, \
                                            centerLon = (float(arr[4]) + float(arr[5]))/2))
    logRows = lines.filter( lambda line: len(line) > 0) \
                     .zipWithIndex() \
                     .filter( lambda x: x[1] > 0) \
                     .map(lambda x: x[0]) \
                     .map( lambda line: line.split(',')) \
                     .map( lambda arr: Row( name = arr[24], countyCode = arr[1], siteNum = arr[2], lat = float("{:.3f}".format(float(arr[5]))), lon = float("{:.3f}".format(float(arr[6]))) ) )  
    
    # Creates the dataframes
    logRowsDF = spark.createDataFrame( logRows )
    
    logRows_statesDF = spark.createDataFrame( logRows_states )
    
    
    logRowsDF = logRowsDF.select('name','countyCode','siteNum','lat','lon').distinct()
    
    logRowsDF = logRowsDF.select('name','lat','lon')
    
    
    joinedDF = logRowsDF.join(logRows_statesDF,logRowsDF.name == logRows_statesDF.name,"inner" )
    
    MonitorDF = joinedDF.select((logRowsDF.name).alias('name'),\
                                (logRowsDF.lat).alias('lat'),\
                                (logRowsDF.lon).alias('lon'),\
                                (logRows_statesDF.centerLat).alias('centerLat'),\
                                (logRows_statesDF.centerLon).alias('centerLon')\
                               )
    
    #joinedDF.sort(logRowsDF.name,logRowsDF.countyCode,logRowsDF.siteNum).show(400)
    
    MonitorDF = MonitorDF.withColumn("quadrant", when((col("lat") < col("centerLat")) & (col("lon") < col("centerLon")),'NW')\
                                                 .when((col("lat") < col("centerLat")) & (col("lon") > col("centerLon")),'NE')\
                                                 .when((col("lat") > col("centerLat")) & (col("lon") < col("centerLon")),'SW')\
                                                 .when((col("lat") > col("centerLat")) & (col("lon") > col("centerLon")),'SE')
                                                 
                                    )                              
    
    FinalMonitorDF = MonitorDF.groupBy('name','quadrant').agg(count('quadrant').alias('Nr of Monitors')).sort('name', ascending = True)
    
    FinalMonitorDF.show(400)
    

    sc.stop()
except Exception as err:
    print(err)
    sc.stop()

+--------------+--------+--------------+
|          name|quadrant|Nr of Monitors|
+--------------+--------+--------------+
|       Alabama|      NE|             4|
|       Alabama|      SW|            14|
|       Alabama|      NW|             7|
|       Alabama|      SE|             4|
|        Alaska|      SE|             4|
|        Alaska|      NE|             2|
|        Alaska|      NW|             3|
|        Alaska|      SW|             3|
|       Arizona|      SW|            10|
|       Arizona|      NE|            16|
|       Arizona|      SE|             2|
|       Arizona|      NW|            10|
|      Arkansas|      NW|             5|
|      Arkansas|      NE|             1|
|      Arkansas|      SW|             3|
|      Arkansas|      SE|             2|
|    California|      SE|             2|
|    California|      NE|            68|
|    California|      NW|            15|
|    California|      SW|            84|
|      Colorado|      SE|            24|
|      Colorado|

In [16]:
!!time python ./P5/spark_df_p5.py > ./P5/p5.result

['21/12/22 17:22:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable',
 "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties",
 'Setting default log level to "WARN".',
 'To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).',
 '',
 '[Stage 0:>                                                          (0 + 2) / 2]',
 '                                                                                ',
 '',
 '[Stage 1:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 '',
 '[Stage 4:>                  (0 + 2) / 2][Stage 5:>                  (0 + 4) / 4]',
 '[Stage 5:>                                                          (0 + 4) / 4]',
 '                                                                                ',
 'real\t0m23.368s',
 

In [17]:
!cat ./P5/p5.result

+--------------+--------+--------------+
|          name|quadrant|Nr of Monitors|
+--------------+--------+--------------+
|       Alabama|      NW|             7|
|       Alabama|      NE|             5|
|       Alabama|      SE|             5|
|       Alabama|      SW|            14|
|        Alaska|      NW|             3|
|        Alaska|      SE|             4|
|        Alaska|      NE|             2|
|        Alaska|      SW|             3|
|       Arizona|      SW|            10|
|       Arizona|      SE|             2|
|       Arizona|      NE|            16|
|       Arizona|      NW|            10|
|      Arkansas|      NW|             5|
|      Arkansas|      SE|             2|
|      Arkansas|      SW|             3|
|      Arkansas|      NE|             1|
|    California|      NE|            68|
|    California|      SW|            84|
|    California|      NW|            16|
|    California|      SE|             2|
|      Colorado|      SW|         