In [2]:
import os

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

In [3]:
# Path to the Google service-account key used by the GCS connector.
# It can be overridden with the GOOGLE_APPLICATION_CREDENTIALS environment
# variable so the notebook is not tied to one workspace layout; the original
# workspace path is kept as the fallback.
credentials_location = os.environ.get(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '/workspaces/de-eq-asmnt-2024/google_credentials/de-eq-asmnt-2024-6ee51b1c99e1.json',
)

# Spark configuration: local master using all cores, the GCS connector jar on
# the classpath, and service-account authentication pointing at the key file.
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('test') \
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar") \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

In [4]:
# getOrCreate returns the already-running context when this cell is executed
# again, instead of failing because only one SparkContext may be active.
# Note: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop-level settings for the gs:// filesystem, set on the JVM configuration
# object that backs this context.
hadoop_conf = sc._jsc.hadoopConfiguration()

# Register the GCS connector classes for the gs scheme.
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
# Authenticate with the service-account key file defined in the config cell.
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

24/04/17 00:56:41 WARN Utils: Your hostname, codespaces-ff0b25 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/04/17 00:56:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/17 00:56:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Build the session on top of the existing context's configuration; an already
# active session is reused by getOrCreate().
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [95]:
# Read every file matched by eq_events/*/* as CSV, taking column names from the
# header row. No schema or inferSchema option is given, so with Spark's CSV
# defaults the columns are loaded as strings.
df = spark.read.option('header', 'true').csv('eq_events/*/*')

                                                                                

In [98]:
# Remove the '_c0' column (presumably an unnamed index column written into the
# CSV files -- verify against the producer). drop() is a no-op if it is absent.
unnamed_column = '_c0'
df = df.drop(unnamed_column)

In [100]:
# Expose the events DataFrame to Spark SQL under the name 'eq_events'.
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; re-running the cell simply
# replaces the view.
df.createOrReplaceTempView('eq_events')

In [120]:
# Every row of the eq_events view, repartitioned and sorted within partitions
# by country (CLUSTER BY).
clustered_events_sql = """
select *
from eq_events
cluster by country
"""
df_final = spark.sql(clustered_events_sql)

In [116]:
# Per-country monthly statistics: event count plus avg / max / min of depth,
# magnitude and significance.
#
# The eq_events view comes from a CSV read without a schema, so the measure
# columns are strings. avg() casts them implicitly, but max()/min() on strings
# compare lexicographically (e.g. '9.9' > '10.8'). The typed_events CTE casts
# the measures to double once so every aggregate is numeric.
df_monthly = spark.sql("""
with typed_events as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date_trunc('month', date) as eq_month,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_month,
    avg(magnitude) as avg_mag_month,
    avg(significance) as avg_sig_month,
    max(depth) as max_depth_month,
    max(magnitude) as max_mag_month,
    max(significance) as max_sig_month,
    min(depth) as min_depth_month,
    min(magnitude) as min_mag_month,
    min(significance) as min_sig_month
from typed_events
group by 1,2
cluster by country
""")

In [117]:
# Per-country weekly statistics: event count plus avg / max / min of depth,
# magnitude and significance.
#
# The eq_events view comes from a CSV read without a schema, so the measure
# columns are strings. avg() casts them implicitly, but max()/min() on strings
# compare lexicographically (e.g. '9.9' > '10.8'). The typed_events CTE casts
# the measures to double once so every aggregate is numeric.
df_week = spark.sql("""
with typed_events as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date_trunc('week', date) as eq_week,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_week,
    avg(magnitude) as avg_mag_week,
    avg(significance) as avg_sig_week,
    max(depth) as max_depth_week,
    max(magnitude) as max_mag_week,
    max(significance) as max_sig_week,
    min(depth) as min_depth_week,
    min(magnitude) as min_mag_week,
    min(significance) as min_sig_week
from typed_events
group by 1,2
cluster by country
""")

In [118]:
# Per-country daily statistics: event count plus avg / max / min of depth,
# magnitude and significance.
#
# Two fixes relative to the earlier version of this query:
#   * min(significance) was aliased min_mag_day, duplicating the alias of
#     min(magnitude); it is now min_sig_day, matching the monthly/weekly
#     queries and giving the result unique column names.
#   * The eq_events view comes from a CSV read without a schema, so the measure
#     columns are strings and max()/min() compared them lexicographically.
#     The typed_events CTE casts the measures to double once.
df_daily = spark.sql("""
with typed_events as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_day,
    avg(magnitude) as avg_mag_day,
    avg(significance) as avg_sig_day,
    max(depth) as max_depth_day,
    max(magnitude) as max_mag_day,
    max(significance) as max_sig_day,
    min(depth) as min_depth_day,
    min(magnitude) as min_mag_day,
    min(significance) as min_sig_day
from typed_events
group by 1,2
cluster by country
""")

In [121]:
# Preview the clustered events: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out).
df_final.show(n=20, truncate=True)



+----------+--------------------+--------------------+--------+---------+-----+---------+------------+-----+-------+----------+---------------+--------+
|  event_id|            datetime|            location|latitude|longitude|depth|magnitude|significance|alert|country|      date|      timestamp|   level|
+----------+--------------------+--------------------+--------+---------+-----+---------+------------+-----+-------+----------+---------------+--------+
|usp000ennh|2006-07-15 07:36:...|76 km NNE of Punt...|   12.33|   -69.92| 25.8|      4.4|         298|  nan|    ABW|2006-07-15|07:36:21.500000|   light|
|usp000g320|2008-04-03 19:07:...|64 km NNE of Punt...|  12.231|  -69.971| 10.8|      4.0|         246|  nan|    ABW|2008-04-03|19:07:55.890000|   light|
|usc000n5bb|2014-03-07 17:31:...|67 km NNE of Punt...|   12.27|    -70.0| 23.6|      3.0|         139|  nan|    ABW|2014-03-07|17:31:41.200000|   minor|
|usp000j74j|2011-08-27 16:15:...|101 km NNE of Pun...|   12.48|   -69.72| 15.0|   

                                                                                

In [124]:
# Write the events as Parquet from a single partition (one output part file),
# replacing whatever is already at the target path.
# The former .option("header", "true") was dropped: it is a CSV option and has
# no meaning for the Parquet writer.
# NOTE(review): the input glob 'eq_events/*/*' also matches this output
# directory once it exists -- confirm the output should live under eq_events/.
df.coalesce(1).write.parquet('eq_events/final/', mode='overwrite')

                                                                                

In [122]:
# Write the clustered events as Parquet, one sub-directory per distinct 'date'
# value, replacing whatever is already at the target path.
# The former .option("header", "true") was dropped: it is a CSV option and has
# no meaning for the Parquet writer.
# NOTE(review): the recorded run of this cell was interrupted by hand; daily
# partitioning may create a very large number of small files -- consider a
# coarser partition column.
df_final.write.partitionBy('date').parquet('eq_events/final/', mode='overwrite')

ERROR:root:KeyboardInterrupt while sending command.                 (0 + 2) / 3]
Traceback (most recent call last):
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/python/3.10.13/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/local/python/3.10.13/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Write the clustered events as Parquet, one sub-directory per distinct 'date'
# value, replacing whatever is already at the target path.
# The former .option("header", "true") was dropped: it is a CSV option and has
# no meaning for the Parquet writer.
# NOTE(review): this targets the same path as the single-file write of `df`,
# so whichever cell runs last determines the contents -- confirm which layout
# is intended.
df_final.write.partitionBy('date').parquet('eq_events/final/', mode='overwrite')

In [None]:
# Collapse the events to one partition and write them as Parquet, replacing any
# existing output at the target path.
single_partition_writer = df.coalesce(1).write.mode('overwrite')
single_partition_writer.parquet('eq_events/final/')

In [None]:
# Single-partition Parquet export of the events with overwrite semantics.
# NOTE(review): this cell repeats the previous export verbatim -- confirm
# whether both are needed.
df.coalesce(1).write.mode('overwrite').parquet('eq_events/final/')