In [137]:
import os

import pyspark
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import date_format

In [3]:
# Location of the GCP service-account key file.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence when it is set (and non-empty),
# so the notebook is not tied to one machine's directory layout; otherwise the
# original Codespaces path is used, which keeps the previous behaviour.
credentials_location = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS') or (
    '/workspaces/de-eq-asmnt-2024/google_credentials/de-eq-asmnt-2024-6ee51b1c99e1.json'
)

# Local-mode Spark configuration with the GCS connector jar on the classpath
# and service-account authentication enabled for the Hadoop GCS filesystem.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", "./lib/gcs-connector-hadoop3-2.2.5.jar")
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [4]:
# Obtain the SparkContext. getOrCreate() is used instead of the constructor so
# that re-running this cell does not fail: only one SparkContext may be active
# per JVM, and SparkContext(conf=...) raises if one already exists.
# NOTE: when a context is already running, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# Hadoop-level settings for the gs:// scheme: filesystem implementations and
# the service-account key used for authentication.
hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

24/04/17 00:56:41 WARN Utils: Your hostname, codespaces-ff0b25 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/04/17 00:56:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
24/04/17 00:56:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [5]:
# Create (or fetch) the SparkSession using the configuration of the
# SparkContext that was set up above.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [168]:
# Load every raw CSV two directory levels below eq_events/raw.
# The first line of each file is treated as the header; no schema is supplied.
raw_events_glob = 'eq_events/raw/*/*'
csv_reader = spark.read
df = csv_reader.csv(raw_events_glob, header='true')

                                                                                

In [170]:
# Add "year" (yyyy) and "month" (MM) columns formatted from the `date` column;
# they are used later as partition keys when writing.
event_year = date_format(df["date"], "yyyy")
event_month = date_format(df["date"], "MM")
df = df.withColumn("year", event_year).withColumn("month", event_month)

In [171]:
# Preview the first 20 rows with long cell values truncated (the defaults of show()).
df.show(n=20, truncate=True)

24/04/17 02:36:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , event_id, datetime, location, latitude, longitude, depth, magnitude, significance, alert, country, date, timestamp, level
 Schema: _c0, event_id, datetime, location, latitude, longitude, depth, magnitude, significance, alert, country, date, timestamp, level
Expected: _c0 but found: 
CSV file: file:/workspaces/de-eq-asmnt-2024/data/eq_events/raw/eq_events_2005/eq_events_2005_10.csv
+------+----------+--------------------+--------------------+--------+------------+------+---------+------------+-----+-------+----------+---------------+-----+----+-----+
|   _c0|  event_id|            datetime|            location|latitude|   longitude| depth|magnitude|significance|alert|country|      date|      timestamp|level|year|month|
+------+----------+--------------------+--------------------+--------+------------+------+---------+------------+-----+-------+----------+---------------+-----+----+-----+
|5649

In [172]:
# '_c0' is the name Spark gave to the CSV's unnamed first column
# (see the CSVHeaderChecker warning in the preview output); remove it.
unnamed_first_column = '_c0'
df = df.drop(unnamed_first_column)

In [173]:
# Expose the DataFrame to Spark SQL as the temporary view `eq_events`.
# createOrReplaceTempView replaces the deprecated registerTempTable and is
# safe to re-run: an existing view of the same name is simply replaced.
df.createOrReplaceTempView('eq_events')

In [174]:
# All event rows, repartitioned and sorted within partitions by country
# (the effect of CLUSTER BY).
final_events_query = """
select *
from eq_events
cluster by country
"""
df_final = spark.sql(final_events_query)

In [175]:
# Per-country monthly statistics (event count, avg/max/min of depth,
# magnitude and significance).
#
# The CSV is read without a schema, so depth/magnitude/significance reach the
# view as strings. avg() casts them implicitly, but max()/min() on strings
# compare lexicographically (e.g. '9.5' > '10.2'), which yields wrong extremes.
# The `typed` CTE casts the three measures to double once, so every aggregate
# works on numeric values.
df_monthly = spark.sql("""
with typed as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date_trunc('month', date) as eq_month,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_month,
    avg(magnitude) as avg_mag_month,
    avg(significance) as avg_sig_month,
    max(depth) as max_depth_month,
    max(magnitude) as max_mag_month,
    max(significance) as max_sig_month,
    min(depth) as min_depth_month,
    min(magnitude) as min_mag_month,
    min(significance) as min_sig_month
from typed
group by 1,2
cluster by country
""")

In [176]:
# Per-country weekly statistics (event count, avg/max/min of depth,
# magnitude and significance).
#
# The CSV is read without a schema, so depth/magnitude/significance reach the
# view as strings. avg() casts them implicitly, but max()/min() on strings
# compare lexicographically (e.g. '9.5' > '10.2'), which yields wrong extremes.
# The `typed` CTE casts the three measures to double once, so every aggregate
# works on numeric values.
df_week = spark.sql("""
with typed as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date_trunc('week', date) as eq_week,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_week,
    avg(magnitude) as avg_mag_week,
    avg(significance) as avg_sig_week,
    max(depth) as max_depth_week,
    max(magnitude) as max_mag_week,
    max(significance) as max_sig_week,
    min(depth) as min_depth_week,
    min(magnitude) as min_mag_week,
    min(significance) as min_sig_week
from typed
group by 1,2
cluster by country
""")

In [177]:
# Per-country daily statistics (event count, avg/max/min of depth,
# magnitude and significance).
#
# The CSV is read without a schema, so depth/magnitude/significance reach the
# view as strings. avg() casts them implicitly, but max()/min() on strings
# compare lexicographically (e.g. '9.5' > '10.2'), which yields wrong extremes.
# The `typed` CTE casts the three measures to double once, so every aggregate
# works on numeric values.
df_daily = spark.sql("""
with typed as (
    select
        country,
        date,
        event_id,
        cast(depth as double) as depth,
        cast(magnitude as double) as magnitude,
        cast(significance as double) as significance
    from eq_events
)
select
    country,
    date,
    COUNT(event_id) as events_occ,
    avg(depth) as avg_depth_day,
    avg(magnitude) as avg_mag_day,
    avg(significance) as avg_sig_day,
    max(depth) as max_depth_day,
    max(magnitude) as max_mag_day,
    max(significance) as max_sig_day,
    min(depth) as min_depth_day,
    min(magnitude) as min_mag_day,
    min(significance) as min_sig_day
from typed
group by 1,2
cluster by country
""")

In [179]:
# Number of (country, month) rows in the monthly aggregate.
monthly_row_count = df_monthly.count()
monthly_row_count

                                                                                

34963

In [124]:
# Write the event table as Parquet under eq_events/final/, collapsed to a
# single partition first and replacing any existing output.
events_single_partition = df.coalesce(1)
final_writer = events_single_partition.write.option("header", "true")
final_writer.parquet('eq_events/final/', mode='overwrite')

                                                                                

In [180]:
# Write the event table as Parquet partitioned by year/month, collapsed to a
# single partition first and replacing any existing output.
processed_writer = (
    df.coalesce(1)
    .write
    .option("header", "true")
    .partitionBy('year', 'month')
)
processed_writer.parquet('eq_events/processed/final', mode='overwrite')

                                                                                

In [None]:
# Write the weekly aggregate as Parquet, one directory per eq_week value,
# replacing any existing output.
weekly_writer = (
    df_week.coalesce(1)
    .write
    .option("header", "true")
    .partitionBy('eq_week')
)
weekly_writer.parquet('eq_events/processed/weekly/', mode='overwrite')

[Stage 161:>                                                        (0 + 1) / 1]

In [None]:
# Write the monthly aggregate as Parquet, one directory per eq_month value,
# replacing any existing output.
monthly_writer = (
    df_monthly.coalesce(1)
    .write
    .option("header", "true")
    .partitionBy('eq_month')
)
monthly_writer.parquet('eq_events/processed/monthly/', mode='overwrite')

In [None]:
# Write the daily aggregate as Parquet partitioned by year/month.
#
# df_daily only carries country, date and the aggregate columns, so
# partitionBy('year', 'month') would fail on missing partition columns.
# Derive both keys from `date` first, using the same formats ("yyyy", "MM")
# that are applied to the event table earlier in the notebook.
df_daily_partitioned = (
    df_daily
    .withColumn("year", date_format("date", "yyyy"))
    .withColumn("month", date_format("date", "MM"))
)
(
    df_daily_partitioned.coalesce(1)
    .write
    .option("header", "true")
    .partitionBy('year', 'month')
    .parquet('eq_events/processed/daily/', mode='overwrite')
)