In [1]:
import warnings
warnings.filterwarnings('ignore')
import json
from pyspark import SparkContext
from datetime import datetime, timedelta
from haversine import haversine, Unit

In [3]:
sc = SparkContext.getOrCreate()
data_rdd = sc.textFile("/Users/halilergul/Desktop/master/fall-23_24/datasets-20231023/Earthquakes.txt") 
header = data_rdd.first()
lines = data_rdd.filter(lambda line: line != header)

def lineparser(line):
    fields = line.split("\t")
    event_id = fields[1]
    date = fields[2]
    year = int(date.split(".")[0])
    latitude = float(fields[4])
    longitude = float(fields[5])
    power = float(fields[8]) 
    return (year, date, latitude, longitude, power, event_id)

parsed_data = lines.map(lineparser)
filtered_earthquakes = parsed_data.filter(lambda x: 1990 <= x[0] <= 2019) # this is to filter out the earthquakes that are not in the range of 1990-2019
earthquakes = filtered_earthquakes.top(10, key=lambda x: x[4]) # so I will get top 10 earthquakes with the highest magnitude

def retrieve_shocks(target_quake): #it takes a tuple of (year, date, lat, lon, mag, event_id)
    target_date = datetime.strptime(target_quake[1], "%Y.%m.%d")
    target_coords = (target_quake[2], target_quake[3])
    def time_and_date_checker(x, compare_date): # this function checks if the earthquake is within 20km and 1 day of the target earthquake
        event_date = datetime.strptime(x[1], "%Y.%m.%d")
        time_difference = (event_date - compare_date).total_seconds()
        distance = haversine(target_coords, (x[2], x[3]), unit=Unit.KILOMETERS)
        return 0 <= abs(time_difference) <= 86400 and distance <= 20 and x != target_quake # x != target_quake is to exclude the target earthquake itself
    foreshocks = filtered_earthquakes.filter(lambda x: time_and_date_checker(x, target_date) 
                                                and (target_date - datetime.strptime(x[1], "%Y.%m.%d")).total_seconds() >= 0).collect()
    aftershocks = filtered_earthquakes.filter(lambda x: time_and_date_checker(x, target_date) 
                                                 and (datetime.strptime(x[1], "%Y.%m.%d") - target_date).total_seconds() >= 0).collect()
    return foreshocks, aftershocks

results = {}
for quake in earthquakes:
    foreshocks, aftershocks = retrieve_shocks(quake)
    valid_foreshocks = [{"Event ID": s[5], "Date": s[1], "Magnitude": s[4], "Location": (s[2], s[3])} for s in foreshocks]
    valid_aftershocks = [{"Event ID": s[5], "Date": s[1], "Magnitude": s[4], "Location": (s[2], s[3])} for s in aftershocks]
    results[quake[5]] = {
        "foreshocks": valid_foreshocks,
        "aftershocks": valid_aftershocks
    }
results_json = json.dumps(results, indent=4) # I wanted to store the results in a json file.
print(results_json) # This json contains the top 10 earthquakes with the highest magnitude and their foreshocks and aftershocks. 
# keys are the event ids of the earthquakes and values are the foreshocks and aftershocks of the earthquakes.

{
    "19990817000137": {
        "foreshocks": [
            {
                "Event ID": "19990817151752",
                "Date": "1999.08.17",
                "Magnitude": 4.1,
                "Location": [
                    40.75,
                    29.75
                ]
            }
        ],
        "aftershocks": [
            {
                "Event ID": "19990818211738",
                "Date": "1999.08.18",
                "Magnitude": 4.0,
                "Location": [
                    40.86,
                    30.04
                ]
            },
            {
                "Event ID": "19990817151752",
                "Date": "1999.08.17",
                "Magnitude": 4.1,
                "Location": [
                    40.75,
                    29.75
                ]
            }
        ]
    },
    "19980627135551": {
        "foreshocks": [
            {
                "Event ID": "19980627205016",
                "Date": "1998.06.27",
         