In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, when, col, to_timestamp, year, rank, count, isnan, avg, sum
from pyspark.sql.window import Window
from sedona.spark import SedonaContext, ST_Point, ST_Within, ST_Distance
import time
from datetime import datetime

In [None]:
def timed(func):
    """Decorate ``func`` so that every call prints its wall-clock duration.

    The wrapper forwards all positional and keyword arguments, prints
    ``Execution time for <name>: <seconds> seconds`` and returns the wrapped
    function's result unchanged.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so a system clock adjustment during the
        # call cannot distort (or negate) the measured duration.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Execution time for {func.__name__}: {end_time - start_time:.4f} seconds")
        return result
    return wrapper

In [None]:
# Create (or reuse) the Spark session used by the queries below.
spark_session = (
    SparkSession.builder
    .appName("LosAngelesCrime")
    .getOrCreate()
)

In [None]:
# Load the crime CSV data; the first row is the header and column types are
# inferred from the data.
df = spark_session.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/",
    header=True,
    inferSchema=True,
)

Cleanup

In [None]:
# Parse both date columns into timestamps and strip the trailing space from
# the "AREA " column name.
df = (
    df
    .withColumn("Date Rptd", to_timestamp("Date Rptd", "MM/dd/yyyy hh:mm:ss a"))
    .withColumn("DATE OCC", to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a"))
    .withColumnRenamed("AREA ", "AREA")
)

# Q1

## Dataframe

In [None]:
@timed
def q1_dataframe(df):
    """Show aggravated-assault counts per victim age group (DataFrame API).

    Rows whose "Crm Cd Desc" contains "AGGRAVATED ASSAULT" are bucketed by
    "Vict Age" into <18, 18-24, 25-64 and >64, counted per bucket and printed
    in descending order of count. Nothing is returned.
    """
    victim_age = col("Vict Age")
    # No otherwise() branch: rows matching none of the conditions get a null group.
    age_bucket = (
        when(victim_age < 18, "<18")
        .when((victim_age >= 18) & (victim_age <= 24), "18-24")
        .when((victim_age >= 25) & (victim_age <= 64), "25-64")
        .when(victim_age > 64, ">64")
    )
    assaults = df.filter(df["Crm Cd Desc"].contains("AGGRAVATED ASSAULT"))
    per_bucket = assaults.withColumn("Vict Age Group", age_bucket).groupBy("Vict Age Group").count()
    per_bucket.orderBy("count", ascending=False).show()


q1_dataframe(df)

## RDD

In [None]:
def age_group(row):
    """Return the victim age bucket for one crime record.

    Parameters:
    - row: Mapping-like record exposing a "Vict Age" entry (int or numeric string).

    Returns:
    - "<18", "18-24", "25-64" or ">64"; None when the age is missing, which
      matches the null group the DataFrame version of Q1 produces.
    """
    raw_age = row['Vict Age']
    if raw_age is None:
        # int(None) raises TypeError, which would abort the whole Spark job.
        return None
    age = int(raw_age)  # convert once instead of once per comparison
    if age < 18:
        return "<18"
    if age <= 24:
        return "18-24"
    if age <= 64:
        return "25-64"
    return ">64"

@timed
def q1_rdd(df):
    """Show aggravated-assault counts per victim age group (RDD API).

    Keeps records whose "Crm Cd Desc" contains "AGGRAVATED ASSAULT", maps each
    to its age group via ``age_group``, sums the occurrences per group and
    prints them in descending order of count. Nothing is returned.
    """
    def is_aggravated_assault(record):
        description = record["Crm Cd Desc"]
        # A null description would make the `in` test raise TypeError on the
        # executors; treat it as "no match", as the DataFrame filter does.
        return description is not None and "AGGRAVATED ASSAULT" in description

    (
        df.rdd
        .filter(is_aggravated_assault)
        .map(lambda record: (age_group(record), 1))
        .reduceByKey(lambda a, b: a + b)
        .sortBy(lambda pair: -pair[1])  # negate the count for descending order
        .toDF(["Vict Age Group", "count"])
        .show()
    )


q1_rdd(df)

In [None]:
# Print the distinct values found in the "Status Desc" column.
df.select("Status Desc").distinct().show()

In [None]:
def processed_indicator(row):
    """Return 0 when the record's status is "Invest Cont" or "UNK", else 1.

    The value is summed per precinct and year to compute the closed-case rate.
    """
    still_open = row["Status Desc"] in ("Invest Cont", "UNK")
    return 0 if still_open else 1

# Q2

## α)

### RDD

In [None]:
@timed
def query2_rdd(df):
    """Rank precincts within each year by closed-case rate (RDD aggregation).

    For every ("AREA NAME", year of "DATE OCC") pair the processed indicators
    and the record count are summed, their ratio becomes ``closed_case_rate``,
    and the rows are ranked per year (column "#") and printed. Nothing is
    returned.
    """
    yearly_ranking = Window.partitionBy("year").orderBy(col("closed_case_rate").desc())
    rates_df = (
        df.rdd
        # key: (precinct, year); value: (processed flag, 1) so both sums come out of one reduce
        .map(lambda rec: ((rec["AREA NAME"], rec["DATE OCC"].year), (processed_indicator(rec), 1)))
        .reduceByKey(lambda left, right: (left[0] + right[0], left[1] + right[1]))
        .mapValues(lambda totals: totals[0] / totals[1])
        .map(lambda entry: (entry[0][1], entry[0][0], entry[1]))
        .toDF(["year", "precinct", "closed_case_rate"])
    )
    rates_df.withColumn("#", rank().over(yearly_ranking)).orderBy("year", "#").show()

In [None]:
# Run the RDD implementation of Query 2 on the CSV-loaded DataFrame.
query2_rdd(df)

### SQL

In [None]:
@timed
def query2_sql(spark_session, df):
    """Rank precincts within each year by closed-case rate using Spark SQL.

    Registers ``df`` as the temporary view ``crime_data``, computes the share
    of records whose status is neither 'Invest Cont' nor 'UNK' per precinct
    and year, ranks the precincts per year and prints the result. Nothing is
    returned.
    """
    df.createOrReplaceTempView("crime_data")
    ranking_sql = """
    SELECT
        year,
        precinct,
        closed_case_rate,
        RANK() OVER (PARTITION BY year ORDER BY closed_case_rate DESC) as `#`
    FROM (
        SELECT 
            YEAR(`DATE OCC`) as year,
            `AREA NAME` as precinct,
            COUNT(CASE WHEN `Status Desc` NOT IN ('Invest Cont', 'UNK') THEN 1 END) / COUNT(*) as closed_case_rate
        FROM crime_data
        GROUP BY `AREA NAME`, YEAR(`DATE OCC`)
    )
    """
    spark_session.sql(ranking_sql).show()

In [None]:
# Run the SQL implementation of Query 2 on the CSV-loaded DataFrame.
query2_sql(spark_session, df)

## β)

In [None]:
# Write the cleaned crime data as Parquet from a single partition, replacing
# any previous output at this path.
(
    df.repartition(1)
    .write
    .mode("overwrite")
    .parquet("s3://groups-bucket-dblab-905418150721/group3/crime_data.parquet")
)

In [None]:
# Read back the Parquet copy of the crime data written above.
parquet_loaded_df = spark_session.read.parquet(
    "s3://groups-bucket-dblab-905418150721/group3/crime_data.parquet"
)

In [None]:
# Apply the same date parsing that was applied to the CSV-loaded DataFrame.
# NOTE(review): the Parquet file was written from the already-parsed `df`, so
# these columns are presumably timestamps on load and this step is likely
# redundant — confirm with parquet_loaded_df.printSchema() before removing.
parquet_loaded_df = parquet_loaded_df.withColumn("Date Rptd", to_timestamp("Date Rptd", "MM/dd/yyyy hh:mm:ss a"))
parquet_loaded_df = parquet_loaded_df.withColumn("DATE OCC", to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a"))

In [None]:
# Re-run the RDD implementation of Query 2 on the Parquet-loaded data.
query2_rdd(parquet_loaded_df)

# Q3

In [None]:
@timed
def q3_sql(spark_session, income_df, census_df):
    # NOTE(review): placeholder — the body only evaluates `income_df` and the
    # function returns None; `spark_session` and `census_df` are unused.
    income_df

In [None]:
# Load the 2015 LA income CSV (header row, inferred column types).
income_df = spark_session.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the first rows of the income data.
income_df.show()

In [None]:
# Load the 2010 census blocks "fields" CSV (header row, inferred column types).
census_block_fields = spark_session.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks_fields.csv",
    header=True,
    inferSchema=True,
)

In [None]:
# Print up to 1000 rows of census_block_fields.
census_block_fields.show(n=1000)

In [None]:
sedona = SedonaContext.create(spark_session)

# Read the census blocks GeoJSON and emit one row per entry of "features".
census_blocks_df = (
    sedona.read
    .format("geojson")
    .option("multiline", "true")
    .load("s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson")
    .selectExpr("explode(features) as features")
    .select("features.*")
)

# Promote every field of the nested "properties" struct to a top-level column
# and keep "geometry" next to them.
census_blocks_df = (
    census_blocks_df
    .select(
        [
            col(f"properties.{field_name}").alias(field_name)
            for field_name in census_blocks_df.schema["properties"].dataType.fieldNames()
        ]
        + ["geometry"]
    )
    .drop("properties")
    .drop("type")
)
census_blocks_df.printSchema()

In [None]:
# Distinct COMM values of the census blocks, collected to the driver.
comms = {row["COMM"] for row in census_blocks_df.select("COMM").distinct().collect()}

In [None]:
# Distinct ZCTA10 values of the census blocks, collected to the driver.
census_zip_codes = {row["ZCTA10"] for row in census_blocks_df.select("ZCTA10").distinct().collect()}

In [None]:
# Distinct "Zip Code" values of the income data, collected to the driver.
income_zip_codes = {row["Zip Code"] for row in income_df.select("Zip Code").distinct().collect()}

In [None]:
# Zip codes present in the income data but absent from the census blocks.
income_zip_codes - census_zip_codes

In [None]:
# Number of census zip codes that have no matching income record.
len(census_zip_codes - income_zip_codes)

In [None]:
# Print the column names and types of the crime DataFrame.
df.printSchema()

# Q3

In [None]:
# Distinct "AREA NAME" values of the crime data, collected to the driver.
crime_comms = {row["AREA NAME"] for row in df.select("AREA NAME").distinct().collect()}

In [None]:
# Display the set of distinct crime area names.
crime_comms

In [None]:
# Preview the distinct census block geometries.
census_blocks_df.select("geometry").distinct().show()

In [None]:
@timed
def query_3_dataframe(crime_data_df, census_blocks_df, income_df):
    """Build per-community crime counts and population totals (DataFrame API).

    Each crime becomes a point from its "LON"/"LAT" columns and is matched to
    the census block whose geometry contains it; crimes are then counted per
    "COMM". Separately, "POP_2010" is summed per "COMM".

    NOTE(review): both DataFrames are only defined lazily and then discarded —
    no action runs, nothing is returned and ``income_df`` is never used, so the
    query looks unfinished and the reported time covers plan construction only.
    """
    crimes_with_geom = crime_data_df.withColumn("geom", ST_Point("LON", "LAT"))
    crime_inside_block = ST_Within(crimes_with_geom["geom"], census_blocks_df["geometry"])
    crimes_per_comm = (
        crimes_with_geom
        .join(census_blocks_df, crime_inside_block)
        .groupBy("COMM")
        .count()
        .withColumnRenamed("count", "crime_count")
    )
    # `sum` here is the pyspark.sql.functions aggregate imported by the notebook,
    # not the Python builtin.
    population_per_comm = (
        census_blocks_df
        .groupBy("COMM")
        .agg(sum("POP_2010").alias("population"))
    )

In [None]:
# Run the timed DataFrame version of Query 3.
query_3_dataframe(df, census_blocks_df, income_df)

In [None]:
# Print the column names and types of the crime DataFrame.
df.printSchema()

In [None]:
# Print the distinct values found in the "LOCATION" column.
df.select("LOCATION").distinct().show()

In [None]:
# Count the census blocks whose POP_2010 is NaN.
# NOTE(review): isnan() does not match NULL values — if missing populations are
# the concern, presumably an isNull() check is needed as well; confirm.
census_blocks_df.filter(isnan(col("POP_2010"))).count()

---

## Q4 : Racial Profile Analysis

In [None]:
# SparkConf and SparkContext live in the top-level `pyspark` package;
# importing them from `pyspark.sql` fails with ImportError.
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, avg
import time

# Define timed decorator
def timed(func):
    """Decorate ``func`` so each call is timed and the duration is returned.

    The wrapper forwards all arguments, prints
    ``Execution time for <name>: <seconds> seconds`` and returns the tuple
    ``(result, execution_time)`` where ``execution_time`` is in seconds.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so a system clock adjustment during the
        # call cannot distort (or negate) the measured duration.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Execution time for {func.__name__}: {execution_time:.4f} seconds")
        return result, execution_time
    return wrapper

def log_progress(message):
    """Print ``message`` to stdout prefixed with the ``[INFO]`` tag."""
    line = f"[INFO] {message}"
    print(line)
    
# Define configurations
# Each entry names a Spark application together with the number of executors,
# the cores per executor and the memory per executor to request for it.
configurations = [
    {"app_name": "Q4_CONFIG_1", "num_executors": 2, "executor_cores": 4, "executor_memory": "8G"},
    {"app_name": "Q4_CONFIG_2", "num_executors": 4, "executor_cores": 2, "executor_memory": "4G"},
    {"app_name": "Q4_CONFIG_3", "num_executors": 8, "executor_cores": 1, "executor_memory": "2G"},
]

# Initialize a shared SparkContext
# NOTE(review): getOrCreate() reuses a context that is already running, in
# which case this conf (including the local[*] master) is presumably not
# applied; if a new context is created, local mode has no separate executors,
# so the executor settings above would likely have no effect — confirm.
conf = SparkConf().setAppName("SharedSparkContext").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

@timed
def compute_racial_profile(query3_result_df, crime_2015_df, race_codes_df, spark_session):
    """Build victim-descent counts for the 3 richest and 3 poorest areas.

    Parameters:
    - query3_result_df: DataFrame with "COMM" and "income_per_person" columns.
    - crime_2015_df: Crime DataFrame with "COMM" and "Vict Descent" columns.
    - race_codes_df: Lookup DataFrame with "DescentCode" and "FullDescription".
    - spark_session: Accepted for interface compatibility; not used here.

    Returns:
    - (top3_racial_profile, bottom3_racial_profile): two lazily defined
      DataFrames with "FullDescription" and "victim_count", sorted by
      descending count. No Spark action is triggered inside this function.
    """
    log_progress("Running Query 4")

    def victims_by_descent(areas_df):
        # Keep only crimes in the selected areas, attach the descent
        # description (left join keeps crimes whose code has no mapping) and
        # count victims per description.
        crimes = crime_2015_df.join(areas_df, "COMM", "inner")
        crimes = crimes.join(
            race_codes_df,
            crimes["Vict Descent"] == race_codes_df["DescentCode"],
            "left",
        )
        return (
            crimes
            .groupBy("FullDescription")
            .agg(count("*").alias("victim_count"))
            .orderBy(col("victim_count").desc())
        )

    income = col("income_per_person")
    richest_areas = query3_result_df.orderBy(income.desc()).limit(3)
    poorest_areas = query3_result_df.orderBy(income.asc()).limit(3)

    return victims_by_descent(richest_areas), victims_by_descent(poorest_areas)

@timed
def run_query4_benchmark(query3_result_df, crime_2015_df, race_codes_df):
    """Run Query 4 once per entry of ``configurations`` and report timings.

    For each configuration the racial profiles of the top-3 and bottom-3
    income areas are computed and displayed, and the elapsed time is recorded.
    A summary of all timings is printed at the end.

    Parameters:
    - query3_result_df, crime_2015_df, race_codes_df: inputs forwarded to
      ``compute_racial_profile``.
    """
    benchmark_results = []
    for config in configurations:
        log_progress(f"Starting with SparkSession: {config['app_name']}")

        # Create a new SparkSession using the shared SparkContext and apply resource configurations
        # NOTE(review): getOrCreate() returns the session that is already
        # running when one exists, so executor count/cores/memory presumably
        # do not change between iterations — confirm in the Spark UI.
        spark_session = (
            SparkSession(sc)
            .newSession()
            .builder
            .appName(config["app_name"])
            .config("spark.executor.instances", config["num_executors"])
            .config("spark.executor.cores", config["executor_cores"])
            .config("spark.executor.memory", config["executor_memory"])
            .getOrCreate()
        )

        # compute_racial_profile only builds lazy DataFrames; the Spark jobs
        # run inside show(). Time the whole span so the benchmark measures the
        # actual query execution rather than plan construction alone.
        started = time.perf_counter()
        (top3_racial_profile, bottom3_racial_profile), _plan_time = compute_racial_profile(query3_result_df, crime_2015_df, race_codes_df, spark_session)

        # Display results for each configuration
        print(f"=== Results for Config: {config['app_name']} ===")
        print("== Racial profile for top 3 high-income areas ==")
        top3_racial_profile.show()

        print("== Racial profile for bottom 3 low-income areas ==")
        bottom3_racial_profile.show()

        execution_time = time.perf_counter() - started
        benchmark_results.append({"config": config, "execution_time": execution_time})

    # Display benchmark results
    log_progress("Benchmark Results:")
    for result in benchmark_results:
        print(f"Config: {result['config']}, Execution Time: {result['execution_time']:.2f} seconds")

In [None]:
# Example usage
# NOTE(review): query3_result_df, crime_2015_df and race_codes_df are not
# defined in the visible part of this notebook; they must exist before this
# cell is run.
run_query4_benchmark(query3_result_df, crime_2015_df, race_codes_df)

---

## Q5: Crime Proximity Analysis

This query computes the closest police station for each crime, aggregates the results by station, and displays statistics such as the total number of crimes and the average distance.

#### Workflow
1. Loads police station data and crime data.
2. Filters valid crime records.
3. Computes distances between crimes and police stations using Apache Sedona.
4. Finds the closest station for each crime using a window function.
5. Aggregates crimes by station and calculates average distances.

#### Output
- `division`: Police station name.
- `average_distance`: Average distance between each crime and its closest station.
- `#`: Total crimes associated with each station (named `crime_count` in the returned DataFrame).

The results are displayed in descending order of crime count.

### Q5 Definition

In [1]:
from pyspark.sql.functions import col, count, avg, rank
from pyspark.sql import Window
from sedona.spark import SedonaContext, ST_Point, ST_Distance
import time

# Utility Functions
def log_progress(message):
    """Print ``message`` to stdout prefixed with the ``[INFO]`` tag."""
    line = f"[INFO] {message}"
    print(line)

def timed(func):
    """
    Decorator to measure execution time of a function.

    The wrapper prints the elapsed time and returns the tuple
    ``(result, execution_time)``, with ``execution_time`` in seconds.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so a system clock adjustment during the
        # call cannot distort (or negate) the measured duration.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"\nExecution time for {func.__name__}: {execution_time:.4f} seconds\n")
        return result, execution_time
    return wrapper

@timed
def run_query_5(spark, crime_df):
    """
    Executes Query 5: Computes the closest police station for each crime, aggregates
    results by station, and returns the final DataFrame.

    Parameters:
    - spark: The active Spark session.
    - crime_df: The crime DataFrame (needs "DR_NO", "LAT" and "LON" columns).

    Returns:
    - query5_result_df: A DataFrame with "DIVISION", "crime_count" and
      "average_distance", ordered by descending crime count.

    NOTE(review): ST_Distance is applied to points built from raw LON/LAT
    values, so distances are presumably expressed in degrees rather than
    metres — confirm whether a spherical distance is wanted.
    """
    # Local import: the cell's import line does not bring row_number into scope.
    from pyspark.sql.functions import row_number

    log_progress("Initializing Sedona context")
    # Called for its effect on the session; the returned context is not needed
    # because all reads below go through `spark`.
    SedonaContext.create(spark)

    # Load police station data
    log_progress("Loading police station data")
    police_stations_df = spark.read.csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv",
        header=True, inferSchema=True
    )
    police_stations_df = police_stations_df.withColumn("station_geom", ST_Point("X", "Y"))

    # Clean crime data
    log_progress("Filtering valid crime data")
    crime_df = crime_df.filter((col("LAT").isNotNull()) & (col("LON").isNotNull()))
    crime_df = crime_df.withColumn("crime_geom", ST_Point("LON", "LAT"))

    # Cross join crimes with police stations to compute distances
    log_progress("Computing distances between crimes and police stations")
    distances_df = crime_df.crossJoin(police_stations_df) \
        .withColumn("distance", ST_Distance(col("crime_geom"), col("station_geom")))

    # Find the closest police station for each crime
    log_progress("Finding the closest police station for each crime")
    # row_number() keeps exactly one station per crime. rank() would keep every
    # station tied at the minimum distance and count that crime more than once.
    # DIVISION is a secondary sort key so ties are broken deterministically.
    window_spec = Window.partitionBy("DR_NO").orderBy(col("distance"), col("DIVISION"))
    closest_stations_df = distances_df \
        .withColumn("rank", row_number().over(window_spec)) \
        .filter(col("rank") == 1)

    # Aggregate results by police station
    log_progress("Aggregating results by police station")
    query5_result_df = closest_stations_df.groupBy("DIVISION") \
        .agg(
            count("DR_NO").alias("crime_count"),
            avg("distance").alias("average_distance")
        ) \
        .orderBy(col("crime_count").desc())

    # Display results
    log_progress("Displaying Query 5 results")
    query5_result_df.select(
        col("DIVISION").alias("division"),
        col("average_distance"),
        col("crime_count").alias("#")
    ).orderBy(col("#").desc()).show()

    return query5_result_df

Starting Spark application


The code failed because of a fatal error:
	Session 4260 did not start up in 60 seconds..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


### One-by-One execution

Modify the configuration object to test different Spark resource settings for executors, cores, and memory.

#### Configuration Object

```python
config = {
    "num_executors": 8,
    "executor_cores": 1,
    "executor_memory": "2G"
}
```

In [2]:
from pyspark.sql import SparkSession

# Spark configuration : Change the object below to test different configs.
# Keys: number of executors, cores per executor, memory per executor.
config = {
    "num_executors": 8,
    "executor_cores": 1,
    "executor_memory": "2G"
}

# NOTE(review): getOrCreate() returns the already-running session when one
# exists, in which case these executor settings are presumably not applied —
# restart the kernel/session between configurations and confirm in the Spark UI.
spark_session = (
    SparkSession.builder
    .appName("Q5")
    .config("spark.executor.instances", config["num_executors"])
    .config("spark.executor.cores", config["executor_cores"])
    .config("spark.executor.memory", config["executor_memory"])
    .getOrCreate()
)
log_progress(f"Starting with configuration: {config}")

# Load the crime CSV data (header row, inferred column types).
crime_df = spark_session.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/",
    header=True,
    inferSchema=True
)
log_progress("/CrimeData loaded successfully.")

# Execute Query 5
# run_query_5 is wrapped by @timed, so it returns (result DataFrame, seconds).
query5_result_df, execution_time = run_query_5(spark_session, crime_df)

The code failed because of a fatal error:
	Session 4260 did not start up in 60 seconds..

Some things to try:
a) Make sure Spark has enough available resources for Jupyter to create a Spark context.
b) Contact your Jupyter administrator to make sure the Spark magics library is configured correctly.
c) Restart the kernel.


### Dynamic SparkSession Configuration

This script benchmarks Query 5 execution under three different Spark configurations by creating new Spark sessions for each setup.

#### Workflow
1. Initializes a shared `SparkContext`.
2. Dynamically creates a `SparkSession` for each configuration (`executors`, `cores`, and `memory`).
3. Loads the crime dataset.
4. Executes Query 5 and measures execution time for each configuration.

#### Configurations
1. Config 1: 2 executors, 4 cores, 8 GB memory.
2. Config 2: 4 executors, 2 cores, 4 GB memory.
3. Config 3: 8 executors, 1 core, 2 GB memory.

#### Output
For each configuration:
- Displays Query 5 results.
- Logs execution time.


In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Initialize a shared SparkContext
# NOTE(review): the master is pinned to local[*] when a new context is created;
# local mode has no separate executors, so the executor settings below would
# presumably have no effect — confirm.
conf = SparkConf().setAppName("SharedSparkContext").setMaster("local[*]")
sc = SparkContext.getOrCreate(conf)

# Workflow for each configuration
# Each entry: application name, number of executors, cores and memory per executor.
configs = [
    {"app_name": "Q5_CONFIG_1", "num_executors": 2, "executor_cores": 4, "executor_memory": "8G"},
    {"app_name": "Q5_CONFIG_2", "num_executors": 4, "executor_cores": 2, "executor_memory": "4G"},
    {"app_name": "Q5_CONFIG_3", "num_executors": 8, "executor_cores": 1, "executor_memory": "2G"},
]

for config in configs:
    log_progress(f"Starting with SparkSession: {config['app_name']}")
    
    # Create a new SparkSession using the shared SparkContext and apply resource configurations
    # NOTE(review): every iteration shares the one SparkContext `sc`, and
    # getOrCreate() returns the existing session, so the three runs presumably
    # execute with identical resources — confirm in the Spark UI before
    # comparing the timings.
    spark_session = (
        SparkSession(sc)
        .newSession()
        .builder
        .appName(config["app_name"])
        .config("spark.executor.instances", config["num_executors"])
        .config("spark.executor.cores", config["executor_cores"])
        .config("spark.executor.memory", config["executor_memory"])
        .getOrCreate()
    )
    
    # Load the crime dataset
    crime_df = spark_session.read.csv(
        "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/",
        header=True,
        inferSchema=True
    )
    log_progress("/CrimeData loaded successfully.")
    
    # Execute Query 5 with timing
    # run_query_5 is wrapped by @timed, so it returns (result DataFrame, seconds).
    query5_result_df, execution_time = run_query_5(spark_session, crime_df)
    print(f"Query 5 execution time for {config['app_name']}: {execution_time:.2f} seconds")


Starting Spark application




## WIP

In [None]:
from pyspark.sql import SparkSession
from concurrent.futures import ThreadPoolExecutor
import time

# Utility Functions
def log_progress(message):
    """Print ``message`` to stdout prefixed with the ``[INFO]`` tag."""
    line = f"[INFO] {message}"
    print(line)

def timed(func):
    """
    Decorator to measure execution time of a function.

    The wrapper prints the elapsed time and returns the tuple
    ``(result, execution_time)``, with ``execution_time`` in seconds.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep __name__ / __doc__ of the wrapped function
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so a system clock adjustment during the
        # call cannot distort (or negate) the measured duration.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"\nExecution time for {func.__name__}: {execution_time:.4f} seconds\n")
        return result, execution_time
    return wrapper

@timed
def run_query_5(config):
    """
    Runs Query 5 for a given configuration.

    Parameters:
    - config: Dictionary containing Spark session settings ("app_name",
      "num_executors", "executor_cores", "executor_memory").

    The session is stopped when the run ends, whether it succeeds or raises.
    The query itself is still simulated with a fixed sleep.
    """
    log_progress(f"Starting with SparkSession: {config['app_name']}")

    # Create SparkSession with the configuration
    spark_session = (
        SparkSession.builder
        .appName(config["app_name"])
        .config("spark.executor.instances", config["num_executors"])
        .config("spark.executor.cores", config["executor_cores"])
        .config("spark.executor.memory", config["executor_memory"])
        .getOrCreate()
    )

    try:
        # Load the crime dataset
        log_progress("Loading crime dataset...")
        crime_df = spark_session.read.csv(
            "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/",
            header=True,
            inferSchema=True
        )
        log_progress("Crime data loaded successfully.")

        # Simulate Query 5 execution (replace this with your query logic)
        log_progress(f"Executing Query 5 for {config['app_name']}...")
        time.sleep(2)  # Simulated execution time
        log_progress(f"Query 5 completed for {config['app_name']}.")
    finally:
        # Stop the SparkSession even when loading or the query fails, so a
        # failed run does not leave its session (and resources) behind for the
        # next configuration.
        spark_session.stop()
        log_progress(f"Stopped SparkSession for {config['app_name']}.")

# Configurations for each SparkSession
configs = [
    {"app_name": "Q5_CONFIG_1", "num_executors": 2, "executor_cores": 4, "executor_memory": "8G"},
    {"app_name": "Q5_CONFIG_2", "num_executors": 4, "executor_cores": 2, "executor_memory": "4G"},
    {"app_name": "Q5_CONFIG_3", "num_executors": 8, "executor_cores": 1, "executor_memory": "2G"},
]

# Run the configurations one after another rather than in threads.
# Threads in one Python process share a single Spark context, so getOrCreate()
# would hand every thread the same session and the first run_query_5 to finish
# would stop() it underneath the others. Sequential runs let each configuration
# start from a stopped session and keep the timings from competing.
for config in configs:
    run_query_5(config)
