In [None]:
import os

# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.5.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Prompt for a file upload through Colab's upload helper and keep the result
# in `uploaded`.
# NOTE(review): the read step later in this notebook expects the file at
# /content/Crime_Data_from_2020_to_Present_20240611.csv — upload the CSV under
# that exact name (assumes uploads land in /content; confirm).
from google.colab import files
uploaded = files.upload()

In [None]:
# Import dependencies
from pyspark.sql import SparkSession
import time

# Build (or fetch the already-running) session registered under the app name "SparkSQL"
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

### Extract the Crime Data

In [None]:
# Read the data into a DataFrame
from pyspark import SparkFiles

# The CSV sits at a fixed local path, so hand that path to the reader directly.
# SparkFiles.get() is meant for files registered through SparkContext.addFile();
# wrapping an absolute path in it only worked because joining an absolute path
# onto the SparkFiles root directory discards the root.
# header=True takes the column names from the first line of the file.
crimes_df = spark.read.csv(
    "/content/Crime_Data_from_2020_to_Present_20240611.csv",
    sep=",",
    header=True,
)
crimes_df.show()





In [None]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType

# Obtain the "SparkSQL" session again; this is the same `spark` name that an
# earlier cell of this notebook already assigns.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .getOrCreate()
)

In [None]:
# Give the two columns used by the queries SQL-friendly names:
#   "AREA NAME"   -> "District"
#   "Crm Cd Desc" -> "Crimes"
df = (
    crimes_df
    .withColumnRenamed("AREA NAME", "District")
    .withColumnRenamed("Crm Cd Desc", "Crimes")
)



In [None]:
# Preview the renamed DataFrame to confirm the District / Crimes columns are present.
df.show()

In [None]:
# Register the DataFrame as the temporary view "crimesdata" so it can be queried with spark.sql
df.createOrReplaceTempView("crimesdata")

In [None]:
# Count crimes per district, ordered from the highest count to the lowest
df_sum = spark.sql(
    "SELECT District, count(*) as crimecount FROM crimesdata GROUP BY District Order by crimecount desc"
)
df_sum.show()

In [None]:
# Print the column names and types of the renamed DataFrame.
# NOTE(review): the CSV is read without schema inference, so every column is
# presumably a string here — confirm before doing numeric or date operations.
df.printSchema()

In [None]:
# Count crimes per crime type, highest first, and time the whole step:
# the clock starts before the query is issued and stops after show() returns.
start_time = time.time()
df_sum_area = spark.sql(
    "SELECT  Crimes, count(*) as crimecount FROM crimesdata GROUP BY  Crimes Order by  crimecount desc"
)
df_sum_area.show()

# Report the elapsed wall-clock time in seconds
print("--- %s seconds ---" % (time.time() - start_time))