In [0]:
## Load Clean Table 
df = spark.table("crime_final_cluster.default.clean_crime")
df.show(5)
df.printSchema()


+---------+-------------+--------+--------+----+---------+-----------+--------+------+--------------------+-------------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--------+--------+--------+--------+--------------------+------------+-------+---------+--------+--------+---------+---------------+-----------+
|    DR_NO|date_reported|date_occ|time_occ|AREA|area_name|rpt_dist_no|part_1_2|crm_cd|         crm_cd_desc|            Mocodes|vict_age|vict_sex|vict_descent|premis_cd|         premis_desc|weapon_used_cd|weapon_desc|Status| status_desc|crm_cd_1|crm_cd_2|crm_cd_3|crm_cd_4|            LOCATION|cross_street|    LAT|      LON|occ_hour|occ_year|occ_month|occ_day_of_week|  age_group|
+---------+-------------+--------+--------+----+---------+-----------+--------+------+--------------------+-------------------+--------+--------+------------+---------+--------------------+--------------+-----------+------+------------+--

In [0]:
## Crime per Year
from pyspark.sql.functions import count

df_by_year = (
    df.groupBy("occ_year")
      .agg(count("*").alias("total_crimes"))
      .orderBy("occ_year")
)

df_by_year.show()


+--------+------------+
|occ_year|total_crimes|
+--------+------------+
|    NULL|      955339|
+--------+------------+



In [0]:
## Top 10 Crime Types
df_top_crimes = (
    df.groupBy("crm_cd_desc")
      .agg(count("*").alias("crime_count"))
      .orderBy("crime_count", ascending=False)
      .limit(10)
)

df_top_crimes.show()


+--------------------+-----------+
|         crm_cd_desc|crime_count|
+--------------------+-----------+
|    VEHICLE - STOLEN|     104632|
|BATTERY - SIMPLE ...|      74583|
|BURGLARY FROM VEH...|      59155|
|   THEFT OF IDENTITY|      59054|
|VANDALISM - FELON...|      57929|
|            BURGLARY|      57573|
|ASSAULT WITH DEAD...|      53271|
|THEFT PLAIN - PET...|      49065|
|INTIMATE PARTNER ...|      46657|
|THEFT FROM MOTOR ...|      37525|
+--------------------+-----------+



In [0]:
## Crime by Area_Name
df_by_area = (
    df.groupBy("area_name")
      .agg(count("*").alias("crime_count"))
      .orderBy("crime_count", ascending=False)
)

df_by_area.show()


+-----------+-----------+
|  area_name|crime_count|
+-----------+-----------+
|    Central|      64737|
|77th Street|      59719|
|    Pacific|      55792|
|  Southwest|      54129|
|  Hollywood|      50035|
|N Hollywood|      48556|
|  Southeast|      48127|
|    Olympic|      47840|
|     Newton|      47125|
|   Wilshire|      45734|
|    Rampart|      44440|
|    West LA|      43509|
|   Van Nuys|      40830|
|  Northeast|      40661|
|West Valley|      40400|
| Devonshire|      39603|
|     Harbor|      39485|
|    Topanga|      39403|
|    Mission|      38244|
| Hollenbeck|      35156|
+-----------+-----------+
only showing top 20 rows


In [0]:
## Crimes by Age Group
df_by_age_group = (
    df.groupBy("age_group")
      .agg(count("*").alias("crime_count"))
      .orderBy("age_group")
)

df_by_age_group.show()


+-----------+-----------+
|  age_group|crime_count|
+-----------+-----------+
|      Adult|     312982|
|      Child|     270839|
|Middle-aged|     152997|
|     Senior|      30714|
|    Unknown|        124|
|Young Adult|     187683|
+-----------+-----------+



In [0]:
%sql
SELECT
  occ_year,
  COUNT(*) AS total_crimes
FROM crime_final_cluster.default.clean_crime
GROUP BY occ_year
ORDER BY occ_year

occ_year,total_crimes
,955339


In [0]:
%sql
SELECT
  area_name,
  COUNT(*) AS total_crimes
FROM crime_final_cluster.default.clean_crime
GROUP BY area_name
ORDER BY total_crimes DESC

area_name,total_crimes
Central,64737
77th Street,59719
Pacific,55792
Southwest,54129
Hollywood,50035
N Hollywood,48556
Southeast,48127
Olympic,47840
Newton,47125
Wilshire,45734
