# Crime near Heritage Sites
There are approximately 2200 structures listed as  heritage sites within the Vancouver city limits. In this notebook, we attempt to take a close look at how much crime is committed in the vicinity of these sites.

In [4]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [5]:
#Create Spark Session and context
spark = SparkSession\
    .builder\
    .appName("example code")\
    .config("spark.driver.extraClassPath","/home/jim/spark-2.4.0-bin-hadoop2.7/jars/mysql-connector-java-5.1.49.jar")\
    .getOrCreate()
spark.sparkContext.setLogLevel('WARN')
sc = spark.sparkContext

In [7]:
heritage = spark.read.format("csv").option("header", "true").load("../Data/heritage_sites.csv")
heritage.show(10,truncate=False)
print('The Heritage Sites dataset has {} rows'.format(heritage.count()))

+---+---------------------+------------------+----------+------------------+----+-----------+--------------------+--------------------+----------+-------+----------------+-----------------+
|ID |Address              |BuildingName      |Street_Nbr|Street            |VHR |Designation|Zoning              |LocalArea           |Land_Coord|HRA_HCC|Latitude        |Longitude        |
+---+---------------------+------------------+----------+------------------+----+-----------+--------------------+--------------------+----------+-------+----------------+-----------------+
|1  |36 BLOOD ALLEY SQUARE|null              |36        |BLOOD ALLEY SQUARE|B   |M          |HA-2                |CENTRL BUSNES/DWNTWN|58917265  |null   |49.2828738098464|-123.105225398754|
|2  |2723 WOODLAND DRIVE  |null              |2723      |WOODLAND DRIVE    |B   |null       |RT-5                |KENSINGTON-CEDAR COT|23466523  |null   |49.2600769493209|-123.074046917561|
|3  |2080 W 36TH AV       |null              |2080

In [8]:
heritage.columns

['ID',
 'Address',
 'BuildingName',
 'Street_Nbr',
 'Street',
 'VHR',
 'Designation',
 'Zoning',
 'LocalArea',
 'Land_Coord',
 'HRA_HCC',
 'Latitude',
 'Longitude']

#### We shall clean the dataset a little now

In [14]:
#Retain only necessary columns
heritage = heritage.select('ID','BuildingName','LocalArea','Latitude','Longitude')
#Rewrite null values in BuildingName
heritage = heritage.na.fill("Name Unknown", 'BuildingName')
heritage = heritage.withColumn("BuildingName", trim(heritage.BuildingName))
heritage = heritage.withColumn("LocalArea", trim(heritage.LocalArea))
heritage.show(10,truncate=False)

+---+---------------+--------------------+----------------+-----------------+
|ID |BuildingName   |LocalArea           |Latitude        |Longitude        |
+---+---------------+--------------------+----------------+-----------------+
|1  |Name Unknown   |CENTRL BUSNES/DWNTWN|49.2828738098464|-123.105225398754|
|2  |Name Unknown   |KENSINGTON-CEDAR COT|49.2600769493209|-123.074046917561|
|3  |Name Unknown   |SHAUGHNESSY         |49.2389033094729|-123.154237368533|
|4  |Name Unknown   |MOUNT PLEASANT      |49.2621356773046|-123.085603327158|
|5  |Name Unknown   |FAIRVIEW            |49.261948887911 |-123.136713198289|
|6  |Name Unknown   |KITSILANO           |49.2703621626003|-123.165116703005|
|7  |Name Unknown   |WEST POINT GREY     |49.2606925359374|-123.200353485935|
|8  |INFILL DWELLING|STRATHCONA          |49.2786667720091|-123.090587526348|
|9  |Name Unknown   |CENTRL BUSNES/DWNTWN|49.2822373252557|-123.104605551032|
|10 |Name Unknown   |CENTRL BUSNES/DWNTWN|49.2821072095572|-123.

## Now we will load the dataset of crimes that is our main source of crime data

In [31]:
crime_df = spark.read.format("csv").option("header", "true").load("..//Data/crime/crime_all_years_latlong.csv")
#Drop unrequired columns
crime_df = crime_df.select(['TYPE','NEIGHBOURHOOD','LATITUDE','LONGITUDE'])
crime_df = crime_df.dropna(how='any')
crime_df.show(10,truncate=True)
print("Crime Dataset has {} rows".format(crime_df.count()))

+--------------------+--------------------+------------------+-------------------+
|                TYPE|       NEIGHBOURHOOD|          LATITUDE|          LONGITUDE|
+--------------------+--------------------+------------------+-------------------+
|            Mischief|              Sunset| 49.22285547453633|-123.10457767461014|
|    Theft of Vehicle| Victoria-Fraserview| 49.21942208176436|-123.05928356709362|
|Break and Enter C...|Central Business ...|49.280454355702865|-123.10100566349294|
|            Mischief|            West End| 49.29261448054877|-123.13962081805273|
|            Mischief|            West End| 49.29260865723727|-123.13945233120421|
|            Mischief|    Hastings-Sunrise|49.281126361961825| -123.0554729922974|
|  Theft from Vehicle|      Mount Pleasant|49.263002922167225|-123.10655743565438|
|            Mischief|    Hastings-Sunrise| 49.28112610578195|-123.05525671257254|
|  Theft from Vehicle|           Kitsilano| 49.25958751890934| -123.1707943860336|
|  T

### We must now get this dataset into a proper format so as it can be meaningfully joined to the Heritae Site data
Upon merging the Hundred_BLOCK and Neighbourhood values as a common column, we can join it on the streetlight dataset (after using the same transformation on it) to sufficiently narrow down street nights in each 10-block radius and associate crime in the area with it

In [32]:
crime_df = crime_df.select(['TYPE','NEIGHBOURHOOD'])
#We convert this column to Proper Case so it can be joined properly
crime_df = crime_df.withColumn("NEIGHBOURHOOD",upper(col("NEIGHBOURHOOD")))
crime_df = crime_df.groupBy('NEIGHBOURHOOD').count().withColumnRenamed('count', 'CRIME_COUNT')
crime_df.show(10)

+----------------+-----------+
|   NEIGHBOURHOOD|CRIME_COUNT|
+----------------+-----------+
|        MUSQUEAM|        560|
|       KITSILANO|      29038|
|        WEST END|      45268|
| WEST POINT GREY|       6365|
|    SOUTH CAMBIE|       5626|
|      KERRISDALE|       8026|
|        FAIRVIEW|      34654|
|HASTINGS-SUNRISE|      19838|
|          SUNSET|      18628|
|     SHAUGHNESSY|       5993|
+----------------+-----------+
only showing top 10 rows



### Now we join the datasets

In [33]:
#Create Temp tables in SPark.sql
heritage.createOrReplaceTempView("DF1")
crime_df.createOrReplaceTempView("DF2")

#SQL JOIN
joined_df = spark.sql("""SELECT DF1.*,
                      DF2.CRIME_COUNT  
                      FROM DF1 LEFT JOIN DF2 ON DF1.LocalArea = DF2.NEIGHBOURHOOD""")
joined_df.show(15,truncate=True)
print("The new Dataset has {} rows".format(joined_df.count()))

+---+---------------+--------------------+----------------+-----------------+-----------+
| ID|   BuildingName|           LocalArea|        Latitude|        Longitude|CRIME_COUNT|
+---+---------------+--------------------+----------------+-----------------+-----------+
|  1|   Name Unknown|CENTRL BUSNES/DWNTWN|49.2828738098464|-123.105225398754|       null|
|  2|   Name Unknown|KENSINGTON-CEDAR COT|49.2600769493209|-123.074046917561|       null|
|  3|   Name Unknown|         SHAUGHNESSY|49.2389033094729|-123.154237368533|       5993|
|  4|   Name Unknown|      MOUNT PLEASANT|49.2621356773046|-123.085603327158|      33786|
|  5|   Name Unknown|            FAIRVIEW| 49.261948887911|-123.136713198289|      34654|
|  6|   Name Unknown|           KITSILANO|49.2703621626003|-123.165116703005|      29038|
|  7|   Name Unknown|     WEST POINT GREY|49.2606925359374|-123.200353485935|       6365|
|  8|INFILL DWELLING|          STRATHCONA|49.2786667720091|-123.090587526348|      23566|
|  9|   Na

In [34]:
joined_df.repartition(1).write.format("com.databricks.spark.csv").option("header", "true").save("Heritage.csv")

## Here is the Tableau visualization:
The map shows crime density near heritage sites. The deeply red areas are those that house a lot of crime near heritage sites. The other markers represents heritage sites and highlights the severity of crime near them. Click on a marker to view the count of crime in the vicinity and the name of the heritage site.  The Tableau public dashboard can be found at <a href="https://public.tableau.com/views/HeritageSites_Crime/Dashboard1?:language=en&:display_count=y&publish=yes&:origin=viz_share_link">https://public.tableau.com/views/HeritageSites_Crime/Dashboard1?:language=en&:display_count=y&publish=yes&:origin=viz_share_link
</a><br>
<img src="../Visualisation/Raw/Heritage.png">
