<a href="https://colab.research.google.com/github/jugalpanchal/bd-chef/blob/main/spark_etl_crime_ansys_recipe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Spark Session

In [1]:
# Follow the steps to install the dependencies:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # install java
!wget -q https://downloads.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz # spark package download
!tar xf spark-3.1.2-bin-hadoop3.2.tgz # unzip spark package
!pip install -q findspark # install spark

# Set the location of Java and Spark:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Build a local session (local[*] = one worker thread per available core);
# getOrCreate() returns the already-running session if there is one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark_App1")
    .getOrCreate()
)

# Handle to the underlying SparkContext of the session.
sc = spark.sparkContext

### Read a file

In [3]:
# Load the London crime CSV; the "header" option makes the first line of the
# file supply the column names.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("datasets/london_crime_by_lsoa.csv")
)

data.count()  # number of rows in the DataFrame

13490604

In [4]:
# Shorthand alternative to the reader chain used when loading the file;
# inferSchema=True additionally asks Spark to detect column types instead of
# leaving every column as a string:
#data = spark.read.csv('datasets/london_crime_by_lsoa.csv', inferSchema=True, header=True)
data.printSchema() # column names, types and nullability
data.show() # no row count given, so the default number of rows is printed
data.limit(5).show() # first 5 rows

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)

+---------+--------------------+--------------------+--------------------+-----+----+-----+
|lsoa_code|             borough|      major_category|      minor_category|value|year|month|
+---------+--------------------+--------------------+--------------------+-----+----+-----+
|E01001116|             Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646|           Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|             Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774|           Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|          Wandsworth|             Robbery|   Personal Pro

### Drop NA, Drop Column, Select, Distinct, Filter (==, isin, >=), Sample by Fraction

In [5]:
# Drop every record (row) that contains a null value; `data` is rebound to the
# cleaned DataFrame, so all following cells work on the cleaned rows.
data = data.dropna()
data.count() # row count after removing rows with nulls

13490604

In [6]:
# DataFrames are immutable: drop() returns a NEW DataFrame and leaves `data`
# untouched, so the result must be captured to have any effect. It is stored
# under its own name so that `data` keeps the lsoa_code column for the cells
# that still display it.
data_without_lsoa = data.drop("lsoa_code")

# Unique borough names present in the data set.
total_borough = data.select("borough")\
    .distinct()

total_borough.show()

+--------------------+
|             borough|
+--------------------+
|             Croydon|
|          Wandsworth|
|              Bexley|
|             Lambeth|
|Barking and Dagenham|
|              Camden|
|           Greenwich|
|              Newham|
|       Tower Hamlets|
|            Hounslow|
|              Barnet|
|              Harrow|
|Kensington and Ch...|
|           Islington|
|               Brent|
|            Haringey|
|             Bromley|
|              Merton|
|         Westminster|
|             Hackney|
+--------------------+
only showing top 20 rows



In [7]:
# Keep only the records whose borough is Hackney.
hackney_data = data.filter(data.borough == 'Hackney')
hackney_data.show(n=5)  # first 5 rows of the filtered result

+---------+-------+--------------------+--------------------+-----+----+-----+
|lsoa_code|borough|      major_category|      minor_category|value|year|month|
+---------+-------+--------------------+--------------------+-----+----+-----+
|E01001786|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|E01001794|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|E01001787|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|E01001738|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|E01001807|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
+---------+-------+--------------------+--------------------+-----+----+-----+
only showing top 5 rows



In [8]:
# Records for the years 2015 and 2016 (year was read as a string column,
# hence the string literals).
data_2015_2016 = data.filter(data['year'].isin(['2015', '2016']))
#data_2015_2016.count() # 2997912
# sample() keeps each row with the given probability, so the result holds only
# APPROXIMATELY fraction * row-count rows (e.g. 0.2 -> roughly 20% of the source).
# A fixed seed makes the drawn sample repeatable when the notebook is re-run
# on the same input.
data_2015_2016_fra_per = data_2015_2016.sample(fraction=0.000002, seed=42)
data_2015_2016_fra_per.show()

+---------+---------+--------------------+--------------------+-----+----+-----+
|lsoa_code|  borough|      major_category|      minor_category|value|year|month|
+---------+---------+--------------------+--------------------+-----+----+-----+
|E01003981|Southwark|     Criminal Damage|Criminal Damage T...|    0|2015|    8|
|E01004016|Southwark|    Fraud or Forgery|  Counted per Victim|    0|2016|    1|
|E01002696|Islington|Other Notifiable ...|    Other Notifiable|    1|2015|    9|
|E01002020| Haringey|             Robbery|   Business Property|    0|2016|    9|
|E01032834|Southwark|  Theft and Handling|    Theft From Shops|    0|2015|    4|
|E01003728|Redbridge|Violence Against ...|      Other violence|    1|2016|    1|
+---------+---------+--------------------+--------------------+-----+----+-----+



In [9]:
# Records from 2016 onwards.
data_data_onwards = data.filter(data['year'] >= 2016)
# Roughly 10% of those rows; the fixed seed keeps the sample repeatable when
# the notebook is re-run on the same input.
data_data_onwards.sample(fraction=0.1, seed=42).show()

+---------+--------------------+--------------------+--------------------+-----+----+-----+
|lsoa_code|             borough|      major_category|      minor_category|value|year|month|
+---------+--------------------+--------------------+--------------------+-----+----+-----+
|E01001600|           Greenwich|            Burglary|Burglary in Other...|    0|2016|   11|
|E01004093|              Sutton|Violence Against ...|        Wounding/GBH|    0|2016|    7|
|E01001806|             Hackney|             Robbery|   Business Property|    0|2016|    7|
|E01003128|             Lambeth|     Criminal Damage|Criminal Damage T...|    0|2016|    7|
|E01004476|          Wandsworth|     Criminal Damage|Criminal Damage T...|    0|2016|   12|
|E01001210|              Ealing|  Theft and Handling|Theft/Taking Of M...|    0|2016|    1|
|E01002365|            Havering|Violence Against ...|          Harassment|    1|2016|    8|
|E01002553|          Hillingdon|            Burglary|Burglary in Other...|    0|

### Aggregation - groupBy, agg, orderBy

In [10]:
# Number of records per borough.
borough_and_count = (
    data
    .groupBy('borough')
    .count()
)

borough_and_count.show()

+--------------------+------+
|             borough| count|
+--------------------+------+
|             Croydon|602100|
|          Wandsworth|498636|
|              Bexley|385668|
|             Lambeth|519048|
|Barking and Dagenham|311040|
|              Camden|378432|
|           Greenwich|421200|
|              Newham|471420|
|       Tower Hamlets|412128|
|            Hounslow|395928|
|              Barnet|572832|
|              Harrow|365688|
|Kensington and Ch...|296784|
|           Islington|359208|
|               Brent|490644|
|            Haringey|413856|
|             Bromley|523908|
|              Merton|339876|
|         Westminster|366660|
|             Hackney|417744|
+--------------------+------+
only showing top 20 rows



In [11]:
# agg() takes a {column: aggregate} mapping: "value" is the DataFrame column
# and "sum" is the aggregate applied to it. The generated column name
# "sum(value)" is then renamed to something readable.
# For a quick trial run, sample first: data = data.sample(fraction = 0.001)
borought_conviction_sum = (
    data
    .groupBy('borough')
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "convictions")
)

borought_conviction_sum.show()

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|   260294.0|
|          Wandsworth|   204741.0|
|              Bexley|   114136.0|
|             Lambeth|   292178.0|
|Barking and Dagenham|   149447.0|
|              Camden|   275147.0|
|           Greenwich|   181568.0|
|              Newham|   262024.0|
|       Tower Hamlets|   228613.0|
|            Hounslow|   186772.0|
|              Barnet|   212191.0|
|              Harrow|   116848.0|
|Kensington and Ch...|   171981.0|
|           Islington|   230286.0|
|               Brent|   227551.0|
|            Haringey|   213272.0|
|             Bromley|   184349.0|
|              Merton|   115654.0|
|         Westminster|   455028.0|
|             Hackney|   217119.0|
+--------------------+-----------+
only showing top 20 rows



In [12]:
# Add the per-borough totals up into a single overall figure
# (a DataFrame with one row and one column, "sum(convictions)").
borought_conviction_sum_total = borought_conviction_sum.agg({"convictions":"sum"})
borought_conviction_sum_total.show()
# collect() returns the result rows as a Python list; [0][0] picks the first
# row's first column, i.e. the plain number.
total_convictions = borought_conviction_sum_total.collect()[0][0] # row 0, column 0

+----------------+
|sum(convictions)|
+----------------+
|       6447758.0|
+----------------+



In [14]:
import pyspark.sql.functions as func
from pyspark.sql.functions import lit # literal

# Add each borough's share of the overall total as a percentage,
# rounded to two decimal places.
borought_conviction_per = borought_conviction_sum.withColumn(
    "contribution percentage(%)",
    func.round(borought_conviction_sum["convictions"] / total_convictions * 100, 2),
)
borought_conviction_per.show(5)

+--------------------+-----------+--------------------------+
|             borough|convictions|contribution percentage(%)|
+--------------------+-----------+--------------------------+
|             Croydon|   260294.0|                      4.04|
|          Wandsworth|   204741.0|                      3.18|
|              Bexley|   114136.0|                      1.77|
|             Lambeth|   292178.0|                      4.53|
|Barking and Dagenham|   149447.0|                      2.32|
+--------------------+-----------+--------------------------+
only showing top 5 rows



In [16]:
# Rank the boroughs from the largest to the smallest share of the total.
# The percentage column (position 2) is referenced by name.
borought_conviction_per.orderBy(
    borought_conviction_per["contribution percentage(%)"].desc()
).show()

+--------------------+-----------+--------------------------+
|             borough|convictions|contribution percentage(%)|
+--------------------+-----------+--------------------------+
|         Westminster|   455028.0|                      7.06|
|             Lambeth|   292178.0|                      4.53|
|           Southwark|   278809.0|                      4.32|
|              Camden|   275147.0|                      4.27|
|              Newham|   262024.0|                      4.06|
|             Croydon|   260294.0|                      4.04|
|              Ealing|   251562.0|                       3.9|
|           Islington|   230286.0|                      3.57|
|       Tower Hamlets|   228613.0|                      3.55|
|               Brent|   227551.0|                      3.53|
|             Hackney|   217119.0|                      3.37|
|            Lewisham|   215137.0|                      3.34|
|            Haringey|   213272.0|                      3.31|
|       

In [20]:
# Monthly totals of "value" for the year 2014.
conviction_monthly = (
    data
    .filter(data['year'] == 2014)
    .groupBy('month')
    .agg({"value": "sum"})
    .withColumnRenamed("sum(value)", "convictions")
)

conviction_monthly.show()

+-----+-----------+
|month|convictions|
+-----+-----------+
|    7|    58564.0|
|   11|    59704.0|
|    3|    57669.0|
|    8|    55641.0|
|    5|    56327.0|
|    6|    57039.0|
|    9|    56933.0|
|    1|    55515.0|
|   10|    60537.0|
|    4|    53467.0|
|   12|    57565.0|
|    2|    51222.0|
+-----+-----------+

