In [4]:
from pyspark.sql import SparkSession # Import Spark Session

In [5]:
spark = SparkSession.builder\
                    .appName("Analyzing London Crime Data")\
                    .getOrCreate()
# Get Spark Session

In [6]:
data = spark.read\
            .format("csv")\
            .option("header","true")\
            .load("../datasets/london_crime_by_lsoa.csv")

# Load .csv file from the local in DataFrame named as 'Data'

In [7]:
data.count() # Total number of record present in loaded .csv file

13490604

In [14]:
spark

In [15]:
data.printSchema() # This method tells about schema of DataFrame

root
 |-- lsoa_code: string (nullable = true)
 |-- borough: string (nullable = true)
 |-- major_category: string (nullable = true)
 |-- minor_category: string (nullable = true)
 |-- value: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)



In [16]:
data.limit(10).show() # It will show the 'n' number of records given in limit(n) method

+---------+----------+--------------------+--------------------+-----+----+-----+
|lsoa_code|   borough|      major_category|      minor_category|value|year|month|
+---------+----------+--------------------+--------------------+-----+----+-----+
|E01001116|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|E01001646| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|E01000677|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|E01003774| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|E01004563|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
|E01001320|    Ealing|  Theft and Handling|         Other Theft|    0|2012|    5|
|E01001342|    Ealing|Violence Against ...|    Offensive Weapon|    0|2010|    7|
|E01002633|  Hounslow|             Robbery|   Personal Property|    0|2013|    4|
|E01003496|    Newham|     Criminal Damage|Criminal Damage T...|    0|2013|    9|
|E01004177|    S

In [17]:
data.dropna()

DataFrame[lsoa_code: string, borough: string, major_category: string, minor_category: string, value: string, year: string, month: string]

In [8]:
data=data.drop('lsoa_code') #it helps to drop the coulmn from dataFrame, we can drop data if it is not useful

In [9]:
data.limit(10).show()

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
|    Ealing|  Theft and Handling|         Other Theft|    0|2012|    5|
|    Ealing|Violence Against ...|    Offensive Weapon|    0|2010|    7|
|  Hounslow|             Robbery|   Personal Property|    0|2013|    4|
|    Newham|     Criminal Damage|Criminal Damage T...|    0|2013|    9|
|    Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
+----------+--------------------+--------------------+-----+----

In [26]:
total_boroughs = data.select('borough').distinct() #it returns distinct value of column borough
total_boroughs.show()

+--------------------+
|             borough|
+--------------------+
|             Croydon|
|          Wandsworth|
|              Bexley|
|             Lambeth|
|Barking and Dagenham|
|              Camden|
|           Greenwich|
|              Newham|
|       Tower Hamlets|
|            Hounslow|
|              Barnet|
|              Harrow|
|Kensington and Ch...|
|           Islington|
|               Brent|
|            Haringey|
|             Bromley|
|              Merton|
|         Westminster|
|             Hackney|
+--------------------+
only showing top 20 rows



In [31]:
hackney_borough = data.filter(data.borough=='Hackney')
hackney_borough.show()

+-------+--------------------+--------------------+-----+----+-----+
|borough|      major_category|      minor_category|value|year|month|
+-------+--------------------+--------------------+-----+----+-----+
|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
|Hackney|            Burglary|Burglary in a Dwe...|    2|2008|    5|
|Hackney|             Robbery|   Business Property|    0|2016|    7|
|Hackney|  Theft and Handling|Theft/Taking of P...|    0|2009|   12|
|Hackney|               Drugs|    Drug Trafficking|    0|2014|    4|
|Hackney|  Theft and Handling|Handling Stolen G...|    0|2014|    6|
|Hackney|            Burglary|Burglary in Other...|    0|2008|   12|
|Hackney|Violence Against ...| Ass

In [29]:
hackney_borough.count()

417744

In [35]:
hackney_borough_another_way = data.filter(data['borough']=='Hackney')
hackney_borough_another_way.limit(5).show()

+-------+--------------------+--------------------+-----+----+-----+
|borough|      major_category|      minor_category|value|year|month|
+-------+--------------------+--------------------+-----+----+-----+
|Hackney|     Criminal Damage|Criminal Damage T...|    0|2011|    6|
|Hackney|Violence Against ...|          Harassment|    1|2013|    2|
|Hackney|     Criminal Damage|Other Criminal Da...|    0|2011|    7|
|Hackney|Violence Against ...|        Wounding/GBH|    0|2013|   12|
|Hackney|  Theft and Handling|  Other Theft Person|    0|2016|    8|
+-------+--------------------+--------------------+-----+----+-----+



In [32]:
hackney_borough_another_way.count()

417744

In [38]:
year_15_16 = data.filter(data['year'].isin(['2015','2016'])) #data.filter(data.year.isin(['2015','2016']))
year_15_16.show()

+--------------------+--------------------+--------------------+-----+----+-----+
|             borough|      major_category|      minor_category|value|year|month|
+--------------------+--------------------+--------------------+-----+----+-----+
|             Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
|           Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|             Bromley|Violence Against ...|      Other violence|    0|2015|    5|
|           Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|              Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
|             Lambeth|Violence Against ...|      Other violence|    0|2015|    4|
|          Hillingdon|  Theft and Handling|Theft/Taking Of M...|    0|2016|    2|
|Kingston upon Thames|  Theft and Handling|    Theft From Shops|    0|2016|   11|
|            Haringey|Violence Against ...|        Wounding/GBH|    0|2015|   12|
|            Lew

In [40]:
year_15_16.sample(fraction=0.1).show() # fraction gives fraction part of your whole datasets

+--------------------+--------------------+--------------------+-----+----+-----+
|             borough|      major_category|      minor_category|value|year|month|
+--------------------+--------------------+--------------------+-----+----+-----+
|              Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
|           Southwark|               Drugs| Possession Of Drugs|    0|2015|    3|
|           Southwark|  Theft and Handling|    Theft From Shops|    4|2016|    8|
|             Croydon|             Robbery|   Personal Property|    0|2016|    1|
|Kensington and Ch...|Other Notifiable ...|    Other Notifiable|    0|2015|    5|
|             Lambeth|Violence Against ...|    Offensive Weapon|    0|2016|    4|
|          Hillingdon|Violence Against ...|      Other violence|    0|2016|   11|
|              Sutton|             Robbery|   Business Property|    0|2015|   10|
|             Lambeth|               Drugs|    Drug Trafficking|    0|2015|    9|
|              B

In [41]:
data_2014_onwards = data.filter(data['year']>='2014')
data_2014_onwards.sample(fraction=0.1).show()

+--------------------+--------------------+--------------------+-----+----+-----+
|             borough|      major_category|      minor_category|value|year|month|
+--------------------+--------------------+--------------------+-----+----+-----+
|Richmond upon Thames|             Robbery|   Personal Property|    0|2014|    1|
|            Haringey|Violence Against ...|        Wounding/GBH|    0|2015|   12|
|              Newham|     Criminal Damage|Criminal Damage T...|    0|2015|    1|
|Kensington and Ch...|Other Notifiable ...|    Other Notifiable|    0|2015|    5|
|      Waltham Forest|            Burglary|Burglary in Other...|    0|2015|    6|
|         Westminster|  Theft and Handling|Theft From Motor ...|    6|2016|    8|
|       Tower Hamlets|            Burglary|Burglary in a Dwe...|    0|2016|    3|
|Kingston upon Thames|               Drugs|    Drug Trafficking|    0|2014|    1|
|              Ealing|Other Notifiable ...|    Other Notifiable|    0|2016|   12|
|Hammersmith and

In [42]:
data_2014_onwards.count()

4496868

In [43]:
data_borough_count = data.groupBy('borough').count() # Total number of inspections
data_borough_count.show(10)

+--------------------+------+
|             borough| count|
+--------------------+------+
|             Croydon|602100|
|          Wandsworth|498636|
|              Bexley|385668|
|             Lambeth|519048|
|Barking and Dagenham|311040|
|              Camden|378432|
|           Greenwich|421200|
|              Newham|471420|
|       Tower Hamlets|412128|
|            Hounslow|395928|
+--------------------+------+
only showing top 10 rows



In [46]:
data_borough_sum_value = data.groupBy('borough').agg({'value':'sum'})
data_borough_sum_value.show()

+--------------------+----------+
|             borough|sum(value)|
+--------------------+----------+
|             Croydon|  260294.0|
|          Wandsworth|  204741.0|
|              Bexley|  114136.0|
|             Lambeth|  292178.0|
|Barking and Dagenham|  149447.0|
|              Camden|  275147.0|
|           Greenwich|  181568.0|
|              Newham|  262024.0|
|       Tower Hamlets|  228613.0|
|            Hounslow|  186772.0|
|              Barnet|  212191.0|
|              Harrow|  116848.0|
|Kensington and Ch...|  171981.0|
|           Islington|  230286.0|
|               Brent|  227551.0|
|            Haringey|  213272.0|
|             Bromley|  184349.0|
|              Merton|  115654.0|
|         Westminster|  455028.0|
|             Hackney|  217119.0|
+--------------------+----------+
only showing top 20 rows



In [52]:
data_borough_sum_value_rename = data.groupBy('borough')\
                                    .agg({'value':'sum'})\
                                    .withColumnRenamed('sum(value)','convictions')
data_borough_sum_value_rename.show()

+--------------------+-----------+
|             borough|convictions|
+--------------------+-----------+
|             Croydon|   260294.0|
|          Wandsworth|   204741.0|
|              Bexley|   114136.0|
|             Lambeth|   292178.0|
|Barking and Dagenham|   149447.0|
|              Camden|   275147.0|
|           Greenwich|   181568.0|
|              Newham|   262024.0|
|       Tower Hamlets|   228613.0|
|            Hounslow|   186772.0|
|              Barnet|   212191.0|
|              Harrow|   116848.0|
|Kensington and Ch...|   171981.0|
|           Islington|   230286.0|
|               Brent|   227551.0|
|            Haringey|   213272.0|
|             Bromley|   184349.0|
|              Merton|   115654.0|
|         Westminster|   455028.0|
|             Hackney|   217119.0|
+--------------------+-----------+
only showing top 20 rows



In [53]:
data_total_convistions = data_borough_sum_value_rename.agg({'convictions':'sum'})
data_total_convistions.show()

+----------------+
|sum(convictions)|
+----------------+
|       6447758.0|
+----------------+



In [56]:
data_total_convistions.collect()[0][0]

6447758.0

In [57]:
import pyspark.sql.functions as func

In [60]:
borough_percentage_contribution = data_borough_sum_value_rename\
.withColumn('% convictions',\
            func.round(data_borough_sum_value_rename.convictions/data_total_convistions.collect()[0][0]*100,2))
borough_percentage_contribution.printSchema()

root
 |-- borough: string (nullable = true)
 |-- convictions: double (nullable = true)
 |-- % convictions: double (nullable = true)



In [61]:
borough_percentage_contribution.show()

+--------------------+-----------+-------------+
|             borough|convictions|% convictions|
+--------------------+-----------+-------------+
|             Croydon|   260294.0|         4.04|
|          Wandsworth|   204741.0|         3.18|
|              Bexley|   114136.0|         1.77|
|             Lambeth|   292178.0|         4.53|
|Barking and Dagenham|   149447.0|         2.32|
|              Camden|   275147.0|         4.27|
|           Greenwich|   181568.0|         2.82|
|              Newham|   262024.0|         4.06|
|       Tower Hamlets|   228613.0|         3.55|
|            Hounslow|   186772.0|          2.9|
|              Barnet|   212191.0|         3.29|
|              Harrow|   116848.0|         1.81|
|Kensington and Ch...|   171981.0|         2.67|
|           Islington|   230286.0|         3.57|
|               Brent|   227551.0|         3.53|
|            Haringey|   213272.0|         3.31|
|             Bromley|   184349.0|         2.86|
|              Merto

In [64]:
borough_percentage_contribution.orderBy(borough_percentage_contribution[2].desc()).show()

+--------------------+-----------+-------------+
|             borough|convictions|% convictions|
+--------------------+-----------+-------------+
|         Westminster|   455028.0|         7.06|
|             Lambeth|   292178.0|         4.53|
|           Southwark|   278809.0|         4.32|
|              Camden|   275147.0|         4.27|
|              Newham|   262024.0|         4.06|
|             Croydon|   260294.0|         4.04|
|              Ealing|   251562.0|          3.9|
|           Islington|   230286.0|         3.57|
|       Tower Hamlets|   228613.0|         3.55|
|               Brent|   227551.0|         3.53|
|             Hackney|   217119.0|         3.37|
|            Lewisham|   215137.0|         3.34|
|            Haringey|   213272.0|         3.31|
|              Barnet|   212191.0|         3.29|
|          Hillingdon|   209680.0|         3.25|
|          Wandsworth|   204741.0|         3.18|
|      Waltham Forest|   203879.0|         3.16|
|             Enfiel

In [68]:
data_per_month_2014 = data.filter(data['year']==2014)\
                          .groupBy('month')\
                          .agg({'value':'sum'})\
                          .withColumnRenamed('sum(value)','convictions')
data_per_month_2014.show();

+-----+-----------+
|month|convictions|
+-----+-----------+
|    7|    58564.0|
|   11|    59704.0|
|    3|    57669.0|
|    8|    55641.0|
|    5|    56327.0|
|    6|    57039.0|
|    9|    56933.0|
|    1|    55515.0|
|   10|    60537.0|
|    4|    53467.0|
|   12|    57565.0|
|    2|    51222.0|
+-----+-----------+



In [10]:
data.limit(10).show()

+----------+--------------------+--------------------+-----+----+-----+
|   borough|      major_category|      minor_category|value|year|month|
+----------+--------------------+--------------------+-----+----+-----+
|   Croydon|            Burglary|Burglary in Other...|    0|2016|   11|
| Greenwich|Violence Against ...|      Other violence|    0|2016|   11|
|   Bromley|Violence Against ...|      Other violence|    0|2015|    5|
| Redbridge|            Burglary|Burglary in Other...|    0|2016|    3|
|Wandsworth|             Robbery|   Personal Property|    0|2008|    6|
|    Ealing|  Theft and Handling|         Other Theft|    0|2012|    5|
|    Ealing|Violence Against ...|    Offensive Weapon|    0|2010|    7|
|  Hounslow|             Robbery|   Personal Property|    0|2013|    4|
|    Newham|     Criminal Damage|Criminal Damage T...|    0|2013|    9|
|    Sutton|  Theft and Handling|Theft/Taking of P...|    1|2016|    8|
+----------+--------------------+--------------------+-----+----

In [16]:
data_major_category_convic = data.groupBy('major_category')\
                                 .agg({'value':'sum'})\
                                 .withColumnRenamed('sum(value)','convictions')
data_major_category_convic.show()
data_major_category_convic.orderBy(data_major_category_convic.convictions.desc()).show()

+--------------------+-----------+
|      major_category|convictions|
+--------------------+-----------+
|               Drugs|   470765.0|
|             Robbery|   258873.0|
|  Theft and Handling|  2661861.0|
|    Fraud or Forgery|     5325.0|
|Violence Against ...|  1558081.0|
|            Burglary|   754293.0|
|Other Notifiable ...|   106349.0|
|     Sexual Offences|     1273.0|
|     Criminal Damage|   630938.0|
+--------------------+-----------+

+--------------------+-----------+
|      major_category|convictions|
+--------------------+-----------+
|  Theft and Handling|  2661861.0|
|Violence Against ...|  1558081.0|
|            Burglary|   754293.0|
|     Criminal Damage|   630938.0|
|               Drugs|   470765.0|
|             Robbery|   258873.0|
|Other Notifiable ...|   106349.0|
|    Fraud or Forgery|     5325.0|
|     Sexual Offences|     1273.0|
+--------------------+-----------+



In [17]:
data_year = data.select('year')

In [18]:
data_year.count()

13490604

In [20]:
data_year.agg({'year':'min'}).show()

+---------+
|min(year)|
+---------+
|     2008|
+---------+



In [21]:
data_year.agg({'year':'max'}).show()

+---------+
|max(year)|
+---------+
|     2016|
+---------+



In [22]:
data_year.describe().show()

+-------+------------------+
|summary|              year|
+-------+------------------+
|  count|          13490604|
|   mean|            2012.0|
| stddev|2.5819889931674522|
|    min|              2008|
|    max|              2016|
+-------+------------------+

