##### Usecase:Get crimes data by type
###### Solutions are given for following problem statements
###### Problem statement-1:Get crimes data per day
###### Problem statement-2:Get crimes data per month
###### Problem statement-3:Get crimes data per year
###### Problem statement-4:Get TOP3 crime types based on number of incidents in RESIDENCE
###### Problem statement-5:Get yearly crimes data from 2001 to 2020 with count
###### Problem statement-6:Get yearly crime data of location BANK
###### Problem statement-7:Get yearly crimes data of DOMESTIC incidents for all years with location
###### Problem statement-8:Get yearly crimes data of crimetype BURGLARY happened in residence, park and apartments
###### Problem statement-9:Get yearly crimes data of THEFT incidents between 2015 and 2017
###### Problem statement-10:Get monthly crimes data of NARCOTICS incidents happened in AIRPORTS

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum,round,avg,rank, col,to_date,from_unixtime
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql.functions import unix_timestamp, lit
# Create (or reuse) a local Spark session with Hive support for this notebook.
spark = (
    SparkSession.builder
    .master('local')
    .appName('getCrimesDatabyType')
    .enableHiveSupport()
    .getOrCreate()
)
# Use 10 partitions for shuffles triggered by the groupBy/sort steps below.
spark.conf.set("spark.sql.shuffle.partitions", 10)

In [3]:
# Column names assigned positionally to the CSV columns through toDF().
crime_columns = [
    'ID', 'CaseNumber', 'Date', 'Block', 'IUCR', 'CrimeType', 'Description',
    'LocationDescription', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward',
    'CommunityArea', 'FBICode', 'XCoordinate', 'YCoordinate', 'Year',
    'UpdatedOn', 'Latitude', 'Longitude', 'Location',
]

# Read the raw crimes CSV (first row is the header, types are inferred)
# and rename every column to the names listed above.
crimesdatadf = (
    spark.read
    .csv("/FileStore/tables/crimes_data.csv", header=True, inferSchema=True, sep=",")
    .toDF(*crime_columns)
)
crimesdatadf.show(5)

In [4]:
# Print the column names and inferred types of the loaded crimes DataFrame.
crimesdatadf.printSchema()

In [5]:
# Parse the raw 'Date' string with the 'MM/dd/yyyy' pattern and reduce it to a date.
# NOTE(review): if 'Date' also carries a time-of-day part, this relies on the parser
# accepting trailing text after the pattern - confirm on the Spark version in use.
crime_date = to_date(from_unixtime(unix_timestamp('Date', 'MM/dd/yyyy'))).alias("CrimeDate")

# Keep only the columns used by the problem statements below.
crimesdataMap = crimesdatadf.select(
    'ID',
    'CaseNumber',
    crime_date,
    'CrimeType',
    'Description',
    'LocationDescription',
    'Arrest',
    'Domestic',
    'Year',
)
crimesdataMap.show(5, truncate=False)

###### Problem statement-1:Get crimes data per day

In [7]:
# Number of crimes per (day, crime type), ordered by day and then crime type.
CrimesDataPerday = (
    crimesdataMap
    .groupBy('CrimeDate', 'CrimeType')
    .agg(count('CrimeType').alias("NumberofCrimePerTypes"))
    .sort('CrimeDate', 'CrimeType')
)

# Write the result as a single CSV part file with a header row.
# Fix: the original referenced `CrimesDataPerDay` (capital D), which is never
# defined and raised NameError; the DataFrame assigned above is `CrimesDataPerday`.
# NOTE(review): no save mode is given, so the writer's default applies and a
# re-run may fail if the target path already exists - confirm desired behaviour.
CrimesDataPerday.coalesce(1).write.csv("/FileStore/tables/CrimesDataPerDay", compression="none", header='true')
CrimesDataPerday.show(20, False)

###### Problem statement-2:Get crimes data per month

In [9]:
# Month key: the first 7 characters of CrimeDate (presumably 'yyyy-MM' once the
# date is rendered as a string).
month_key = substring('CrimeDate', 0, 7).alias("CrimeMonth")

# Number of crimes per (month, crime type), ordered by month and then crime type.
crimesDataPerMonth = (
    crimesdataMap
    .groupBy(month_key, 'CrimeType')
    .agg(count('CrimeType').alias("NumberofCrimePerTypes"))
    .sort('CrimeMonth', 'CrimeType')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataPerMonth.coalesce(1).write.csv("/FileStore/tables/crimesDataPerMonth", compression="none", header='true')
crimesDataPerMonth.show(20, truncate=False)

###### Problem statement-3:Get crimes data per year

In [11]:
# Year key: the first 4 characters of CrimeDate (presumably 'yyyy' once the
# date is rendered as a string).
year_key = substring('CrimeDate', 0, 4).alias("CrimeYear")

# Number of crimes per (year, crime type), ordered by year and then crime type.
crimesDataPerYear = (
    crimesdataMap
    .groupBy(year_key, 'CrimeType')
    .agg(count('CrimeType').alias("NumberofCrimePerTypes"))
    .sort('CrimeYear', 'CrimeType')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataPerYear.coalesce(1).write.csv("/FileStore/tables/crimesDataPerYear", compression="none", header='true')
crimesDataPerYear.show(20, truncate=False)

###### Problem statement-4:Get TOP3 crime types based on number of incidents in RESIDENCE

In [13]:
# Count incidents per (crime type, location), keep only the RESIDENCE location,
# and take the three crime types with the highest incident counts.
top3crimetypes = (
    crimesdataMap
    .groupBy('CrimeType', 'LocationDescription')
    .agg(count('CrimeType').alias("NumberofIncidents"))
    .filter(col('LocationDescription') == "RESIDENCE")
    .orderBy(col('NumberofIncidents').desc())
    .limit(3)
)

# Persist as a single uncompressed CSV part file with a header, then preview.
top3crimetypes.coalesce(1).write.csv("/FileStore/tables/top3crimetypes", compression="none", header='true')
top3crimetypes.show(5, truncate=False)

###### Problem statement-5:Get yearly crimes data from 2001 to 2020 with count

In [15]:
# Total number of crimes per year (year = first 4 characters of CrimeDate),
# ordered by year. No explicit year-range filter is applied here.
crimes_by_year = crimesdataMap.groupBy(substring('CrimeDate', 0, 4).alias("CrimeYear"))
NoofcrimesPerYear = (
    crimes_by_year
    .agg(count('CrimeType').alias("NumberofCrimePeryear"))
    .sort('CrimeYear')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
NoofcrimesPerYear.coalesce(1).write.csv("/FileStore/tables/NoofcrimesPerYear", compression="none", header='true')
NoofcrimesPerYear.show(25, truncate=False)

###### Problem statement-6:Get yearly crime data of location BANK

In [17]:
# Yearly crime counts per crime type, restricted to the exact location "BANK".
crimesDataofbanks = (
    crimesdataMap
    .groupBy(substring('CrimeDate', 0, 4).alias("CrimeYear"), 'CrimeType', 'LocationDescription')
    .agg(count('CrimeType').alias("NumberofCrimePeryear"))
    .filter(col('LocationDescription') == "BANK")
    .sort('CrimeYear', 'CrimeType')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataofbanks.coalesce(1).write.csv("/FileStore/tables/crimesDataofbanks", compression="none", header='true')
crimesDataofbanks.show(30, truncate=False)

###### Problem statement-7:Get yearly crimes data of DOMESTIC incidents for all years with location

In [19]:
# Yearly counts of domestic incidents per crime type and location.
# The Domestic flag is compared against the literal "TRUE", as in the original.
crimesDataofdomestic = (
    crimesdataMap
    .groupBy(
        substring('CrimeDate', 0, 4).alias("CrimeYear"),
        'CrimeType',
        'LocationDescription',
        'Domestic',
    )
    .agg(count('CrimeType').alias("NumberofCrimes"))
    .filter(col('Domestic') == "TRUE")
    .sort('CrimeYear', 'CrimeType')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataofdomestic.coalesce(1).write.csv("/FileStore/tables/crimesDataofdomestic", compression="none", header='true')
crimesDataofdomestic.show(20, truncate=False)

###### Problem statement-8:Get yearly crimes data of crimetype BURGLARY happened in residence, park and apartments

In [21]:
# Row predicates: crime type is exactly BURGLARY, and the location text
# contains any of RESIDENCE, PARK or APARTMENT (substring match).
is_burglary = col('CrimeType') == "BURGLARY"
in_target_location = (
    col('LocationDescription').contains("RESIDENCE")
    | col('LocationDescription').contains("PARK")
    | col('LocationDescription').contains("APARTMENT")
)

# Yearly burglary counts per matching location, ordered by year and location.
crimesDataofburglary = (
    crimesdataMap
    .groupBy(substring('CrimeDate', 0, 4).alias("CrimeYear"), 'CrimeType', 'LocationDescription')
    .agg(count('CrimeType').alias("NumberofCrimes"))
    .filter(is_burglary & in_target_location)
    .sort('CrimeYear', 'LocationDescription')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataofburglary.coalesce(1).write.csv("/FileStore/tables/crimesDataofburglary", compression="none", header='true')
crimesDataofburglary.show(30, truncate=False)

###### Problem statement-9:Get yearly crimes data of THEFT incidents between 2015 and 2017

In [23]:
# Yearly THEFT counts per location for the years 2015 through 2017 (inclusive).
# Fix: the original combined the two year bounds with `|`, which is true for
# every non-null year and so never restricted the range; both bounds must hold,
# hence `&`.
crimesDataoftheft = (
    crimesdataMap
    .groupBy('Year', 'CrimeType', 'LocationDescription')
    .agg(count('CrimeType').alias("NumberofCrimes"))
    .where(
        (crimesdataMap.CrimeType == "THEFT")
        & ((crimesdataMap.Year >= 2015) & (crimesdataMap.Year <= 2017))
    )
    .sort('Year')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataoftheft.coalesce(1).write.csv("/FileStore/tables/crimesDataoftheft", compression="none", header='true')
crimesDataoftheft.show(50, False)

###### Problem statement-10:Get monthly crimes data of NARCOTICS incidents happened in AIRPORTS

In [25]:
# Row predicates: crime type is exactly NARCOTICS and the location text
# contains "AIRPORT" (substring match).
is_narcotics = col('CrimeType') == "NARCOTICS"
at_airport = col('LocationDescription').contains("AIRPORT")

# Monthly narcotics counts per airport location (month = first 7 characters
# of CrimeDate), ordered by month.
crimesDataofnarcotics = (
    crimesdataMap
    .groupBy(substring('CrimeDate', 0, 7).alias("CrimeMonth"), 'CrimeType', 'LocationDescription')
    .agg(count('CrimeType').alias("NumberofCrimes"))
    .filter(is_narcotics & at_airport)
    .sort('CrimeMonth')
)

# Persist as a single uncompressed CSV part file with a header, then preview.
crimesDataofnarcotics.coalesce(1).write.csv("/FileStore/tables/crimesDataofnarcotics", compression="none", header='true')
crimesDataofnarcotics.show(20, truncate=False)