##### Usecase: Get crimes data by type
###### Solutions are given for following problem statements
###### Problem statement-1:Get crimes data per day
###### Problem statement-2:Get crimes data per month
###### Problem statement-3:Get crimes data per year
###### Problem statement-4:Get TOP3 crime types based on number of incidents in RESIDENCE
###### Problem statement-5:Get yearly crimes data from 2001 to 2020 with count
###### Problem statement-6:Get yearly crime data of location BANK
###### Problem statement-7:Get yearly crimes data  of DOMESTIC incidents for all years with location 
###### Problem statement-8:Get yearly crimes data of crimetype BURGLARY happened in residence, park and apartments
###### Problem statement-9:Get yearly crimes data of THEFT incidents between 2015 and 2017
###### Problem statement-10:Get monthly crimes data of NARCOTICS incidents happened in AIRPORTS

In [2]:
from pyspark.sql import SQLContext, Row
from pyspark.sql import HiveContext
from pyspark.sql.functions import round,sum,avg,count,substring
# Build the Hive-enabled SQL entry point that every query in this notebook uses.
hiveContext = HiveContext(sc)
# Keep the shuffle fan-out small for this dataset. The setting is issued through
# hiveContext (not the ambient sqlContext) so it applies to the same context
# that runs the queries below.
hiveContext.sql("set spark.sql.shuffle.partitions=10")

In [3]:
# Create the working database only when it is missing so the notebook survives
# "Restart & Run All" without failing on an already-existing database.
hiveContext.sql("create database if not exists crime_db")
# Make crime_db the current database for all subsequent unqualified table names.
hiveContext.sql("use crime_db")

In [4]:
# Column names of the crime table, in the order they are declared in the DDL.
crime_columns = [
    "ID", "CaseNumber", "Date", "Block", "IUCR", "PrimaryType", "Description",
    "LocationDescription", "Arrest", "Domestic", "Beat", "District", "Ward",
    "CommunityArea", "FBICode", "XCoordinate", "YCoordinate", "Year",
    "UpdatedOn", "Latitude", "Longitude", "Location",
]

# Identifiers are back-quoted (not single-quoted, which would make them string
# literals) and each column gets an explicit type, as the DDL grammar requires.
# NOTE(review): every column is typed as string because the original statement
# declared no types; tighten individual columns once the data is confirmed.
crime_column_ddl = ", ".join("`{}` string".format(name) for name in crime_columns)

# "if not exists" keeps the cell re-runnable on a kernel restart.
hiveContext.sql(
    "create table if not exists crime ({}) "
    "row format delimited fields terminated by ','".format(crime_column_ddl)
)

In [5]:
# Load the CSV export into the crime table.
load_crime_sql = "load data local inpath 'dbfs:/FileStore/tables/crimes_data.csv' into table crime"
hiveContext.sql(load_crime_sql)

In [6]:
# List the tables visible in the current database.
tables_listing = hiveContext.sql("show tables")
tables_listing.show()

In [7]:
# Preview the first five rows of the crime table.
crime_rows = hiveContext.sql("select * from crime")
crime_rows.show(5)

In [8]:
# Print the column names and types of the crime table.
crime_frame = hiveContext.sql("select * from crime")
crime_frame.printSchema()

###### Problem statement-1:Get crimes data per day

In [10]:
# Count incidents per calendar day and crime type, ordered by day then type.
# The Date text is parsed with the 'MM/dd/yyyy' pattern and reduced to a date.
CrimesDataPerday = hiveContext.sql('''
    select to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))) as crimedate,
           crmtbl.PrimaryType as crime_Type,
           count(crmtbl.PrimaryType) as NumberofCrimePerTypes
    from crime crmtbl
    group by to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))), crmtbl.PrimaryType
    order by to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))), crmtbl.PrimaryType
''')

# Write a single uncompressed CSV file with a header row. The frame is referenced
# by the exact name it was assigned to (the original used a differently-cased
# name here, which raised NameError).
CrimesDataPerday.coalesce(1).write.csv(
    "/FileStore/tables/CrimesDataPerDay", compression="none", header='true'
)
CrimesDataPerday.show(10, False)

###### Problem statement-2:Get crimes data per month

In [12]:
# Count incidents per month (first 7 characters of the parsed date, i.e. yyyy-MM)
# and crime type, ordered by month then type.
crimesDataPerMonth = hiveContext.sql('''
    select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,7) as crimemonth,
           crmtbl.PrimaryType as crime_Type,
           count(crmtbl.PrimaryType) as NumberofCrimePerTypes
    from crime crmtbl
    group by crimemonth, crmtbl.PrimaryType
    order by crimemonth, crmtbl.PrimaryType
''')

# Write a single uncompressed CSV file with a header row. The target directory is
# named after the result frame, matching the other cells of this notebook
# (the previous "departmentsSalesDataPerMonth" name did not describe crime data).
crimesDataPerMonth.coalesce(1).write.csv(
    "/FileStore/tables/crimesDataPerMonth", compression="none", header='true'
)
crimesDataPerMonth.show(20, False)

###### Problem statement-3:Get crimes data per year

In [14]:
# Yearly incident counts per crime type; the year is the first four characters
# of the parsed date.
crimes_per_year_sql = '''select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,           
                                                  crmtbl.PrimaryType as crime_Type, 
                                                  count(crmtbl.PrimaryType) as NumberofCrimePerTypes
                                                  from crime crmtbl 
                                                  group by crimeyear,crmtbl.PrimaryType 
                                                  order by crimeyear,crmtbl.PrimaryType'''
crimesDataPerYear = hiveContext.sql(crimes_per_year_sql)

# Persist as one uncompressed CSV part file with a header, then preview 20 rows
# without truncating cell values.
(
    crimesDataPerYear
    .coalesce(1)
    .write
    .csv("/FileStore/tables/crimesDataPerYear", compression="none", header='true')
)
crimesDataPerYear.show(20, truncate=False)

###### Problem statement-4:Get TOP3 crime types based on number of incidents in RESIDENCE

In [16]:
# Top three crime types by incident count where the location is exactly RESIDENCE.
# The query runs on hiveContext, the same context on which "use crime_db" was
# issued, so the unqualified table name resolves against crime_db.
top3crimetypes = hiveContext.sql('''
    select cmtl.PrimaryType as CrimeType, count(*) as NumberofIncidents
    from crime cmtl
    where cmtl.LocationDescription="RESIDENCE"
    group by CrimeType
    order by NumberofIncidents desc limit 3
''')

# Write a single uncompressed CSV file with a header row, then display the result.
top3crimetypes.coalesce(1).write.csv(
    "/FileStore/tables/top3crimetypesr", compression="none", header='true'
)
top3crimetypes.show(5, False)

###### Problem statement-5:Get yearly crimes data from 2001 to 2020 with count

In [18]:
# Total number of incidents per year across all crime types, ordered by year.
crimes_count_per_year_sql = '''select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,           
                                                  count(crmtbl.PrimaryType) as NumberofCrimesPeryear
                                                  from crime crmtbl 
                                                  group by crimeyear 
                                                  order by crimeyear'''
NoofcrimesPerYear = hiveContext.sql(crimes_count_per_year_sql)

# Persist as one uncompressed CSV part file with a header, then preview up to
# 25 rows without truncating cell values.
(
    NoofcrimesPerYear
    .coalesce(1)
    .write
    .csv("/FileStore/tables/NoofcrimesPerYear", compression="none", header='true')
)
NoofcrimesPerYear.show(25, truncate=False)

###### Problem statement-6:Get yearly crime data of location BANK

In [20]:
# Yearly incident counts per crime type for rows whose location is exactly BANK.
bank_crimes_sql = '''select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,           
                                                  crmtbl.LocationDescription,crmtbl.PrimaryType as crimetype,count(crmtbl.PrimaryType) as NumberofCrimesPeryear
                                                  from crime crmtbl 
                                                  where crmtbl.LocationDescription = 'BANK'
                                                  group by crimeyear,crmtbl.LocationDescription,crmtbl.PrimaryType
                                                  order by crimeyear,crmtbl.LocationDescription,crmtbl.PrimaryType'''
crimesDataofbanks = hiveContext.sql(bank_crimes_sql)

# Persist as one uncompressed CSV part file with a header, then preview 30 rows
# without truncating cell values.
(
    crimesDataofbanks
    .coalesce(1)
    .write
    .csv("/FileStore/tables/crimesDataofbanks", compression="none", header='true')
)
crimesDataofbanks.show(30, truncate=False)

###### Problem statement-7:Get yearly crimes data  of DOMESTIC incidents for all years with location

In [22]:
# Yearly counts of incidents flagged Domestic = 'TRUE', broken down by location.
crimesDataofdomestic = hiveContext.sql('''
    select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,
           crmtbl.LocationDescription,
           count(crmtbl.PrimaryType) as NumberofCrimes
    from crime crmtbl
    where Domestic = 'TRUE'
    group by crimeyear, crmtbl.LocationDescription
    order by crimeyear, crmtbl.LocationDescription
''')

# Write a single uncompressed CSV file with a header row. A stray trailing
# identifier after this call previously made the whole cell a SyntaxError.
crimesDataofdomestic.coalesce(1).write.csv(
    "/FileStore/tables/crimesDataofdomestic", compression="none", header='true'
)
crimesDataofdomestic.show(20, False)

###### Problem statement-8:Get yearly crimes data of crimetype BURGLARY happened in residence, park and apartments

In [24]:
# Yearly counts of BURGLARY-type incidents by location, restricted to residence,
# park and apartment locations.
# NOTE(review): RESIDENCE is matched as a substring, while PARK and APARTMENT
# have no wildcards and therefore match only those exact values. This is kept
# as written — confirm whether substring matching was intended.
crimesDataofburglary = hiveContext.sql('''
    select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,
           crmtbl.PrimaryType as crimetype,
           crmtbl.LocationDescription,
           count(crmtbl.PrimaryType) as NumberofCrimes
    from crime crmtbl
    where crmtbl.PrimaryType LIKE "%BURGLARY%" and
          (crmtbl.LocationDescription LIKE "%RESIDENCE%" or
           crmtbl.LocationDescription LIKE "PARK" or
           crmtbl.LocationDescription LIKE "APARTMENT")
    group by crimeyear, crimetype, crmtbl.LocationDescription
    order by crimeyear, crimetype, crmtbl.LocationDescription
''')

# Write a single uncompressed CSV file with a header row. A stray trailing
# identifier after this call previously made the whole cell a SyntaxError.
crimesDataofburglary.coalesce(1).write.csv(
    "/FileStore/tables/crimesDataofburglary", compression="none", header='true'
)
crimesDataofburglary.show(30, False)

###### Problem statement-9:Get yearly crimes data of THEFT incidents between 2015 and 2017

In [26]:
# Yearly THEFT-type incident counts by location for Year values 2015 through 2017,
# keeping only groups with at least four incidents.
theft_crimes_sql = '''select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,4) as crimeyear,           
                                                  crmtbl.PrimaryType as crimetype,crmtbl.LocationDescription,count(crmtbl.PrimaryType) as NumberofCrimes
                                                  from crime crmtbl 
                                                  where crmtbl.PrimaryType LIKE "%THEFT%" and (crmtbl.Year>=2015 AND crmtbl.Year<=2017) 
                                                  group by crimeyear,crmtbl.PrimaryType,crmtbl.LocationDescription 
                                                  having count(crmtbl.PrimaryType)>=4
                                                  order by crimeyear'''
crimesDataoftheft = hiveContext.sql(theft_crimes_sql)

# Persist as one uncompressed CSV part file with a header, then preview 50 rows
# without truncating cell values.
(
    crimesDataoftheft
    .coalesce(1)
    .write
    .csv("/FileStore/tables/crimesDataoftheft", compression="none", header='true')
)
crimesDataoftheft.show(50, truncate=False)

###### Problem statement-10:Get monthly crimes data of NARCOTICS incidents happened in AIRPORTS

In [28]:
# Monthly NARCOTICS incident counts for locations whose description contains
# AIRPORT, ordered by month.
narcotics_airport_sql = '''select substring(to_date(from_unixtime(UNIX_TIMESTAMP(crmtbl.Date,'MM/dd/yyyy'))),0,7) as crimemonth,           
                                                  crmtbl.PrimaryType as crimetype,crmtbl.LocationDescription,count(crmtbl.PrimaryType) as NumberofCrimes
                                                  from crime crmtbl 
                                                  where crmtbl.PrimaryType = "NARCOTICS" and crmtbl.LocationDescription LIKE "%AIRPORT%"
                                                  group by crimemonth,crmtbl.PrimaryType,crmtbl.LocationDescription
                                                  order by crimemonth'''
crimesDataofnarcotics = hiveContext.sql(narcotics_airport_sql)

# Persist as one uncompressed CSV part file with a header, then preview 20 rows
# without truncating cell values.
(
    crimesDataofnarcotics
    .coalesce(1)
    .write
    .csv("/FileStore/tables/crimesDataofnarcotics", compression="none", header='true')
)
crimesDataofnarcotics.show(20, truncate=False)