# Crime Analysis

#### Creating a SparkSession

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Obtain the SparkSession for this notebook, named 'Crime Analysis'.
spark = (
    SparkSession.builder
    .appName('Crime Analysis')
    .getOrCreate()
)

#### Creating an RDD from the textfile

In [2]:
# Load the raw CSV as an RDD of text lines (one element per line).
dataFile = spark.sparkContext.textFile(
    "./data/Crimes_2001_to_Present.csv"
)

#### Extracting header

In [4]:
# The first line of the file is treated as the CSV header row; it is kept
# so that it can be filtered out of the data in the next step.
header = dataFile.first()

#### Excluding the header from the RDD

In [6]:
# Keep every line that is not identical to the header line.
dataWOHeader = dataFile.filter(lambda line: line != header)

#### Extracting the first 50K records into a new RDD for analysis

In [7]:
# take() collects the first 50,000 lines on the driver; parallelize()
# then turns that local list back into an RDD.
dataWOHeaderNew = spark.sparkContext.parallelize(
    dataWOHeader.take(50000)
)

#### Python function to extract only the required columns crimeDate(YYYYMM) & crimeType

In [8]:
def fetchDateAndPrimaryType(rec):
    """Extract the crime month and crime type from one CSV record.

    The record is parsed with the standard ``csv`` reader rather than a
    plain ``split(",")`` so that quoted fields containing commas do not
    shift the column positions, and surrounding quotes are removed from
    the extracted values.

    Args:
        rec: A single CSV line. Column 2 must hold a date whose leading
            part is ``MM/DD/YYYY`` (anything after the first space is
            ignored) and column 5 must hold the crime type.

    Returns:
        A ``(crimeDate, crimeType)`` tuple where ``crimeDate`` is the
        integer ``YYYYMM`` and ``crimeType`` is a string.

    Raises:
        IndexError: If the record has fewer than six columns or the date
            does not have three ``/``-separated parts.
        ValueError: If the year/month parts are not numeric.
    """
    # Imported locally so this block does not rely on the file's import cell.
    import csv

    recItems = next(csv.reader([rec]), [])

    # Drop the time-of-day portion, then split MM/DD/YYYY into its parts.
    dateParts = recItems[2].split(" ")[0].split("/")

    # Year followed by month gives a sortable YYYYMM integer.
    crimeDate = int(dateParts[2] + dateParts[0])
    crimeType = str(recItems[5])
    return (crimeDate, crimeType)

#### Creating a DataFrame using the above function

In [9]:
# Turn each raw line into a (crimeDate, crimeType) pair and name the
# two resulting DataFrame columns accordingly.
crimeDF = (
    dataWOHeaderNew
    .map(fetchDateAndPrimaryType)
    .toDF(["crimeDate", "crimeType"])
)

#### Creating a Temporary table for SQL analysis

In [10]:
# Expose crimeDF to Spark SQL under the name "crime_data".
# createOrReplaceTempView is the replacement for registerTempTable,
# which has been deprecated since Spark 2.0.
crimeDF.createOrReplaceTempView("crime_data")

#### SQL query for finding the number of crimes per month (YYYYMM) for each crime type

In [11]:
# Count crimes per (crimeDate, crimeType) pair, ordered by crimeDate and
# then by descending count. The adjacent literals below concatenate into
# a single SQL string.
crimeResults = spark.sql(
    "SELECT crimeDate,count(1) as Count, crimeType "
    "from crime_data "
    "group by crimeDate,crimeType "
    "order by crimeDate,Count desc "
)

#### Saving the result DataFrame into a textfile with Gzip compression

In [13]:
# Render each result row as a tab-separated line, collapse the output to
# a single partition, and write it with the Hadoop Gzip codec.
(
    crimeResults.rdd
    .map(lambda row: "\t".join(map(str, row)))
    .coalesce(1)
    .saveAsTextFile(
        "./data/crimeAnalysis.txt",
        compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec",
    )
)

#### Checking the saved textfile 

In [14]:
# Read the saved output back and show its first line as a sanity check.
(
    spark.sparkContext
    .textFile("./data/crimeAnalysis.txt")
    .first()
)

'200101\t1\tOFFENSE INVOLVING CHILDREN'