In [36]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.types import (
    StructType, StructField, IntegerType, StringType, BooleanType, DateType,
    DoubleType, TimestampType)
from pyspark.sql.functions import unix_timestamp, year, month, dayofmonth, dayofyear
from pyspark.ml.feature import StringIndexer

# DataFrame preparation

In [24]:
# Locations of the input CSV files, both relative to the notebook's working directory.
DATA_DIR = "./data"
train_path = DATA_DIR + "/train.csv"
test_path = DATA_DIR + "/test.csv"

In [4]:
# Create (or reuse) the notebook's SparkSession on a local master.
# NOTE(review): "spark.some.config.option" looks like a placeholder setting
# rather than a real Spark option — confirm whether it is needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("chicago")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [41]:
# Explicit schema for train.csv: "Dates" is read as a date (the time of day is
# not kept, as the Row shown below confirms) and every other column, including
# the X/Y coordinates, is read as a nullable string.
train_string_columns = [
    "Category",
    "Descript",
    "DayOfWeek",
    "PdDistrict",
    "Resolution",
    "Address",
    "X",
    "Y",
]
train_schema = StructType(
    [StructField("Dates", DateType(), True)]
    + [StructField(column_name, StringType(), True) for column_name in train_string_columns]
)

# The CSV's first line is a header row; the schema above is applied as-is.
train = spark.read.csv(train_path, schema=train_schema, header=True)
train.first()

Row(Dates=datetime.date(2015, 5, 13), Category=u'WARRANTS', Descript=u'WARRANT ARREST', DayOfWeek=u'Wednesday', PdDistrict=u'NORTHERN', Resolution=u'ARREST, BOOKED', Address=u'OAK ST / LAGUNA ST', X=u'-122.425891675136', Y=u'37.7745985956747')

In [42]:
# One StringIndexer per categorical column; each one appends a numeric
# "<column>Index" column next to the original string column.
category_indexer = StringIndexer(inputCol="Category", outputCol="CategoryIndex")
Descript_indexer = StringIndexer(inputCol="Descript", outputCol="DescriptIndex")
PdDistrict_indexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrictIndex")
Resolution_indexer = StringIndexer(inputCol="Resolution", outputCol="ResolutionIndex")
Address_indexer = StringIndexer(inputCol="Address", outputCol="AddressIndex")
DayOfWeek_indexer = StringIndexer(inputCol="DayOfWeek", outputCol="DayOfWeekIndex")

# Fit each indexer on the current frame and apply it immediately, in the order
# listed, so `train` accumulates one new index column per pass.
# NOTE(review): `train` is reassigned in place, so re-running this cell on the
# already-indexed frame will presumably fail because the output columns exist.
indexers_in_order = (
    category_indexer,
    Descript_indexer,
    PdDistrict_indexer,
    Resolution_indexer,
    Address_indexer,
    DayOfWeek_indexer,
)
for indexer in indexers_in_order:
    train = indexer.fit(train).transform(train)

train.first()

Row(Dates=datetime.date(2015, 5, 13), Category=u'WARRANTS', Descript=u'WARRANT ARREST', DayOfWeek=u'Wednesday', PdDistrict=u'NORTHERN', Resolution=u'ARREST, BOOKED', Address=u'OAK ST / LAGUNA ST', X=u'-122.425891675136', Y=u'37.7745985956747', CategoryIndex=7.0, DescriptIndex=5.0, PdDistrictIndex=2.0, ResolutionIndex=1.0, AddressIndex=3991.0, DayOfWeekIndex=1.0)

In [25]:
# Explicit schema for test.csv.
# The test file has an "Id" column and no Category/Descript columns (see the
# Row displayed below), so it needs its own schema rather than a copy of the
# train one. The column types mirror what schema inference produced for this
# file: integer Id, timestamp Dates, string categoricals, double X/Y.
test_schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("Dates", TimestampType(), True),
    StructField("DayOfWeek", StringType(), True),
    StructField("PdDistrict", StringType(), True),
    StructField("Address", StringType(), True),
    StructField("X", DoubleType(), True),
    StructField("Y", DoubleType(), True)])

# Pass the schema explicitly (as done for train) instead of inferSchema=True:
# the declared schema is actually used, the column types are pinned, and Spark
# does not need an extra pass over the file to infer them.
test = spark.read.csv(test_path, header=True, schema=test_schema)
test.first()

Row(Id=0, Dates=datetime.datetime(2015, 5, 10, 23, 59), DayOfWeek=u'Sunday', PdDistrict=u'BAYVIEW', Address=u'2000 Block of THOMAS AV', X=-122.39958770418998, Y=37.7350510103906)

# EDA

## Number of crimes per year

In [28]:
# Count incidents per calendar year of "Dates" and list the years in ascending order.
yearly_counts = train.groupBy(year("Dates")).count()
yearly_counts.sort("year(Dates)").show()

+-----------+-----+
|year(Dates)|count|
+-----------+-----+
|       2003|73902|
|       2004|73422|
|       2005|70779|
|       2006|69909|
|       2007|68015|
|       2008|70174|
|       2009|69000|
|       2010|66542|
|       2011|66619|
|       2012|71731|
|       2013|75606|
|       2014|74766|
|       2015|27584|
+-----------+-----+



## Crime distribution by address

In [33]:
# Incident count per address, most frequent first.
# Only the top rows are displayed: there are thousands of distinct addresses
# (the address index computed earlier already reaches 3991), so printing up to
# 100000 rows floods the notebook output without adding insight.
TOP_N_ADDRESSES = 20
(train.groupBy("Address")
      .count()
      .orderBy("count", ascending=False)
      .show(TOP_N_ADDRESSES, truncate=False))

+--------------------------------------------+-----+
|Address                                     |count|
+--------------------------------------------+-----+
|800 Block of BRYANT ST                      |26533|
|800 Block of MARKET ST                      |6581 |
|2000 Block of MISSION ST                    |5097 |
|1000 Block of POTRERO AV                    |4063 |
|900 Block of MARKET ST                      |3251 |
|0 Block of TURK ST                          |3228 |
|0 Block of 6TH ST                           |2884 |
|300 Block of ELLIS ST                       |2703 |
|400 Block of ELLIS ST                       |2590 |
|16TH ST / MISSION ST                        |2504 |
|1000 Block of MARKET ST                     |2489 |
|1100 Block of MARKET ST                     |2319 |
|2000 Block of MARKET ST                     |2168 |
|100 Block of OFARRELL ST                    |2140 |
|700 Block of MARKET ST                      |2081 |
|3200 Block of 20TH AV                       |

## number of crimes for each category

In [34]:
# Incident count per crime category, most frequent first, with untruncated cell values.
category_counts = train.groupBy("Category").count()
category_counts.sort("count", ascending=False).show(100000, truncate=False)

+---------------------------+------+
|Category                   |count |
+---------------------------+------+
|LARCENY/THEFT              |174900|
|OTHER OFFENSES             |126182|
|NON-CRIMINAL               |92304 |
|ASSAULT                    |76876 |
|DRUG/NARCOTIC              |53971 |
|VEHICLE THEFT              |53781 |
|VANDALISM                  |44725 |
|WARRANTS                   |42214 |
|BURGLARY                   |36755 |
|SUSPICIOUS OCC             |31414 |
|MISSING PERSON             |25989 |
|ROBBERY                    |23000 |
|FRAUD                      |16679 |
|FORGERY/COUNTERFEITING     |10609 |
|SECONDARY CODES            |9985  |
|WEAPON LAWS                |8555  |
|PROSTITUTION               |7484  |
|TRESPASS                   |7326  |
|STOLEN PROPERTY            |4540  |
|SEX OFFENSES FORCIBLE      |4388  |
|DISORDERLY CONDUCT         |4320  |
|DRUNKENNESS                |4280  |
|RECOVERED VEHICLE          |3138  |
|KIDNAPPING                 |2341  |
|

## Crime distribution by police district

In [35]:
# Incident count per police district, most frequent first, with untruncated cell values.
district_counts = train.groupBy("PdDistrict").count()
district_counts.sort("count", ascending=False).show(100000, truncate=False)

+----------+------+
|PdDistrict|count |
+----------+------+
|SOUTHERN  |157182|
|MISSION   |119908|
|NORTHERN  |105296|
|BAYVIEW   |89431 |
|CENTRAL   |85460 |
|TENDERLOIN|81809 |
|INGLESIDE |78845 |
|TARAVAL   |65596 |
|PARK      |49313 |
|RICHMOND  |45209 |
+----------+------+



## Crime distribution by day of week

In [43]:
# Incident count per day of the week, most frequent first, with untruncated cell values.
weekday_counts = train.groupBy("DayOfWeek").count()
weekday_counts.sort("count", ascending=False).show(100000, truncate=False)

+---------+------+
|DayOfWeek|count |
+---------+------+
|Friday   |133734|
|Wednesday|129211|
|Saturday |126810|
|Thursday |125038|
|Tuesday  |124965|
|Monday   |121584|
|Sunday   |116707|
+---------+------+

