# San Francisco Crime

## Leo CSV como RDD

In [1]:
# Load the training CSV into a Spark DataFrame using the pyspark_csv helper.
# NOTE(review): `sc` and `sqlCtx` are never defined in this notebook; they are
# presumably supplied by the PySpark kernel/shell -- confirm before running.
import pyspark_csv as pycsv
# Register the helper file with the SparkContext so Spark tasks can use it too.
sc.addPyFile('pyspark_csv.py')
# Read the file as an RDD of raw text lines.
plaintext_rdd = sc.textFile('sf/train.csv')
# Parse the lines into a DataFrame. With parseDate=False the `Dates` column
# stays a plain string (see the first record shown below).
dataframe = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd, parseDate=False)

In [2]:
# Work on the DataFrame's underlying RDD of Row objects for the RDD-based
# queries that follow; the bare name displays the RDD handle.
data = dataframe.rdd
data

PythonRDD[14] at RDD at PythonRDD.scala:43

## Primer registro de los datos

In [3]:
# Peek at a single record to see which fields each Row carries.
data.first()

Row(Dates=u'2015-05-13 23:53:00', Category=u'WARRANTS', Descript=u'WARRANT ARREST', DayOfWeek=u'Wednesday', PdDistrict=u'NORTHERN', Resolution=u'ARREST, BOOKED', Address=u'OAK ST / LAGUNA ST', X=-122.425891675136, Y=37.7745985956747)

## Categorías de crímenes

In [4]:
# Distinct crime category names (lazy: nothing is computed until an action runs).
categories = (
    data
    .map(lambda record: record.Category)
    .distinct()
)

In [5]:
# Materialize the distinct category names as a Python list (unordered).
categories.collect()

[u'WEAPON LAWS',
 u'TREA',
 u'EMBEZZLEMENT',
 u'DRIVING UNDER THE INFLUENCE',
 u'BURGLARY',
 u'LIQUOR LAWS',
 u'MISSING PERSON',
 u'SEX OFFENSES NON FORCIBLE',
 u'EXTORTION',
 u'TRESPASS',
 u'LARCENY/THEFT',
 u'NON-CRIMINAL',
 u'GAMBLING',
 u'OTHER OFFENSES',
 u'DRUNKENNESS',
 u'VEHICLE THEFT',
 u'RECOVERED VEHICLE',
 u'FAMILY OFFENSES',
 u'WARRANTS',
 u'ASSAULT',
 u'BAD CHECKS',
 u'SEX OFFENSES FORCIBLE',
 u'PORNOGRAPHY/OBSCENE MAT',
 u'PROSTITUTION',
 u'BRIBERY',
 u'LOITERING',
 u'ROBBERY',
 u'ARSON',
 u'RUNAWAY',
 u'SUSPICIOUS OCC',
 u'VANDALISM',
 u'KIDNAPPING',
 u'DISORDERLY CONDUCT',
 u'SECONDARY CODES',
 u'STOLEN PROPERTY',
 u'FORGERY/COUNTERFEITING',
 u'FRAUD',
 u'DRUG/NARCOTIC',
 u'SUICIDE']

## Cantidad de categorías de crímenes

In [6]:
# Number of distinct crime categories.
categories.count()

39

## Categorías de crímenes ordenadas alfabéticamente

In [7]:
# List every category alphabetically. The distinct set is small, so collect it
# and sort on the driver instead of relying on a hard-coded upper bound
# (takeOrdered(40) would silently drop entries if there were more than 40).
sorted(categories.collect())

[u'ARSON',
 u'ASSAULT',
 u'BAD CHECKS',
 u'BRIBERY',
 u'BURGLARY',
 u'DISORDERLY CONDUCT',
 u'DRIVING UNDER THE INFLUENCE',
 u'DRUG/NARCOTIC',
 u'DRUNKENNESS',
 u'EMBEZZLEMENT',
 u'EXTORTION',
 u'FAMILY OFFENSES',
 u'FORGERY/COUNTERFEITING',
 u'FRAUD',
 u'GAMBLING',
 u'KIDNAPPING',
 u'LARCENY/THEFT',
 u'LIQUOR LAWS',
 u'LOITERING',
 u'MISSING PERSON',
 u'NON-CRIMINAL',
 u'OTHER OFFENSES',
 u'PORNOGRAPHY/OBSCENE MAT',
 u'PROSTITUTION',
 u'RECOVERED VEHICLE',
 u'ROBBERY',
 u'RUNAWAY',
 u'SECONDARY CODES',
 u'SEX OFFENSES FORCIBLE',
 u'SEX OFFENSES NON FORCIBLE',
 u'STOLEN PROPERTY',
 u'SUICIDE',
 u'SUSPICIOUS OCC',
 u'TREA',
 u'TRESPASS',
 u'VANDALISM',
 u'VEHICLE THEFT',
 u'WARRANTS',
 u'WEAPON LAWS']

## 10 delitos más comunes

In [8]:
# Count incidents per category and keep the 10 largest counts
# (negated count as the key turns takeOrdered's ascending order into descending).
(
    data
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(10, key=lambda pair: -pair[1])
)

[(u'LARCENY/THEFT', 174900),
 (u'OTHER OFFENSES', 126182),
 (u'NON-CRIMINAL', 92304),
 (u'ASSAULT', 76876),
 (u'DRUG/NARCOTIC', 53971),
 (u'VEHICLE THEFT', 53781),
 (u'VANDALISM', 44725),
 (u'WARRANTS', 42214),
 (u'BURGLARY', 36755),
 (u'SUSPICIOUS OCC', 31414)]

## Día de la semana con más casos de 'Driving under the influence'

In [9]:
# Keep only 'DRIVING UNDER THE INFLUENCE' incidents, count them per weekday,
# and show the three weekdays with the most cases.
(
    data
    .filter(lambda record: record.Category == 'DRIVING UNDER THE INFLUENCE')
    .map(lambda record: (record.DayOfWeek, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(3, key=lambda pair: -pair[1])
)

[(u'Saturday', 457), (u'Sunday', 442), (u'Friday', 352)]

## Los 3 distritos con mayor cantidad de crímenes

In [10]:
# Count incidents per police district and keep the three busiest districts.
(
    data
    .map(lambda record: (record.PdDistrict, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(3, key=lambda pair: -pair[1])
)

[(u'SOUTHERN', 157182), (u'MISSION', 119908), (u'NORTHERN', 105296)]

## Crímenes que tienen mayor porcentaje de resolución “Not Prosecuted”

In [None]:
# Per-category count of incidents whose resolution is exactly 'NOT PROSECUTED'.
not_prosecuted = (
    data
    .filter(lambda record: record.Resolution == 'NOT PROSECUTED')
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
)
not_prosecuted.collect()

In [None]:
# Per-category count of all incidents, whatever their resolution.
all_resolutions = (
    data
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [None]:
# Join the two per-category counts: each element becomes
# (category, (total_count, not_prosecuted_count)). The join is an inner join,
# so categories without any 'NOT PROSECUTED' incident do not appear.
res = all_resolutions.join(not_prosecuted)
# Rank by the percentage of 'NOT PROSECUTED' incidents, highest first.
# 100.0 forces floating-point division: under Python 2 the integer form
# truncates to whole percents, so close categories tie and rank arbitrarily.
res.takeOrdered(5, key = lambda x: -(x[1][1]*100.0/x[1][0]))

In [11]:
# All in one pass: for every category accumulate the pair
# (incidents resolved as 'NOT PROSECUTED', total incidents), then keep the
# five categories with the highest not-prosecuted percentage.
# 100.0 forces floating-point division: under Python 2 the integer form
# truncates to whole percents, so close categories tie and rank arbitrarily.

data.map(lambda row: (row.Category, (int(row.Resolution == "NOT PROSECUTED"), 1)))\
    .reduceByKey(lambda a,b: (a[0]+b[0],a[1]+b[1]))\
    .takeOrdered(5, key = lambda x: -(x[1][0]*100.0/x[1][1]))

[(u'FORGERY/COUNTERFEITING', (698, 10609)),
 (u'FRAUD', (946, 16679)),
 (u'BAD CHECKS', (19, 406)),
 (u'GAMBLING', (4, 146)),
 (u'EMBEZZLEMENT', (21, 1166))]

## Delitos por día de la semana

In [None]:
# Number of incidents for each day of the week, as an RDD of (day, count) pairs.
day_of_week_crimes_rdd = (
    data
    .map(lambda record: (record.DayOfWeek, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [None]:
# Pull the (day, count) pairs into a local Python list for plotting, and
# display them.
day_of_week_crimes = day_of_week_crimes_rdd.collect()
day_of_week_crimes

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

days = [t[0] for t in day_of_week_crimes]
y_pos = np.arange(len(days))
crimes = [t[1] for t in day_of_week_crimes]

plt.barh(y_pos, crimes)
plt.yticks(y_pos, days)
plt.xlabel('Crimes')
plt.title('Crimenes por dia de la semana')

plt.show()

In [None]:
# Make the DataFrame queryable through SQL under the table name "crimes",
# then list the distinct crime categories with a SQL query.
dataframe.registerTempTable("crimes")
sqlCtx.sql("select distinct Category from crimes").collect()

In [None]:
# Same distinct categories, sorted alphabetically by the SQL engine.
sqlCtx.sql("select distinct Category from crimes order by Category").collect()

## 10 delitos más comunes con SQL

In [None]:
# Ten categories with the most incidents. The query is split across adjacent
# string literals for readability; they concatenate into a single statement.
top_categories_query = (
    "select Category, count(*) as crimes_count "
    "from crimes "
    "group by Category "
    "order by crimes_count desc "
    "limit 10"
)
sqlCtx.sql(top_categories_query).collect()

## Día de la semana con más casos de 'Driving under the influence' con SQL

In [None]:
# Weekday with the most 'DRIVING UNDER THE INFLUENCE' incidents.
# The comparison uses the standard SQL equality operator `=`; `==` is a
# dialect extension that not every Spark SQL parser accepts.
sqlCtx.sql(
    "select DayOfWeek, count(*) as crimes_count "
    "from crimes "
    "where Category = 'DRIVING UNDER THE INFLUENCE' "
    "group by DayOfWeek "
    "order by crimes_count desc "
    "limit 1"
).collect()