# San Francisco Crime

## Leo CSV como RDD

In [None]:
import pyspark_csv as pycsv
# Register the pyspark_csv helper module with the SparkContext
# (presumably so executors can import it too -- see SparkContext.addPyFile).
sc.addPyFile('pyspark_csv.py')
# Read the training file as an RDD of raw text lines.
plaintext_rdd = sc.textFile('sf/train.csv')
# Turn the text lines into a DataFrame; parseDate=False is passed through to
# pyspark_csv (presumably it leaves date columns unparsed -- confirm).
dataframe = pycsv.csvToDataFrame(sqlCtx, plaintext_rdd, parseDate=False)

In [None]:
# Work on the DataFrame's underlying RDD; later cells read its rows by
# attribute (row.Category, row.DayOfWeek, ...).
data = dataframe.rdd
# Bare name as the last expression so the notebook displays the RDD.
data

## Primer registro de los datos

In [None]:
# Display the first record of the RDD to inspect the available fields.
data.first()

## Categorías de crímenes

In [None]:
# RDD holding each crime category exactly once.
categories = (
    data
    .map(lambda record: record.Category)
    .distinct()
)

In [None]:
# Bring the distinct categories back to the driver and display them.
categories.collect()

## Cantidad de categorías de crímenes

In [None]:
# Number of distinct crime categories.
categories.count()

## Categorías de crímenes ordenadas alfabéticamente

In [None]:
# Return up to 40 categories in ascending (alphabetical) order.
# NOTE(review): 40 is hard-coded; presumably chosen to exceed the number of
# distinct categories -- confirm against categories.count().
categories.takeOrdered(40)

## 10 delitos más comunes

In [None]:
# Emit (category, 1) per record, sum the ones per category, and keep the
# ten categories with the largest totals (negated key => descending order).
(
    data
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(10, key=lambda pair: -pair[1])
)

## Día de la semana con más casos de 'Driving under the influence'

In [None]:
# Keep only 'DRIVING UNDER THE INFLUENCE' records, count them per weekday,
# and show the three weekdays with the most cases (largest count first).
(
    data
    .filter(lambda record: record.Category == 'DRIVING UNDER THE INFLUENCE')
    .map(lambda record: (record.DayOfWeek, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(3, key=lambda pair: -pair[1])
)

## Los 3 distritos con mayor cantidad de crímenes

In [None]:
# Count records per police district and keep the three districts with the
# highest counts (negated key => descending order).
(
    data
    .map(lambda record: (record.PdDistrict, 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(3, key=lambda pair: -pair[1])
)

## Crímenes que tienen mayor porcentaje de resolución “Not Prosecuted”

In [None]:
# Per-category count of records whose resolution is 'NOT PROSECUTED'.
not_prosecuted = (
    data
    .filter(lambda record: record.Resolution == 'NOT PROSECUTED')
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
)
# Display the (category, count) pairs.
not_prosecuted.collect()

In [None]:
# Per-category count of all records, regardless of resolution.
all_resolutions = (
    data
    .map(lambda record: (record.Category, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [None]:
# Pair each category's total with its NOT PROSECUTED count; joined elements
# have the shape (category, (total, not_prosecuted)).
res = all_resolutions.join(not_prosecuted)
# Rank by percentage of NOT PROSECUTED cases, highest first. The 100.0 forces
# float division: with integer operands Python 2 truncates the percentage,
# which collapses close values into ties and makes the ranking arbitrary.
res.takeOrdered(5, key=lambda x: -(x[1][1] * 100.0 / x[1][0]))

In [None]:
# All in one: a single pass that accumulates, per category, the pair
# (not_prosecuted_count, total_count) and then ranks by percentage.

# int(...) turns the resolution test into 0/1 so it can be summed.
# The 100.0 forces float division: with integer operands Python 2 truncates
# the percentage, which produces ties and an arbitrary ranking.
(
    data
    .map(lambda row: (row.Category, (int(row.Resolution == "NOT PROSECUTED"), 1)))
    .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))
    .takeOrdered(5, key=lambda x: -(x[1][0] * 100.0 / x[1][1]))
)

## Delitos por día de la semana

In [None]:
# RDD of (weekday, number of records) pairs.
day_of_week_crimes_rdd = (
    data
    .map(lambda record: (record.DayOfWeek, 1))
    .reduceByKey(lambda left, right: left + right)
)

In [None]:
# Materialize the per-weekday counts on the driver as a list of
# (weekday, count) tuples; the plotting cell reads this list.
day_of_week_crimes = day_of_week_crimes_rdd.collect()
# Bare name as the last expression so the notebook displays the list.
day_of_week_crimes

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np

# Split the collected (weekday, count) pairs into bar labels and bar lengths.
days = [day for day, total in day_of_week_crimes]
crimes = [total for day, total in day_of_week_crimes]
# One horizontal bar position per weekday.
y_pos = np.arange(len(days))

# Horizontal bar chart: one bar per weekday, labelled on the y axis.
plt.barh(y_pos, crimes)
plt.yticks(y_pos, days)
plt.title('Crimenes por dia de la semana')
plt.xlabel('Crimes')

plt.show()

In [None]:
# Expose the DataFrame to SQL queries under the table name "crimes".
dataframe.registerTempTable("crimes")
# Distinct crime categories, this time expressed as a SQL query.
sqlCtx.sql(
    "select distinct Category "
    "from crimes"
).collect()

In [None]:
# Distinct crime categories sorted by name, via SQL.
sqlCtx.sql(
    "select distinct Category "
    "from crimes "
    "order by Category"
).collect()

## 10 delitos más comunes con SQL

In [None]:
# Ten categories with the most records, via SQL. The query is split across
# adjacent string literals, one clause per line.
sqlCtx.sql(
    "select Category, count(*) as crimes_count "
    "from crimes "
    "group by Category "
    "order by crimes_count desc "
    "limit 10"
).collect()

## Día de la semana con más casos de 'Driving under the influence' con SQL

In [None]:
# Weekday with the most 'DRIVING UNDER THE INFLUENCE' records, via SQL.
# The query is split across adjacent string literals, one clause per line.
sqlCtx.sql(
    "select DayOfWeek, count(*) as crimes_count "
    "from crimes "
    "where Category == 'DRIVING UNDER THE INFLUENCE' "
    "group by DayOfWeek "
    "order by crimes_count desc "
    "limit 1"
).collect()