### Preparing the environment / Preparando o ambiente

In [45]:
# pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org google-cloud-bigquery

In [85]:
# Importing Libraries / Importanto bibliotecas
import pandas as pd
from google.cloud import bigquery
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import geopandas as gp
import geojson
import plotly.offline as pyo 
pyo.init_notebook_mode(connected=True)

In [47]:
# Declare the SQL query that pulls rows from the public Chicago crime table.
# NOTE(review): LIMIT without ORDER BY does not guarantee which rows are returned,
# so this 100,000-row sample may differ between runs and may not be
# representative of the whole table - confirm this is acceptable for the analysis.
query = """
    SELECT * FROM `bigquery-public-data.chicago_crime.crime` LIMIT 100000
"""

In [48]:
# Create the BigQuery client bound to the given GCP project.
# NOTE(review): the project ID is hard-coded; anyone else running this notebook
# presumably has to replace it with a project they can access.
bqclient = bigquery.Client(project="sz-lab-bq-2023-sandbox")


Your application has authenticated using end user credentials from Google Cloud SDK without a quota project. You might receive a "quota exceeded" or "API not enabled" error. See the following page for troubleshooting: https://cloud.google.com/docs/authentication/adc-troubleshooting/user-creds. 



In [49]:
# Submit the query declared earlier; the returned job handle is kept in `query_job`.
query_job = bqclient.query(query)

In [50]:
# Call result() on the job - it is a method, not an attribute - and keep the
# object it returns, which holds the rows of the query response.
resultado = query_job.result() 

In [51]:
# Display the result object to see what type the query returned.
resultado

<google.cloud.bigquery.table.RowIterator at 0x285d761f6a0>

In [52]:
# Convert the query result into a DataFrame used by every cell below.
df = resultado.to_dataframe()

In [53]:
# Preview the first two rows of the raw data.
df.head(2)

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,24130,JB396052,2018-08-16 02:32:00+00:00,002XX W 104TH PL,110,HOMICIDE,FIRST DEGREE MURDER,PORCH,False,False,...,34,49,01A,1176386.0,1835643.0,2018,2022-09-18 04:45:51+00:00,41.704354,-87.629714,"(41.704353647, -87.629713945)"
1,20371,HP624145,2012-04-17 05:17:00+00:00,003XX W 104TH PL,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,False,...,34,49,01A,1176012.0,1835633.0,2012,2022-09-18 04:45:51+00:00,41.704335,-87.631084,"(41.704334583, -87.631083766)"


### Data Cleaning

In [54]:
# Count the missing (null) values in each column.
df.isna().sum()

unique_key                 0
case_number                0
date                       0
block                      0
iucr                       0
primary_type               0
description                0
location_description     108
arrest                     0
domestic                   0
beat                       0
district                   2
ward                    9008
community_area          9014
fbi_code                   0
x_coordinate            1095
y_coordinate            1095
year                       0
updated_on                 0
latitude                1093
longitude               1093
location                1093
dtype: int64

In [55]:
# Drop every row that has a null value in any column.
# NOTE(review): most nulls are in `ward` and `community_area` (roughly 9,000 rows
# each in the count above), and the cells visible in this notebook never use those
# columns - consider dropna(subset=[...]) so those rows are not discarded.
df = df.dropna()

In [56]:
# Re-check the per-column null counts to confirm that none remain.
df.isna().sum()

unique_key              0
case_number             0
date                    0
block                   0
iucr                    0
primary_type            0
description             0
location_description    0
arrest                  0
domestic                0
beat                    0
district                0
ward                    0
community_area          0
fbi_code                0
x_coordinate            0
y_coordinate            0
year                    0
updated_on              0
latitude                0
longitude               0
location                0
dtype: int64

### Data Mining / Mineração de Dados

### 1. What are the 5 most recurring crimes? / Quais são os 5 crimes mais recorrentes?

In [57]:
# Preview the first two rows of the cleaned data before aggregating.
df.head(2)

Unnamed: 0,unique_key,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,x_coordinate,y_coordinate,year,updated_on,latitude,longitude,location
0,24130,JB396052,2018-08-16 02:32:00+00:00,002XX W 104TH PL,110,HOMICIDE,FIRST DEGREE MURDER,PORCH,False,False,...,34,49,01A,1176386.0,1835643.0,2018,2022-09-18 04:45:51+00:00,41.704354,-87.629714,"(41.704353647, -87.629713945)"
1,20371,HP624145,2012-04-17 05:17:00+00:00,003XX W 104TH PL,110,HOMICIDE,FIRST DEGREE MURDER,STREET,True,False,...,34,49,01A,1176012.0,1835633.0,2012,2022-09-18 04:45:51+00:00,41.704335,-87.631084,"(41.704334583, -87.631083766)"


In [58]:
# Build a two-column table (primary_type, Count) holding the five crime
# types with the most records, largest first.
grouped = df.groupby(['primary_type'])
counts = (
    grouped.size()
    .sort_values(ascending=False)
    .iloc[:5]
    .to_frame(name='Count')
    .reset_index()
)
print(counts)

      primary_type  Count
0            THEFT  20959
1          BATTERY  16051
2  CRIMINAL DAMAGE  10348
3        NARCOTICS   7089
4          ASSAULT   5739


In [59]:
# Re-sort the top-5 table by count in ascending order.
# Presumably done so the horizontal bar chart plotted next shows the largest
# bar at the top - verify against the rendered figure.
counts = counts.sort_values(by='Count', ascending=True)

In [79]:
# Plot the five most frequent crime types as a horizontal bar chart.
# A title and readable axis labels are set so the figure can be understood
# on its own instead of showing the raw column names.
fig = px.bar(
    counts,
    x="Count",
    y="primary_type",
    orientation='h',
    title="Top 5 most recurring crimes",
    labels={"Count": "Number of occurrences", "primary_type": "Crime type"},
    width=650,
    height=400,
)
fig.show()

### 2. At what time of day do crimes happen the most?
#### 2. Em que horários os crimes ocorrem mais?

In [61]:
# Count the records per hour of day. `date` is a timezone-aware UTC timestamp,
# so `dt.hour` yields the hour on a 24-hour clock (0-23) in UTC.
crimes_per_hour = (
    df.groupby(df['date'].dt.hour)['unique_key']
    .count()
    .reset_index()
)

# After reset_index the hour is in the column named `date` and the number of
# records in `unique_key`; relabel both so the axes say what they actually show.
fig = px.line(
    crimes_per_hour,
    x="date",
    y="unique_key",
    title='Crime occurrence times',
    labels={"date": "Hour of day (UTC, 0-23)", "unique_key": "Number of crimes"},
    width=800,
    height=400,
)
fig.show()

### 3. Is the incidence of crime increasing or decreasing over the years?
#### 3. A incidência de crimes está crescendo ou diminuindo ao longo dos anos?

In [62]:
# Count the records per calendar year, taken from the `date` timestamp.
crimes_per_year = (
    df.groupby(df['date'].dt.year)['unique_key']
    .count()
    .reset_index()
)

# After reset_index the year is in the column named `date` and the number of
# records in `unique_key`; relabel both so the axes say what they actually show.
fig = px.line(
    crimes_per_year,
    x="date",
    y="unique_key",
    title='Crime occurrence per year',
    labels={"date": "Year", "unique_key": "Number of crimes"},
    width=1000,
    height=500,
)
fig.show()

### 4. In proportion to their occurrence, which crimes had the highest arrest rate?
#### 4. Proporcionalmente às suas ocorrências, quais crimes tiveram maior efetividade de prisão?

In [63]:
# Number of records for every (crime type, arrest flag) combination,
# returned as a flat table with one row per combination.
arrest_groups = df.groupby(['primary_type', 'arrest'])
df_arrest = arrest_groups['unique_key'].count().reset_index()

In [64]:
# Preview the first rows of the per-type arrest counts.
df_arrest.head()

Unnamed: 0,primary_type,arrest,unique_key
0,ARSON,False,131
1,ARSON,True,28
2,ASSAULT,False,4461
3,ASSAULT,True,1278
4,BATTERY,False,12205


In [65]:
# Percentage of arrest / no-arrest outcomes within each crime type.
# value_counts(normalize=True) returns the within-group proportion directly, so
# the grouped counts are computed once instead of twice and no division across
# differently-shaped indexes is needed.
relative_perc = (
    df.groupby('primary_type')['arrest']
    .value_counts(normalize=True)
    .mul(100)
    .rename('%_rel')
    .reset_index()
)


# Show the first rows of the result
relative_perc.head()

Unnamed: 0,primary_type,arrest,%_rel
0,ARSON,False,82.389937
1,ARSON,True,17.610063
2,ASSAULT,False,77.731312
3,ASSAULT,True,22.268688
4,BATTERY,False,76.038876


In [66]:
# Keep only the rows where an arrest occurred and order them from the
# highest to the lowest arrest percentage.
arrested_mask = relative_perc['arrest'].eq(True)
relative_perc_true = relative_perc.loc[arrested_mask].sort_values(by='%_rel', ascending=False)

In [90]:
# Bar chart of the arrest percentage per crime type, using the table that is
# already sorted from highest to lowest.
chart_title = "Arrest relative %"
fig = px.bar(
    relative_perc_true,
    x="primary_type",
    y='%_rel',
    title=chart_title,
    width=980,
    height=700,
)
fig.update_layout(title=chart_title, xaxis_title="Crime", yaxis_title="arrest %")
fig.show()

##### 5. What are the 5 types of location where crimes occur the most, and what is their share of the total number of occurrences?
###### 5. Quais os 5 tipos de locais onde há maior ocorrência de crimes e qual a porcentagem relativa comparando com o total de ocorrências?

In [68]:
# Count the records per location class and display the result.
df_tl = df['location_description'].value_counts()
df_tl

STREET                                 22476
RESIDENCE                              15149
APARTMENT                               8396
SIDEWALK                                7943
OTHER                                   3393
                                       ...  
LIQUOR STORE                               1
BASEMENT                                   1
VEHICLE - OTHER RIDE SERVICE               1
AIRPORT TRANSPORTATION SYSTEM (ATS)        1
TAVERN                                     1
Name: location_description, Length: 151, dtype: int64

In [69]:
# Same per-location record count, this time obtained through groupby.
df.groupby('location_description')['unique_key'].count()

location_description
ABANDONED BUILDING                                 122
AIRCRAFT                                            11
AIRPORT BUILDING NON-TERMINAL - NON-SECURE AREA     20
AIRPORT BUILDING NON-TERMINAL - SECURE AREA         12
AIRPORT EXTERIOR - NON-SECURE AREA                  19
                                                  ... 
VEHICLE-COMMERCIAL                                  62
VEHICLE-COMMERCIAL - ENTERTAINMENT/PARTY BUS         1
WAREHOUSE                                           93
WOODED AREA                                          1
YARD                                                 2
Name: unique_key, Length: 151, dtype: int64

In [70]:
# Build the table with each location type's share (in %) of all records,
# ordered from the largest share to the smallest.
# value_counts(normalize=True) gives that share directly; there is no need to
# count every individual unique_key per location and then sum the pieces again.
df_top_locals = (
    df['location_description']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('location_description')  # name the index so reset_index yields this column name
    .rename('%_rel')
    .reset_index()
)

In [71]:
# Show the ten locations with the largest share of records.
df_top_locals.head(10)

Unnamed: 0,location_description,%_rel
0,STREET,24.968894
1,RESIDENCE,16.82923
2,APARTMENT,9.327231
3,SIDEWALK,8.823987
4,OTHER,3.76933
5,PARKING LOT/GARAGE(NON.RESID.),2.91837
6,SMALL RETAIL STORE,2.285149
7,RESIDENCE-GARAGE,2.118512
8,RESTAURANT,2.117401
9,ALLEY,1.965206


In [72]:
# Relabel every row from index label 4 onwards as 'OTHERS', so only the rows
# labelled 0-3 keep their own location name.
# NOTE(review): the section heading asks for the top 5, but this keeps 4 named
# locations plus 'OTHERS' - confirm this is intended.
df_top_locals.loc[4:, 'location_description'] = 'OTHERS'

In [73]:
# Display the table after the relabelling to check the 'OTHERS' rows.
df_top_locals

Unnamed: 0,location_description,%_rel
0,STREET,24.968894
1,RESIDENCE,16.829230
2,APARTMENT,9.327231
3,SIDEWALK,8.823987
4,OTHERS,3.769330
...,...,...
146,OTHERS,0.001111
147,OTHERS,0.001111
148,OTHERS,0.001111
149,OTHERS,0.001111


In [81]:
# Pie chart of each location label's share of records, with a fixed colour
# assigned to every label.
slice_colors = {
    'OTHERS': 'RGB(115,111,76)',
    'SIDEWALK': '#511CFB',
    'APARTMENT': 'royalblue',
    'RESIDENCE': '#1616A7',
    'STREET': '#0D2A63',
}
fig = px.pie(
    df_top_locals,
    values='%_rel',
    names='location_description',
    color='location_description',
    color_discrete_map=slice_colors,
    width=650,
    height=400,
)
fig.update_layout(title_text='Places with highest incidence of crime', title_x=0.4)
fig.show()

#### 6. Draw a heatmap according to the incidence of crimes / Monte um mapa de calor de acordo com a incidência de crimes;

In [75]:
# importing the library / Importando a biblioteca
import folium
from folium.plugins import HeatMap

In [76]:
# Count how many records share each (latitude, longitude) pair.
freq = df.groupby(['latitude', 'longitude']).size().reset_index(name='count')

# Create a map centred on the mean of the coordinates.
# NOTE(review): zoom_start=5 is a very wide view for data from a single city -
# confirm the initial zoom is intended.
center = [df['latitude'].mean(), df['longitude'].mean()]
m = folium.Map(location=center, zoom_start=5)

# Build the [latitude, longitude, weight] rows in one vectorised step instead
# of looping over the frame row by row with iterrows().
heat_data = freq[['latitude', 'longitude', 'count']].values.tolist()
HeatMap(heat_data).add_to(m)

# Display the map
m


### Insights

- As we can see, the crime rate grows rapidly after 7:00;
  The hour is taken from a UTC timestamp on a 24-hour clock (`dt.hour`, 0–23), so 7:00 here means 7 A.M. (UTC);
- The most recurrent class of crime was THEFT, and its arrest rate is just 11.34%;
- The main place where crimes happened was the street;
- After 2003, the crime rate decreased quickly.

- Notamos que a taxa de criminalidade cresce rapidamente após as 7:00;
  A hora é extraída de um timestamp em UTC no formato de 24 horas (`dt.hour`, 0–23), portanto 7:00 aqui corresponde às 7 da manhã (UTC);
- A classe de crime mais recorrente foi o FURTO (THEFT), e sua taxa de prisão é de apenas 11,34%;
- Os principais locais onde ocorreram os crimes foram as ruas;
- Após 2003, a taxa de criminalidade diminuiu rapidamente.