In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from scipy import stats
!pip install --upgrade scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-0.23.2-cp38-cp38-win_amd64.whl (6.8 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.23.1
    Uninstalling scikit-learn-0.23.1:
      Successfully uninstalled scikit-learn-0.23.1
Successfully installed scikit-learn-0.23.2


In [2]:
# Load the police use-of-force dataset from the CSV file and preview its first rows.
police0 = pd.read_csv('police_use_of_force.csv')
police0.head()

Unnamed: 0,X,Y,PoliceUseOfForceID,CaseNumber,ResponseDate,Problem,Is911Call,PrimaryOffense,SubjectInjury,ForceReportNumber,...,TotalCityCallsForYear,TotalPrecinctCallsForYear,TotalNeighborhoodCallsForYear,CenterGBSID,CenterLatitude,CenterLongitude,CenterX,CenterY,DateAdded,OBJECTID
0,-93.273141,44.980808,15928382,08-000149,2008/01/01 01:29:47+00,Code 3,No,MISC,,1,...,322402,46998.0,23458,17121,44.980808,-93.273141,-10383120.0,5618501.0,2020/11/02 08:18:49+00,1
1,-93.251092,44.961813,15928383,08-012774,2008/01/13 03:21:52+00,Suspicious Vehicle,No,FLEE,No,2,...,322402,84018.0,10316,17023,44.961813,-93.251092,-10380660.0,5615512.0,2020/11/02 08:18:49+00,2
2,-93.266112,44.974295,15928384,08-019237,2008/01/20 03:47:57+00,Unwanted Person,Yes,OBSTRU,No,3,...,322402,46998.0,23458,21739,44.974295,-93.266112,-10382340.0,5617476.0,2020/11/02 08:18:49+00,3
3,-93.295639,45.000883,15928385,08-030704,2008/02/01 06:15:20+00,Attempt Pick-Up,No,CHLDAB,,2,...,322402,80434.0,15344,22445,45.000883,-93.295639,-10385620.0,5621661.0,2020/11/02 08:18:49+00,4
4,-93.290726,45.013029,15928386,08-038956,2008/02/09 03:49:09+00,Neighbor Trouble,Yes,OBSTRU,Yes,2,...,322402,80434.0,13679,25902,45.013029,-93.290726,-10385080.0,5623573.0,2020/11/02 08:18:49+00,5


In [3]:
police0.shape

(33257, 30)

# Minerando dados

In [4]:
# Remove the columns that are not useful for the model: raw coordinates,
# record identifiers / timestamps, subject-role codes and the fine-grained
# ForceTypeAction (the coarser ForceType column is kept).
colunas_descartadas = [
    'X', 'Y', 'ForceTypeAction', 'PoliceUseOfForceID', 'CaseNumber',
    'ResponseDate', 'ForceReportNumber', 'SubjectRole', 'SubjectRoleNumber',
    'CenterGBSID', 'OBJECTID', 'DateAdded', 'CenterX', 'CenterY',
    'CenterLatitude', 'CenterLongitude',
]
police1 = police0.drop(columns=colunas_descartadas)

**Variáveis retiradas**
* Dados relacionados a latitude e longitude foram desconsiderados, uma vez que para a análise os dados de localização baseado em distrito policial e bairro são mais proveitosos para que a análise não se torne muito complexa, uma vez que o intuito do modelo não é ver atitudes policiais em pontos muito específicos.
* As features PoliceUseOfForceID, CaseNumber, ResponseDate, ForceReportNumber, SubjectRole, SubjectRoleNumber, CenterGBSID, OBJECTID, DateAdded foram desconsideradas por se tratarem de códigos policiais que não são necessários para a análise.
* A feature ForceTypeAction foi desconsiderada, pois serão considerados apenas os casos genéricos de "agressões" descritos na feature ForceType.

In [5]:
police1.head()

Unnamed: 0,Problem,Is911Call,PrimaryOffense,SubjectInjury,ForceType,Race,Sex,EventAge,TypeOfResistance,Precinct,Neighborhood,TotalCityCallsForYear,TotalPrecinctCallsForYear,TotalNeighborhoodCallsForYear
0,Code 3,No,MISC,,Bodily Force,White,Male,39.0,Commission of Crime,1,Downtown West,322402,46998.0,23458
1,Suspicious Vehicle,No,FLEE,No,Bodily Force,Black,Male,30.0,Fled in Vehicle,3,Ventura Village,322402,84018.0,10316
2,Unwanted Person,Yes,OBSTRU,No,Bodily Force,Black,Male,40.0,Commission of Crime,1,Downtown West,322402,46998.0,23458
3,Attempt Pick-Up,No,CHLDAB,,Bodily Force,Black,Female,35.0,Commission of Crime,4,Jordan,322402,80434.0,15344
4,Neighbor Trouble,Yes,OBSTRU,Yes,Bodily Force,Black,Male,46.0,Tensed,4,Hawthorne,322402,80434.0,13679


**Descrevendo variáveis restantes**

* Problem: Tipo de problema
* Is911Call: se o caso foi denúncia por ligação
    - No: Não foi denúncia por ligação
    - Yes: foi denúncia por ligação
* PrimaryOffense: Infração principal atribuída ao indivíduo na ocorrência
* SubjectInjury: Se o indivíduo já estava machucado antes da intervenção policial
    - No: Não estava machucado
    - Yes: Estava machucado
* ForceType: Tipo de agressão
    - Bodily Force                  
    - Taser                         
    - Chemical Irritant              
    - Gun Point Display              
    - Improvised Weapon              
    - Police K9 Bite                 
    - Baton                          
    - Firearm                       
    - Maximal Restraint Technique    
    - Less Lethal Projectile         
    - Less Lethal      
* Race: Tipo de raça
* Sex: Tipo de sexo
* EventAge: Idade do indivíduo
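* TypeOfResistance: Tipo de resistência apresentada pelo indivíduo (ex.: Tensed, Fled in Vehicle, Verbal Non-Compliance)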
* Precinct: Distrito Policial
    - 01
    - 02
    - 03
    - 04
    - 05
* Neighborhood: Bairro
* TotalCityCallsForYear: Total de ligação por ano da cidade
* TotalPrecinctCallsForYear: Total de ligações por ano do distrito
* TotalNeighborhoodCallsForYear: Total de ligações por ano do bairro

Iremos usar a variável *ForceType* como target, pois ela nos dá os tipos de força usada por policiais.

## Tabelas comparativas entre a target e as demais features considerando todos os valores

In [6]:
# Cross-tabulation of force type by race; normalize='index' turns each
# ForceType row into proportions, rounded to 3 decimals.
pd.crosstab(police1['ForceType'], police1['Race'], normalize = 'index').round(3)

Race,Asian,Black,Native American,Other / Mixed Race,Pacific Islander,Unknown,White,not recorded
ForceType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Baton,0.034,0.603,0.052,0.069,0.0,0.052,0.19,0.0
Bodily Force,0.013,0.615,0.054,0.036,0.0,0.017,0.258,0.007
Chemical Irritant,0.022,0.651,0.029,0.043,0.0,0.054,0.166,0.035
Firearm,0.023,0.523,0.023,0.0,0.0,0.0,0.205,0.227
Gun Point Display,0.018,0.699,0.064,0.057,0.0,0.0,0.155,0.007
Improvised Weapon,0.003,0.669,0.057,0.037,0.0,0.014,0.22,0.0
Less Lethal,0.0,0.104,0.104,0.0,0.0,0.438,0.354,0.0
Less Lethal Projectile,0.0,0.438,0.062,0.125,0.0,0.0,0.375,0.0
Maximal Restraint Technique,0.0,0.618,0.056,0.0,0.014,0.049,0.264,0.0
Police K9 Bite,0.007,0.635,0.072,0.041,0.0,0.007,0.232,0.007


In [7]:
# Cross-tabulation of force type by sex; each ForceType row is expressed
# as proportions (normalize='index'), rounded to 3 decimals.
pd.crosstab(police1['ForceType'], police1['Sex'], normalize = 'index').round(3)

Sex,Female,Male,Unknown,not recorded
ForceType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Baton,0.069,0.897,0.034,0.0
Bodily Force,0.114,0.88,0.003,0.003
Chemical Irritant,0.256,0.684,0.027,0.032
Firearm,0.045,0.864,0.0,0.091
Gun Point Display,0.014,0.986,0.0,0.0
Improvised Weapon,0.089,0.911,0.0,0.0
Less Lethal,0.042,0.688,0.271,0.0
Less Lethal Projectile,0.0,1.0,0.0,0.0
Maximal Restraint Technique,0.208,0.792,0.0,0.0
Police K9 Bite,0.017,0.976,0.003,0.003


In [8]:
# Cross-tabulation of force type by type of resistance; each ForceType row
# is expressed as proportions (normalize='index'), rounded to 3 decimals.
pd.crosstab(police1['ForceType'], police1['TypeOfResistance'], normalize = 'index').round(3)

TypeOfResistance,Assaulted Officer,Assaulted Officer,Assaulted Police Horse,Assaulted Police K9,Assaulting Police Horse,Assaulting Police K9,COMMISSION OF CRIME,Commission of Crime,Commission of a Crime,Fled in Vehicle,...,Other,TENSED,Tensed,Tensed,Unspecified,Verbal Non-Compliance,Verbal Non-Compliance,commission of crime,tensed,verbal non-compliance
ForceType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baton,0.103,0.017,0.0,0.0,0.0,0.0,0.0,0.345,0.0,0.017,...,0.0,0.0,0.155,0.0,0.103,0.069,0.0,0.0,0.0,0.0
Bodily Force,0.104,0.021,0.0,0.0,0.002,0.0,0.0,0.148,0.018,0.026,...,0.032,0.0,0.275,0.049,0.068,0.07,0.017,0.0,0.0,0.0
Chemical Irritant,0.034,0.008,0.0,0.0,0.002,0.0,0.0,0.393,0.042,0.003,...,0.03,0.0,0.104,0.007,0.104,0.199,0.028,0.004,0.0,0.0
Firearm,0.209,0.0,0.0,0.0,0.0,0.0,0.0,0.372,0.0,0.023,...,0.0,0.0,0.14,0.0,0.163,0.047,0.0,0.0,0.0,0.0
Gun Point Display,0.055,0.0,0.0,0.0,0.0,0.0,0.0,0.269,0.0,0.077,...,0.0,0.002,0.169,0.0,0.091,0.148,0.0,0.0,0.0,0.0
Improvised Weapon,0.134,0.02,0.0,0.0,0.003,0.0,0.0,0.178,0.015,0.087,...,0.02,0.0,0.155,0.0,0.05,0.076,0.006,0.0,0.0,0.0
Less Lethal,0.0,0.578,0.022,0.0,0.0,0.0,0.0,0.0,0.089,0.0,...,0.244,0.0,0.0,0.0,0.0,0.0,0.022,0.0,0.0,0.0
Less Lethal Projectile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.438,0.0,0.0,...,0.0,0.0,0.0,0.0,0.188,0.188,0.0,0.0,0.0,0.0
Maximal Restraint Technique,0.0,0.191,0.0,0.0,0.0,0.0,0.0,0.0,0.061,0.0,...,0.351,0.0,0.0,0.214,0.0,0.0,0.107,0.0,0.0,0.0
Police K9 Bite,0.014,0.0,0.0,0.0,0.0,0.024,0.0,0.14,0.01,0.16,...,0.007,0.0,0.01,0.0,0.126,0.048,0.003,0.0,0.0,0.0


In [17]:
# Cross-tabulation of force type by age (one column per distinct EventAge
# value); each ForceType row is expressed as proportions, rounded to 3 decimals.
pd.crosstab(police1['ForceType'], police1['EventAge'], normalize = 'index').round(3)

EventAge,0.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,...,64.0,65.0,66.0,67.0,70.0,71.0,72.0,73.0,74.0,82.0
ForceType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Baton,0.018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055,0.036,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bodily Force,0.004,0.0,0.0,0.0,0.0,0.004,0.007,0.014,0.021,0.025,...,0.001,0.001,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Chemical Irritant,0.054,0.0,0.0,0.0,0.0,0.004,0.008,0.017,0.013,0.014,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Firearm,0.167,0.0,0.0,0.0,0.0,0.024,0.0,0.0,0.071,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gun Point Display,0.0,0.0,0.0,0.0,0.0,0.005,0.008,0.011,0.019,0.027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Improvised Weapon,0.003,0.0,0.0,0.0,0.0,0.0,0.013,0.013,0.032,0.016,...,0.0,0.0,0.003,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Less Lethal,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Less Lethal Projectile,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Maximal Restraint Technique,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Police K9 Bite,0.004,0.0,0.0,0.0,0.007,0.0,0.019,0.026,0.048,0.048,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**Limpeza de valores ausentes (NaN)**

In [20]:
# Keep only the rows that have no missing (NaN) value in any column.
# dropna() returns a new frame, so police1 is left untouched.
police2 = police1.dropna()
police2.shape

(19262, 14)

In [21]:
police2.head()

Unnamed: 0,Problem,Is911Call,PrimaryOffense,SubjectInjury,ForceType,Race,Sex,EventAge,TypeOfResistance,Precinct,Neighborhood,TotalCityCallsForYear,TotalPrecinctCallsForYear,TotalNeighborhoodCallsForYear
1,Suspicious Vehicle,No,FLEE,No,Bodily Force,Black,Male,30.0,Fled in Vehicle,3,Ventura Village,322402,84018.0,10316
2,Unwanted Person,Yes,OBSTRU,No,Bodily Force,Black,Male,40.0,Commission of Crime,1,Downtown West,322402,46998.0,23458
4,Neighbor Trouble,Yes,OBSTRU,Yes,Bodily Force,Black,Male,46.0,Tensed,4,Hawthorne,322402,80434.0,13679
5,Domestic Abuse-In Progress,Yes,DASLT5,No,Bodily Force,Black,Male,36.0,Tensed,4,Cleveland,322402,80434.0,2992
6,Suspicious Person,No,DISCON,No,Bodily Force,Black,Male,34.0,Tensed,1,Downtown West,322402,46998.0,23458


A limpeza acima não interfere significativamente nos resultados, já que ainda continuamos com muitos dados (19262 das 33257 linhas originais).

## Tabelas comparativas entre a target e as demais features depois da limpeza

In [None]:
# Same force-type-by-race table as before, now on the NaN-free data (police2).
pd.crosstab(police2['ForceType'], police2['Race'], normalize = 'index').round(3)

In [None]:
# Same force-type-by-sex table as before, now on the NaN-free data (police2).
pd.crosstab(police2['ForceType'], police2['Sex'], normalize = 'index').round(3)

In [None]:
# Same force-type-by-resistance table as before, now on the NaN-free data (police2).
pd.crosstab(police2['ForceType'], police2['TypeOfResistance'], normalize = 'index').round(3)

In [None]:
# Same force-type-by-age table as before, now on the NaN-free data (police2).
pd.crosstab(police2['ForceType'], police2['EventAge'], normalize = 'index').round(3)

**Depois de analisar os dados, percebe-se que os valores não mudam significativamente depois da limpeza dos valores Nan. Logo, a limpeza mostra-se proveitosa.**

In [None]:
# Relative frequency of each force type in the target column.
police2['ForceType'].value_counts(normalize=True)

In [None]:
# Distinct force types in order of first appearance
# (dict keys keep insertion order, so duplicates collapse without reordering).
lista_ft = list(dict.fromkeys(police2['ForceType']))

lista_ft

### Análise Exploratória

In [None]:
# Population share per race, keyed by race name. The shares are stored as
# strings in the dict and converted to float in the DataFrame below.
# NOTE(review): judging by the name these are 2010 census figures; the source is not shown here.
census_2010 = {'Asian': '0.061', 'Black': '0.194', 'Native American': '0.014', 'Other / Mixed Race': '0.152', 'White': '0.638'}

# One row per race, single float column 'Percentage'.
census_2010_df = (
    pd.Series(census_2010, name='Percentage')
    .astype(float)
    .to_frame()
)

In [None]:
# Order the census table alphabetically by race and display it with a
# colour gradient on the 'Percentage' column.
census_2010_df = census_2010_df.sort_index(ascending=True)
census_2010_df.style.background_gradient(cmap='Purples', subset=['Percentage'])


In [None]:
# Share of use-of-force records per race: records per race divided by the
# total number of records in police2.
contagem_por_raca = police2.groupby(['Race'])[['ForceType']].count()
force_race = (contagem_por_raca / len(police2)).reset_index()
force_race.sort_values(by='ForceType', ascending=False).style.background_gradient(cmap='summer', subset=['ForceType'])

In [None]:

# Regroup the use-of-force shares into the census categories: drop 'Unknown',
# fold every race outside the four named ones into 'Other / Mixed Race',
# then add up the shares per category.
racas_nomeadas = ['White', 'Black', 'Asian', 'Native American']
tmp = force_race.loc[force_race['Race'] != 'Unknown'].copy()
tmp['Race'] = tmp['Race'].where(tmp['Race'].isin(racas_nomeadas), 'Other / Mixed Race')
tmp = tmp.groupby(['Race'], as_index=False)['ForceType'].sum()
tmp

In [None]:
# Compare, per race, the share of the population (census) with the share of
# use-of-force records, as side-by-side bars on one common scale.

# Bar heights come from the float 'Percentage' column: the census_2010 dict
# stores the shares as strings, which matplotlib would treat as categories
# instead of numbers.
shares_censo = census_2010_df['Percentage']
# Align the use-of-force shares with the census categories by name; a category
# with no record gets 0 instead of shifting the remaining bars.
shares_forca = tmp.set_index('Race')['ForceType'].reindex(shares_censo.index, fill_value=0)

X = np.arange(len(shares_censo))
plt.rcParams['xtick.labelsize'] = 10  # font size of the x tick labels
plt.rcParams['ytick.labelsize'] = 10  # font size of the y tick labels
fig, ax = plt.subplots(figsize=(8, 5))
# Both series are proportions, so they share a single y axis; a twin axis
# would rescale each series independently and distort the comparison.
ax.bar(X + 0.1, shares_censo.values, width=0.2, color='b', align='center', label='População')
ax.bar(X - 0.1, shares_forca.values, width=0.2, color='g', align='center', label='Uso de força')
ax.set_xticks(X)
ax.set_xticklabels(shares_censo.index)
ax.set_ylabel('Proporção')
ax.set_title("Uso de força por raça", fontsize=17)
# Labels are attached to the bars above; passing a bare string to legend()
# would be read as a sequence of one-character labels.
ax.legend()
plt.show()

In [None]:
police2['Race'].value_counts()

**Verificando graficamente, como se comporta a target**

In [None]:
# Data for the bar chart: one label per force type and its absolute count,
# in the order of lista_ft.
labels = lista_ft
# Count once and look each label up, instead of recounting the whole column
# on every iteration.
contagens = police2['ForceType'].value_counts()
frequency = [contagens[i] for i in lista_ft]

In [None]:
# Horizontal bar chart with the absolute frequency of each force type.
fig, ax = plt.subplots(figsize=(8, 4))
ax.barh(labels, frequency, color='#0099ff')
ax.set_title('Frequência absoluta dos tipos de forças')
ax.set_xlabel('Quantidade')
plt.show()

*Pelo gráfico acima, é possível identificar que o tipo de violência policial mais comum é a causada por força corporal, fato que pode ser constatado por acontecimentos recentes como o caso de George Floyd que foi sufocado por um policial: https://g1.globo.com/mundo/noticia/2020/05/29/preso-policial-suspeito-de-participar-da-morte-de-george-floyd-em-minneapolis-diz-imprensa-dos-eua.ghtml*

**Análise qualitativa da feature Race e a target ForceType**

In [None]:
# Relative frequency of each force type (target ForceType) within each race
# (feature Race), shown as stacked bars.
cross = pd.crosstab(police2["Race"], police2["ForceType"], normalize="index")
plot = cross.plot(kind='bar', stacked=True, title='Violência por Raça')
ax = plot  # DataFrame.plot returns the axes it drew on
ax.set_ylabel('Freq.Relativa (em%)')

# Shrink the axes to 80% of their width and place the legend in the freed
# space, outside the plot area.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

*O gráfico permite entender que BodilyForce é o tipo mais comum de violência policial entre as raças.*

In [None]:
#Não seria bom tirar o pacific Islander??

**Análise qualitativa da feature Sex e a target ForceType**

In [None]:
# Relative frequency of each force type (target ForceType) within each sex
# (feature Sex), shown as stacked bars.
cross = pd.crosstab(police2["Sex"], police2["ForceType"], normalize="index")
plot = cross.plot(kind='bar', stacked=True, title='Violência por Sexo')
ax = plot  # DataFrame.plot returns the axes it drew on
ax.set_ylabel('Freq.Relativa (em%)')

# Shrink the axes to 80% of their width and place the legend in the freed
# space, outside the plot area.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

**Análise qualitativa da feature Precinct e a target ForceType**

In [None]:
# Relative frequency of each force type (target ForceType) within each
# police precinct (feature Precinct), shown as stacked bars.
cross = pd.crosstab(police2["Precinct"], police2["ForceType"], normalize="index")
plot = cross.plot(kind='bar', stacked=True, title='Violência por distrito')
ax = plot  # DataFrame.plot returns the axes it drew on
ax.set_ylabel('Freq.Relativa (em%)')

# Shrink the axes to 80% of their width and place the legend in the freed
# space, outside the plot area.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

**Análise qualitativa da feature SubjectInjury e a target ForceType**

In [None]:
# Relative frequency of each force type (target ForceType) for each value of
# the SubjectInjury feature, shown as stacked bars.
cross = pd.crosstab(police2["SubjectInjury"], police2["ForceType"], normalize="index")
plot = cross.plot(kind='bar', stacked=True, title='Tipo de violência considerando se já havia lesão ou não')
ax = plot  # DataFrame.plot returns the axes it drew on
ax.set_ylabel('Freq.Relativa (em%)')

# Shrink the axes to 80% of their width and place the legend in the freed
# space, outside the plot area.
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

#### Análise de Features Quantitativas 

In [None]:
# For every force type: one figure with a density histogram of each numeric
# feature, plus the printed mean and variance of that feature.
# Each entry: (subplot position in a 3x3 grid, column, label used in the printout).
paineis = [
    (331, 'EventAge', 'Idade'),
    (333, 'TotalCityCallsForYear', 'City Calls'),
    (337, 'TotalPrecinctCallsForYear', 'Precint Calls'),
    (339, 'TotalNeighborhoodCallsForYear', 'Neighbourhood Calls'),
]

for ft in lista_ft:
    plt.figure(figsize=(20, 10))

    # Rows whose target equals the current force type.
    df_ = police2[police2['ForceType'] == ft]
    print('----------------------------------------------')
    print(ft)

    for posicao, coluna, rotulo in paineis:
        valores = df_[coluna]
        plt.subplot(posicao)
        plt.hist(valores, edgecolor='white', density=True)
        plt.ylabel('densidade')
        plt.title('{} for {}'.format(coluna, ft))
        print(f'Média {rotulo}: {valores.mean()} Variância {rotulo}: {valores.var()}')

    plt.show()

# Pré processamento

In [None]:
# One-hot encode the categorical features. Each key is a column to encode and
# each value is the prefix given to the dummy columns generated from it.
prefixos = {
    'Problem': 'Prob',
    'Is911Call': '911',
    'PrimaryOffense': 'PriOff',
    'SubjectInjury': 'SubInj',
    'Race': 'Race',
    'Sex': 'Sex',
    'TypeOfResistance': 'TyOfRes',
    'Precinct': 'Precinct',
    'Neighborhood': 'Nbh',
}
police3 = pd.get_dummies(police2, prefix=prefixos, columns=list(prefixos))
police3

In [None]:
# Split the encoded frame positionally: column 0 is taken as the target and
# every remaining column as a feature.
# NOTE(review): this relies on ForceType being the first column of police3 — confirm.
y = police3.iloc[:, 0].to_numpy()
X = police3.iloc[:, 1:].to_numpy()
print(y)
print(X)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers; `le` keeps the fitted mapping
# (le.classes_) so the codes can be translated back later.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

In [None]:
# Hold out 20% of the rows as the test split; random_state fixes the shuffle
# so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)

In [None]:
print(X_train)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the first four feature columns.
# NOTE(review): presumably these are the numeric, non-dummy features — confirm.
# The scaler is fitted on the training split alone and then applied to both
# splits, so no test-set statistics leak into training.
primeiras_quatro = slice(0, 4)
sc = StandardScaler().fit(X_train[:, primeiras_quatro])
X_train[:, primeiras_quatro] = sc.transform(X_train[:, primeiras_quatro])
X_test[:, primeiras_quatro] = sc.transform(X_test[:, primeiras_quatro])

# Testando modelo de Regressão Logística

In [None]:
# Fit a logistic-regression classifier on the training split.
# NOTE(review): default solver settings are used and warnings are silenced at
# the top of the notebook, so a convergence warning would go unnoticed — confirm it converges.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

In [None]:
# Predict the test split and show predictions next to the true labels
# (both as integer class codes), one row per test sample.
y_pred = classifier.predict(X_test)
array_logistic = np.column_stack((y_pred, y_test))
df_logistic = pd.DataFrame(array_logistic, columns=['Prediction', 'Test'])
df_logistic

In [None]:
# Confusion matrix (printed) and accuracy (cell output) on the test split.
# NOTE(review): plot_confusion_matrix was removed from scikit-learn in 1.2; with the
# unpinned `--upgrade` install at the top this import may fail — confirm the target version.
from sklearn.metrics import confusion_matrix, accuracy_score, plot_confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Row-normalised confusion matrix of the fitted model on the test split.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# One label per encoded class: the target is multi-class, so a fixed
# two-entry label list does not match the matrix.
indices_classes = np.arange(len(le.classes_))
# Reuse the classifier that is already fitted instead of refitting it here.
# Passing `labels` keeps the matrix square over all classes even if some
# class is absent from the test split.
cm_norm = confusion_matrix(y_test, classifier.predict(X_test),
                           labels=indices_classes, normalize='true')

fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=le.classes_).plot(
    cmap=plt.cm.Blues, ax=ax, values_format='.2%', xticks_rotation='vertical')

# Testando modelo Decision Tree

In [None]:
# Fit an entropy-based decision tree on the training split; modelo_tree
# refers to the same fitted estimator as classifier.
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
modelo_tree = classifier.fit(X_train, y_train)
# TODO (Barbara's suggestion): look at the classifier's feature importances
# https://mljar.com/blog/visualize-decision-tree/

In [None]:
# Predict the test split and print predictions beside the true labels
# (column 0: predicted code, column 1: true code).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix (kept in cm, not printed) and accuracy on the test split.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
#print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Row-normalised confusion matrix of the fitted decision tree on the test split.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# One label per encoded class: the target is multi-class, so a fixed
# two-entry label list does not match the matrix.
indices_classes = np.arange(len(le.classes_))
# Passing `labels` keeps the matrix square over all classes even if some
# class is absent from the test split.
cm_norm = confusion_matrix(y_test, modelo_tree.predict(X_test),
                           labels=indices_classes, normalize='true')

fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=le.classes_).plot(
    cmap=plt.cm.Oranges, ax=ax, values_format='.2%', xticks_rotation='vertical')

# Testando Random Forest

In [None]:
# Fit a random forest of 10 entropy-based trees on the training split;
# modelo_random refers to the same fitted estimator as classifier.
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
modelo_random = classifier.fit(X_train, y_train)
modelo_random

In [None]:
# Predict the test split and print predictions beside the true labels
# (column 0: predicted code, column 1: true code).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

In [None]:
# Confusion matrix (kept in cm, not printed) and accuracy on the test split.
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
#print(cm)
accuracy_score(y_test, y_pred)

In [None]:
# Row-normalised confusion matrix of the fitted random forest on the test split.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# One label per encoded class: the target is multi-class, so a fixed
# two-entry label list does not match the matrix.
indices_classes = np.arange(len(le.classes_))
# Passing `labels` keeps the matrix square over all classes even if some
# class is absent from the test split.
cm_norm = confusion_matrix(y_test, modelo_random.predict(X_test),
                           labels=indices_classes, normalize='true')

fig, ax = plt.subplots(figsize=(7, 6))
ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=le.classes_).plot(
    cmap=plt.cm.Purples, ax=ax, values_format='.2%', xticks_rotation='vertical')

## Tentando melhorar o desempenho 

Trocaremos os ForceType que não são Bodily Force por uma categoria só

In [None]:
# Build a binary target: 'Bodily Force' is kept and every other force type is
# relabelled 'Weapon'.
# The relabelling runs on an explicit copy through .loc, so police3 is never
# modified and no value is assigned on a slice of another frame; the row
# order of police3 is preserved without a split / concat / sort round trip.
police4 = police3.copy()
police4.loc[police4['ForceType'] != 'Bodily Force', 'ForceType'] = 'Weapon'
police4.head(10)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Same preparation as for the multi-class target, now on the binary frame:
# column 0 is taken as the target, the remaining columns as features.
y = police4.iloc[:, 0].to_numpy()
X = police4.iloc[:, 1:].to_numpy()

# Integer-encode the two target labels.
le = LabelEncoder().fit(y)
y = le.transform(y)

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardise the first four feature columns with statistics learned from the
# training split only.
sc = StandardScaler().fit(X_train[:, :4])
X_train[:, :4] = sc.transform(X_train[:, :4])
X_test[:, :4] = sc.transform(X_test[:, :4])

 #### Tentando Logistico mais uma vez

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic regression on the binary target.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Predictions beside the true labels, kept for later inspection.
y_pred = classifier.predict(X_test)
array_logistic = np.column_stack((y_pred, y_test))
df_logistic = pd.DataFrame(array_logistic, columns=['Prediction', 'Test'])

# Confusion matrix (printed) and accuracy (cell output) on the test split.
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

#### Tentando Decision Tree de novo

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree on the binary target.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Column 0: predicted code, column 1: true code.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

# Confusion matrix (kept in cm) and accuracy (cell output) on the test split.
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

#### Tentando Random Forest de novo

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Random forest (10 entropy-based trees) on the binary target.
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Column 0: predicted code, column 1: true code.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

# Confusion matrix (kept in cm) and accuracy (cell output) on the test split.
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

## Testando o cálculo de feature importance - sugestão barbara

### Com todos os ForceType

In [None]:
# Rebuild features and target from the multi-class frame: column 0 is taken
# as the target, every remaining column as a feature.
X = police3.iloc[:, 1:].values
y = police3.iloc[:, 0].values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target labels as integers; `le` keeps the fitted mapping.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

In [None]:
# Standardise the first four feature columns in place. The scaler is fitted
# on the full data set here, since this section has no train/test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[:, 0:4] = sc.fit_transform(X[:, 0:4])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a tree on the full data only to rank the features.
# random_state is fixed so the importances, and the features selected from
# them downstream, are the same on every run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y)

# Unnormalised importances (normalize=False), one value per feature column.
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))

In [None]:
# Pair every feature column name with its importance and show the 13 most
# important features.
df1 = pd.DataFrame({
    'feature_importance': feat_importance,
    'nome': police3.columns[1:],
})
df1.sort_values(by='feature_importance', ascending=False).head(13)

In [None]:
# Keep only the features whose importance exceeds 0.003, followed by the
# target column at the end.
filtra_linhas = df1['feature_importance'] > 0.003
df2 = df1.loc[filtra_linhas, :]

features_list = df2['nome'].tolist() + ['ForceType']

police5 = police3[features_list].copy()
police5.head()

##### Agora vamos tentar ver se melhora

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# The target is the last column of police5; everything before it is a feature.
y = police5.iloc[:, -1].to_numpy()
X = police5.iloc[:, :-1].to_numpy()

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Entropy-based decision tree trained on the selected features only.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Column 0: predicted label, column 1: true label.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

# Confusion matrix (kept in cm) and accuracy (cell output) on the test split.
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

### Com apenas 2 ForceType

In [None]:
# Rebuild features and target from the binary frame: column 0 is taken as
# the target, every remaining column as a feature.
X = police4.iloc[:, 1:].values
y = police4.iloc[:, 0].values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the two target labels as integers; `le` keeps the fitted mapping.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

In [None]:
# Standardise the first four feature columns in place. The scaler is fitted
# on the full data set here, since this section has no train/test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X[:, 0:4] = sc.fit_transform(X[:, 0:4])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a tree on the full binary-target data only to rank the features.
# random_state is fixed so the importances, and the features selected from
# them downstream, are the same on every run.
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X, y)

# Unnormalised importances (normalize=False), one value per feature column.
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
print("feat importance = " + str(feat_importance))

In [None]:
# Pair every feature column name with its importance for the binary target
# and show the full ranking.
df3 = pd.DataFrame({
    'feature_importance': feat_importance,
    'nome': police4.columns[1:],
})
df3.sort_values(by='feature_importance', ascending=False)

In [None]:
# Keep only the features whose importance for the binary target exceeds
# 0.003, followed by the target column at the end.
filtra_linhas = df3['feature_importance'] > 0.003
df4 = df3.loc[filtra_linhas, :]

# Take the names from the filtered frame (df4). Iterating the unfiltered df3
# would keep every feature and make the threshold pointless.
features_list2 = df4['nome'].tolist()
features_list2.append('ForceType')

# Select with the binary-target list (features_list2), not the list computed
# earlier for the multi-class target.
police6 = police4[features_list2].copy()
police6.head()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# The target is the last column of police6; everything before it is a feature.
y = police6.iloc[:, -1].to_numpy()
X = police6.iloc[:, :-1].to_numpy()

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Entropy-based decision tree trained on the selected features only.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Column 0: predicted label, column 1: true label.
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

# Confusion matrix (kept in cm) and accuracy (cell output) on the test split.
cm = confusion_matrix(y_test, y_pred)
accuracy_score(y_test, y_pred)

In [None]:
# Quando ajeitar o problema que tá dando, eu coloco mais mapas de calor e coloco as legendas que precisa ter, por enquanto tá bem estranho!!