In [1]:
import pandas as pd
import numpy as np

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', None)

## Manipular dados de entrada

### 1. Dados de crimes

In [3]:
# carregar dados
data = pd.read_csv('data/chicagoCrimes10k.csv.bz2', sep=',', encoding='UTF-8') #, index_col='ID')

# embaralhar dados
data = data.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data.shape)

# mostrar exemplos de dados
#data.head()

(9999, 22)


In [4]:
# remover colunas desnecessárias
data.drop(['X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Location'], axis=1, inplace=True)
#data.head()

In [5]:
# modificar nomes das colunas
cols = {}
for col in data.columns:
    newcol = col.replace(' ', '_')
    cols[col] = newcol
cols['Date'] = 'Date_Time'
data.rename(columns=cols, inplace=True)
#data.head()

In [6]:
# alterar valores dos booleanos
for col in ['Arrest', 'Domestic']:
    data[col] = data[col].map({True: 'YES', False: 'NO'})

In [7]:
# alterar os IDs dos registros
data['ID'] -= 21689

In [8]:
# definir campo chave
data.set_index('ID', inplace=True)

In [9]:
data.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9922073,HY132119,01/28/2015 06:29:34 PM,008XX W 115TH ST,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,STREET,YES,NO,524,5.0,34,53,18,41.685,-87.6435
9928179,HY138758,02/03/2015 01:17:57 PM,061XX N WOLCOTT AVE,820,THEFT,$500 AND UNDER,"SCHOOL, PUBLIC, BUILDING",NO,NO,2413,24.0,40,2,06,41.9929,-87.6773
9915870,HY125922,01/23/2015 03:18:30 PM,014XX S LUMBER ST,1350,CRIMINAL TRESPASS,TO STATE SUP LAND,OTHER RAILROAD PROP / TRAIN DEPOT,YES,NO,124,1.0,2,28,26,41.8636,-87.6352
9920228,HY130226,01/26/2015 10:30:42 AM,013XX W BELMONT AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,PARKING LOT/GARAGE(NON.RESID.),NO,NO,1933,19.0,32,6,14,41.9397,-87.6625
9920269,HY130368,01/27/2015 07:55:30 AM,028XX W 79TH ST,560,ASSAULT,SIMPLE,STREET,NO,NO,835,8.0,18,70,08A,41.7501,-87.694


In [10]:
# definir colunas como categóricas
for cols in ['Block', 'IUCR', 'Primary_Type', 'Description', 'Location_Description',
             'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code']:
    data[cols] = data[cols].astype('category')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 9922073 to 9928804
Data columns (total 16 columns):
Case_Number             9999 non-null object
Date_Time               9999 non-null object
Block                   9999 non-null category
IUCR                    9999 non-null category
Primary_Type            9999 non-null category
Description             9999 non-null category
Location_Description    9993 non-null category
Arrest                  9999 non-null category
Domestic                9999 non-null category
Beat                    9999 non-null category
District                9837 non-null category
Ward                    9999 non-null category
Community_Area          9999 non-null category
FBI_Code                9999 non-null category
Latitude                9837 non-null float64
Longitude               9837 non-null float64
dtypes: category(12), float64(2), object(2)
memory usage: 718.9+ KB


In [12]:
data.describe()

Unnamed: 0,Latitude,Longitude
count,9837.0,9837.0
mean,41.8426,-87.6741
std,0.086,0.06
min,41.6451,-87.9065
25%,41.7685,-87.7195
50%,41.8587,-87.6701
75%,41.9056,-87.6293
max,42.0226,-87.5248


In [13]:
data.describe(include=['O'])

Unnamed: 0,Case_Number,Date_Time
count,9999,9999
unique,9999,9572
top,HY139542,01/24/2015 06:00:17 PM
freq,1,3


In [14]:
data.describe(include=['category'])

Unnamed: 0,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code
count,9999,9999,9999,9999,9993,9999,9999,9999,9837.0,9999,9999,9999
unique,6518,213,27,199,91,2,2,274,22.0,50,77,25
top,001XX N STATE ST,820,THEFT,SIMPLE,STREET,NO,NO,511,11.0,28,25,6
freq,34,933,2034,1112,2488,7071,8476,101,758.0,533,699,2034


### Dados de temperatura

In [15]:
# carregar dados
data2 = pd.read_csv('data/chicagoAllWeather.csv.bz2', sep=',', index_col='date')

# embaralhar dados
data2 = data2.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data2.shape)

# mostrar exemplos de dados
data2.head()

(5162, 6)


Unnamed: 0_level_0,month,day,year,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4/10/10,4,10,2010,74.0,56.0,38.0
2/25/01,2,25,2001,55.0,42.0,28.0
5/15/11,5,15,2011,46.0,44.0,41.0
9/29/14,9,29,2014,83.0,68.0,53.0
12/28/04,12,28,2004,41.0,32.0,24.0


In [16]:
# remover colunas desnecessárias
data2.drop(['month', 'day', 'year'], axis=1, inplace=True)
data2.head()

Unnamed: 0_level_0,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4/10/10,74.0,56.0,38.0
2/25/01,55.0,42.0,28.0
5/15/11,46.0,44.0,41.0
9/29/14,83.0,68.0,53.0
12/28/04,41.0,32.0,24.0


In [17]:
data2.head()

Unnamed: 0_level_0,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4/10/10,74.0,56.0,38.0
2/25/01,55.0,42.0,28.0
5/15/11,46.0,44.0,41.0
9/29/14,83.0,68.0,53.0
12/28/04,41.0,32.0,24.0


In [18]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5162 entries, 4/10/10 to 7/26/12
Data columns (total 3 columns):
maxTemp     5149 non-null float64
meanTemp    5149 non-null float64
minTemp     5149 non-null float64
dtypes: float64(3)
memory usage: 141.1+ KB


In [19]:
data2.describe()

Unnamed: 0,maxTemp,meanTemp,minTemp
count,5149.0,5149.0,5149.0
mean,58.871,50.3104,41.4813
std,21.483,19.9302,19.0207
min,-2.0,-9.0,-18.0
25%,41.0,34.0,28.0
50%,61.0,51.0,42.0
75%,78.0,68.0,57.0
max,103.0,93.0,82.0


### Dados de censo

In [20]:
# carregar dados
data3 = pd.read_csv('data/chicagoCensus.csv.bz2', sep=',', encoding='UTF-8') #, index_col='ID')

# embaralhar dados
data3 = data3.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data3.shape)

# mostrar exemplos de dados
data3.head()

(78, 9)


Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
6,7.0,Lincoln Park,0.8,12.3,5.1,3.6,21.5,71551,2.0
18,19.0,Belmont Cragin,10.8,18.7,14.6,37.3,37.3,15461,70.0
11,12.0,Forest Glen,1.1,7.5,6.8,4.9,40.5,44164,11.0
69,70.0,Ashburn,4.0,10.4,11.7,17.7,36.9,23482,37.0
9,10.0,Norwood Park,2.0,5.4,9.0,11.5,39.5,32875,21.0


In [21]:
# inserir valores faltantes
data3.loc[data3['Community Area Number'].isnull(), 'Community Area Number'] = 0
data3.fillna(0, inplace=True)

In [22]:
# alterar tipo de dados das colunas
for cols in ['Community Area Number', 'HARDSHIP INDEX']:
    data3[cols] = data3[cols].astype('int64')

In [23]:
# definir campo chave
data3.set_index('Community Area Number', inplace=True)

In [24]:
data3.tail()

Unnamed: 0_level_0,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
Community Area Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
37,Fuller Park,3.2,51.2,33.9,26.6,44.9,10432,97
5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6
46,South Chicago,4.7,29.8,19.7,26.6,41.1,16579,75
72,Beverly,0.9,5.1,8.0,3.7,40.5,39523,12
42,Woodlawn,2.9,30.7,23.4,16.5,36.1,18672,58


In [25]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 7 to 42
Data columns (total 8 columns):
COMMUNITY AREA NAME                             78 non-null object
PERCENT OF HOUSING CROWDED                      78 non-null float64
PERCENT HOUSEHOLDS BELOW POVERTY                78 non-null float64
PERCENT AGED 16+ UNEMPLOYED                     78 non-null float64
PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA    78 non-null float64
PERCENT AGED UNDER 18 OR OVER 64                78 non-null float64
PER CAPITA INCOME                               78 non-null int64
HARDSHIP INDEX                                  78 non-null int64
dtypes: float64(5), int64(2), object(1)
memory usage: 5.2+ KB


## Gerar dados de treino e de teste

In [26]:
# dividir os dados de treino e teste
divisao = int(data.shape[0] * 0.85)
train = data[:divisao]
test = data[divisao:]
print(train.shape, test.shape)

(8499, 16) (1500, 16)


In [27]:
train.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9922073,HY132119,01/28/2015 06:29:34 PM,008XX W 115TH ST,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,STREET,YES,NO,524,5.0,34,53,18,41.685,-87.6435
9928179,HY138758,02/03/2015 01:17:57 PM,061XX N WOLCOTT AVE,820,THEFT,$500 AND UNDER,"SCHOOL, PUBLIC, BUILDING",NO,NO,2413,24.0,40,2,06,41.9929,-87.6773
9915870,HY125922,01/23/2015 03:18:30 PM,014XX S LUMBER ST,1350,CRIMINAL TRESPASS,TO STATE SUP LAND,OTHER RAILROAD PROP / TRAIN DEPOT,YES,NO,124,1.0,2,28,26,41.8636,-87.6352
9920228,HY130226,01/26/2015 10:30:42 AM,013XX W BELMONT AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,PARKING LOT/GARAGE(NON.RESID.),NO,NO,1933,19.0,32,6,14,41.9397,-87.6625
9920269,HY130368,01/27/2015 07:55:30 AM,028XX W 79TH ST,560,ASSAULT,SIMPLE,STREET,NO,NO,835,8.0,18,70,08A,41.7501,-87.694


In [28]:
test.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9934020,HY144513,02/08/2015 05:30:47 PM,071XX S EAST END AVE,486,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENTIAL YARD (FRONT/BACK),NO,YES,324,3.0,8,43,08B,41.7652,-87.5838
9933471,HY143859,02/08/2015 01:55:35 AM,010XX N DEARBORN ST,460,BATTERY,SIMPLE,CONVENIENCE STORE,NO,NO,1824,18.0,42,8,08B,41.9017,-87.6301
9932646,HY142651,02/07/2015 02:00:21 AM,017XX S STATE ST,1210,DECEPTIVE PRACTICE,THEFT OF LABOR/SERVICES,POLICE FACILITY/VEH PARKING LOT,YES,NO,131,1.0,3,33,11,41.8584,-87.6274
9930137,HY140334,02/05/2015 08:46:52 AM,035XX W 71ST ST,910,MOTOR VEHICLE THEFT,AUTOMOBILE,RESIDENTIAL YARD (FRONT/BACK),NO,NO,831,8.0,18,66,07,41.7642,-87.7105
9921675,HY131654,01/27/2015 10:00:18 PM,062XX S WHIPPLE ST,820,THEFT,$500 AND UNDER,STREET,NO,NO,823,8.0,15,66,06,41.7798,-87.6994


## Gerar dados de exemplo

In [29]:
example = test[['Arrest']]
example['Arrest'] = example.index.map(lambda x: 'NO' if (x % 3 == 0) else 'YES')
example.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Arrest
ID,Unnamed: 1_level_1
9934020,NO
9933471,NO
9932646,NO
9930137,YES
9921675,NO


In [30]:
example.describe()

Unnamed: 0,Arrest
count,1500
unique,2
top,YES
freq,1001


## Gerar arquivos CSV

In [31]:
# gerar arquivos CSV
train.to_csv('chicago-train.csv')
test.drop(['Arrest'], axis=1).to_csv('chicago-test.csv')
test[['Arrest']].to_csv('chicago-solution.csv')
example.to_csv('chicago-example.csv')

In [32]:
# gerar arquivos adicionais (de apoio)
data2.to_csv('chicago-weather.csv')
data3.to_csv('chicago-census.csv')

In [33]:
!head chicago-*.csv

==> chicago-census.csv <==
Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME ,HARDSHIP INDEX
7,Lincoln Park,0.8,12.3,5.1,3.6,21.5,71551,2
19,Belmont Cragin,10.8,18.7,14.6,37.3,37.3,15461,70
12,Forest Glen,1.1,7.5,6.8,4.9,40.5,44164,11
70,Ashburn,4.0,10.4,11.7,17.7,36.9,23482,37
10,Norwood Park,2.0,5.4,9.0,11.5,39.5,32875,21
40,Washington Park,5.6,42.1,28.6,25.4,42.8,13785,88
58,Brighton Park,14.4,23.6,13.9,45.1,39.3,13089,84
74,Mount Greenwood,1.0,3.4,8.7,4.3,36.8,34381,16
45,Avalon Park,1.4,17.2,21.1,10.6,39.3,24454,41

==> chicago-example.csv <==
ID,Arrest
9934020,NO
9933471,NO
9932646,NO
9930137,YES
9921675,NO
9916441,YES
9927469,YES
9931231,YES
9928574,YES

==> chicago-solution.csv <==
ID,Arrest
9934020,NO
9933471,NO
9932646,YES
9930137,NO
9921675,NO
9916441,NO
9927469,NO
9931