In [1]:
import pandas as pd
import numpy as np

In [2]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', None)

## Manipular dados de entrada

### 1. Dados de crimes

In [3]:
# carregar dados
data = pd.read_csv('data/chicagoCrimes10k.csv.bz2', sep=',', encoding='UTF-8') #, index_col='ID')

# embaralhar dados
data = data.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data.shape)

# mostrar exemplos de dados
#data.head()

(9999, 22)


In [4]:
# remover colunas desnecessárias
data.drop(['X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Location'], axis=1, inplace=True)
#data.head()

In [5]:
# modificar nomes das colunas
cols = {}
for col in data.columns:
    newcol = col.replace(' ', '_')
    cols[col] = newcol
cols['Date'] = 'Date_Time'
data.rename(columns=cols, inplace=True)
#data.head()

In [6]:
# alterar valores dos booleanos
for col in ['Arrest', 'Domestic']:
    data[col] = data[col].map({True: 'YES', False: 'NO'})

In [7]:
# alterar os IDs dos registros
data['ID'] -= 21689

In [8]:
# definir campo chave
data.set_index('ID', inplace=True)

In [9]:
data.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9923063,HY133182,01/29/2015 04:53:14 PM,029XX E 87TH ST,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,STREET,YES,NO,423,4.0,10,46,18,41.7372,-87.5531
9927630,HY138122,01/29/2015 09:13:39 PM,044XX S PULASKI RD,1150,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,DEPARTMENT STORE,NO,NO,821,8.0,14,57,11,41.8124,-87.7236
9920228,HY130226,01/26/2015 10:30:42 AM,013XX W BELMONT AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,PARKING LOT/GARAGE(NON.RESID.),NO,NO,1933,19.0,32,6,14,41.9397,-87.6625
9932413,HY142247,02/06/2015 05:40:54 PM,020XX N LAWLER AVE,560,ASSAULT,SIMPLE,APARTMENT,NO,NO,2522,25.0,31,19,08A,41.9176,-87.752
9917884,HY128631,01/25/2015 08:30:38 PM,021XX N MAJOR AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,NO,NO,2515,25.0,29,19,26,41.9185,-87.7681


In [10]:
# definir colunas como categóricas
for cols in ['Block', 'IUCR', 'Primary_Type', 'Description', 'Location_Description',
             'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code']:
    data[cols] = data[cols].astype('category')

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9999 entries, 9923063 to 9932108
Data columns (total 16 columns):
Case_Number             9999 non-null object
Date_Time               9999 non-null object
Block                   9999 non-null category
IUCR                    9999 non-null category
Primary_Type            9999 non-null category
Description             9999 non-null category
Location_Description    9993 non-null category
Arrest                  9999 non-null category
Domestic                9999 non-null category
Beat                    9999 non-null category
District                9837 non-null category
Ward                    9999 non-null category
Community_Area          9999 non-null category
FBI_Code                9999 non-null category
Latitude                9837 non-null float64
Longitude               9837 non-null float64
dtypes: category(12), float64(2), object(2)
memory usage: 718.9+ KB


In [12]:
data.describe()

Unnamed: 0,Latitude,Longitude
count,9837.0,9837.0
mean,41.8426,-87.6741
std,0.086,0.06
min,41.6451,-87.9065
25%,41.7685,-87.7195
50%,41.8587,-87.6701
75%,41.9056,-87.6293
max,42.0226,-87.5248


In [13]:
data.describe(include=['O'])

Unnamed: 0,Case_Number,Date_Time
count,9999,9999
unique,9999,9572
top,HY140645,01/23/2015 11:00:40 PM
freq,1,3


In [14]:
data.describe(include=['category'])

Unnamed: 0,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code
count,9999,9999,9999,9999,9993,9999,9999,9999,9837.0,9999,9999,9999
unique,6518,213,27,199,91,2,2,274,22.0,50,77,25
top,001XX N STATE ST,820,THEFT,SIMPLE,STREET,NO,NO,511,11.0,28,25,6
freq,34,933,2034,1112,2488,7071,8476,101,758.0,533,699,2034


### Dados de temperatura

In [15]:
# carregar dados
data2 = pd.read_csv('data/chicagoAllWeather.csv.bz2', sep=',', index_col='date')

# embaralhar dados
data2 = data2.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data2.shape)

# mostrar exemplos de dados
data2.head()

(5162, 6)


Unnamed: 0_level_0,month,day,year,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5/6/13,5,6,2013,71.0,61.0,50.0
12/14/07,12,14,2007,28.0,23.0,18.0
10/26/14,10,26,2014,64.0,53.0,42.0
1/18/01,1,18,2001,30.0,24.0,19.0
11/2/04,11,2,2004,48.0,47.0,45.0


In [16]:
# remover colunas desnecessárias
data2.drop(['month', 'day', 'year'], axis=1, inplace=True)
data2.head()

Unnamed: 0_level_0,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5/6/13,71.0,61.0,50.0
12/14/07,28.0,23.0,18.0
10/26/14,64.0,53.0,42.0
1/18/01,30.0,24.0,19.0
11/2/04,48.0,47.0,45.0


In [17]:
data2.head()

Unnamed: 0_level_0,maxTemp,meanTemp,minTemp
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5/6/13,71.0,61.0,50.0
12/14/07,28.0,23.0,18.0
10/26/14,64.0,53.0,42.0
1/18/01,30.0,24.0,19.0
11/2/04,48.0,47.0,45.0


In [18]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5162 entries, 5/6/13 to 12/12/09
Data columns (total 3 columns):
maxTemp     5149 non-null float64
meanTemp    5149 non-null float64
minTemp     5149 non-null float64
dtypes: float64(3)
memory usage: 141.1+ KB


In [19]:
data2.describe()

Unnamed: 0,maxTemp,meanTemp,minTemp
count,5149.0,5149.0,5149.0
mean,58.871,50.3104,41.4813
std,21.483,19.9302,19.0207
min,-2.0,-9.0,-18.0
25%,41.0,34.0,28.0
50%,61.0,51.0,42.0
75%,78.0,68.0,57.0
max,103.0,93.0,82.0


### Dados de censo

In [20]:
# carregar dados
data3 = pd.read_csv('data/chicagoCensus.csv.bz2', sep=',', encoding='UTF-8') #, index_col='ID')

# embaralhar dados
data3 = data3.sample(frac=1)

# mostrar quantidade de linhas e colunas
print(data3.shape)

# mostrar exemplos de dados
data3.head()

(78, 9)


Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
67,68.0,Englewood,3.8,46.6,28.0,28.5,42.5,11888,94.0
63,64.0,Clearing,2.7,8.9,9.5,18.8,37.6,25113,29.0
50,51.0,South Deering,4.0,29.2,16.3,21.0,39.5,14685,65.0
12,13.0,North Park,3.9,13.2,9.9,14.4,39.0,26576,33.0


In [21]:
# inserir valores faltantes
data3.loc[data3['Community Area Number'].isnull(), 'Community Area Number'] = 0
data3.fillna(0, inplace=True)

In [22]:
# alterar tipo de dados das colunas
for cols in ['Community Area Number', 'HARDSHIP INDEX']:
    data3[cols] = data3[cols].astype('int64')

In [23]:
# definir campo chave
data3.set_index('Community Area Number', inplace=True)

In [24]:
data3.tail()

Unnamed: 0_level_0,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
Community Area Number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,CHICAGO,4.7,19.7,12.9,19.5,33.5,28202,0
48,Calumet Heights,2.1,11.5,20.0,11.0,44.0,28887,38
42,Woodlawn,2.9,30.7,23.4,16.5,36.1,18672,58
76,O'Hare,3.6,15.4,7.1,10.9,30.3,25828,24
69,Greater Grand Crossing,3.6,29.6,23.0,16.5,41.0,17285,66


In [25]:
data3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 78 entries, 1 to 69
Data columns (total 8 columns):
COMMUNITY AREA NAME                             78 non-null object
PERCENT OF HOUSING CROWDED                      78 non-null float64
PERCENT HOUSEHOLDS BELOW POVERTY                78 non-null float64
PERCENT AGED 16+ UNEMPLOYED                     78 non-null float64
PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA    78 non-null float64
PERCENT AGED UNDER 18 OR OVER 64                78 non-null float64
PER CAPITA INCOME                               78 non-null int64
HARDSHIP INDEX                                  78 non-null int64
dtypes: float64(5), int64(2), object(1)
memory usage: 5.2+ KB


## Gerar dados de treino e de teste

In [26]:
# dividir os dados de treino e teste
divisao = int(data.shape[0] * 0.85)
train = data[:divisao]
test = data[divisao:]
print(train.shape, test.shape)

(8499, 16) (1500, 16)


In [27]:
train.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9923063,HY133182,01/29/2015 04:53:14 PM,029XX E 87TH ST,1811,NARCOTICS,POSS: CANNABIS 30GMS OR LESS,STREET,YES,NO,423,4.0,10,46,18,41.7372,-87.5531
9927630,HY138122,01/29/2015 09:13:39 PM,044XX S PULASKI RD,1150,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,DEPARTMENT STORE,NO,NO,821,8.0,14,57,11,41.8124,-87.7236
9920228,HY130226,01/26/2015 10:30:42 AM,013XX W BELMONT AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,PARKING LOT/GARAGE(NON.RESID.),NO,NO,1933,19.0,32,6,14,41.9397,-87.6625
9932413,HY142247,02/06/2015 05:40:54 PM,020XX N LAWLER AVE,560,ASSAULT,SIMPLE,APARTMENT,NO,NO,2522,25.0,31,19,08A,41.9176,-87.752
9917884,HY128631,01/25/2015 08:30:38 PM,021XX N MAJOR AVE,2826,OTHER OFFENSE,HARASSMENT BY ELECTRONIC MEANS,RESIDENCE,NO,NO,2515,25.0,29,19,26,41.9185,-87.7681


In [28]:
test.head()

Unnamed: 0_level_0,Case_Number,Date_Time,Block,IUCR,Primary_Type,Description,Location_Description,Arrest,Domestic,Beat,District,Ward,Community_Area,FBI_Code,Latitude,Longitude
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9918529,HY129084,01/25/2015 07:15:45 PM,0000X W MONROE ST,870,THEFT,POCKET-PICKING,SIDEWALK,NO,NO,112,1.0,42,32,06,41.8808,-87.6284
9925380,HY134467,01/30/2015 08:30:10 AM,063XX W 64TH PL,620,BURGLARY,UNLAWFUL ENTRY,APARTMENT,NO,NO,812,8.0,13,64,05,41.7749,-87.78
9928362,HY138953,02/04/2015 01:49:53 AM,054XX S HYDE PARK BLVD,820,THEFT,$500 AND UNDER,VEHICLE NON-COMMERCIAL,NO,YES,234,2.0,5,41,06,41.7955,-87.5838
9922396,HY132485,01/29/2015 06:15:42 AM,003XX W ADAMS ST,560,ASSAULT,SIMPLE,CONVENIENCE STORE,NO,NO,122,1.0,2,32,08A,41.8795,-87.6358
9931676,HY141605,02/05/2015 10:30:40 PM,030XX W WABANSIA AVE,1320,CRIMINAL DAMAGE,TO VEHICLE,STREET,NO,NO,1421,14.0,26,23,14,41.9121,-87.7033


## Gerar dados de exemplo

In [29]:
example = test[['Arrest']]
example['Arrest'] = example.index.map(lambda x: 'NO' if (x % 3 == 0) else 'YES')
example.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,Arrest
ID,Unnamed: 1_level_1
9918529,YES
9925380,NO
9928362,NO
9922396,YES
9931676,YES


In [30]:
example.describe()

Unnamed: 0,Arrest
count,1500
unique,2
top,YES
freq,1030


## Gerar arquivos CSV

In [31]:
# definir parâmetros
file_prefix = 'chicago'
target_column = 'Arrest'

In [32]:
# gerar arquivos CSV
train.to_csv(file_prefix + '-train.csv')
test.drop([target_column], axis=1).to_csv(file_prefix + '-test.csv')
test[[target_column]].to_csv(file_prefix + '-solution.csv')
example.to_csv(file_prefix + '-example.csv')

In [33]:
# gerar arquivos adicionais (de apoio)
data2.to_csv(file_prefix + '-weather.csv')
data3.to_csv(file_prefix + '-census.csv')

In [34]:
!head chicago-*.csv

==> chicago-census.csv <==
Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME ,HARDSHIP INDEX
1,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39
68,Englewood,3.8,46.6,28.0,28.5,42.5,11888,94
64,Clearing,2.7,8.9,9.5,18.8,37.6,25113,29
51,South Deering,4.0,29.2,16.3,21.0,39.5,14685,65
13,North Park,3.9,13.2,9.9,14.4,39.0,26576,33
62,West Elsdon,11.1,15.6,16.7,37.0,37.7,15754,69
43,South Shore,2.8,31.1,20.0,14.0,35.7,19398,55
5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6
77,Edgewater,4.1,18.2,9.2,9.7,23.8,33385,19

==> chicago-example.csv <==
ID,Arrest
9918529,YES
9925380,NO
9928362,NO
9922396,YES
9931676,YES
9925698,NO
9916909,YES
9932852,YES
9932021,YES

==> chicago-solution.csv <==
ID,Arrest
9918529,NO
9925380,NO
9928362,NO
9922396,NO
9931676,NO
9925698,YES
9916909,NO
9932852,YES
9932021