# Start Python and Check Versions

In [1]:
# Check the versions of libraries

# Python version
import sys
print('Python: {}'.format(sys.version))
# numpy
import numpy 
print('numpy: {}'.format(numpy.__version__))
# pandas
import pandas
print('pandas: {}'.format(pandas.__version__))

Python: 3.7.4 (default, Aug  9 2019, 18:34:13) [MSC v.1915 64 bit (AMD64)]
numpy: 1.16.5
pandas: 0.25.1


# Load The Data

## Import libraries

In [2]:
# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
import os

##  Load RAW Dataset

In [3]:
# Load dataset   
file = os.path.join('..', 'Resources','Crimes_2001_to_present.csv')
names = ['Date', 'Primary Type', 'Latitude', 'Longitude']
dataset = pd.read_csv(file)    
dataset = dataset.loc[:, names]
dataset = dataset[dataset['Primary Type'] != 'NON - CRIMINAL' ]
dataset = dataset[dataset['Primary Type'] != 'NON-CRIMINAL' ] 
dataset = dataset[dataset['Primary Type'] != 'NON-CRIMINAL (SUBJECT SPECIFIED)' ]
dataset = dataset.dropna(axis=0, how='any')
dataset['Date'] = pd.to_datetime(dataset['Date']).dt.date
dataset['Date'] = pd.to_datetime(dataset.Date,format='%Y-%m-%d %H:%M')

In [4]:
dataset = dataset.dropna(how='any')
print(dataset.shape)
dataset.head()

(6811610, 4)


Unnamed: 0,Date,Primary Type,Latitude,Longitude
71,2019-06-19,HOMICIDE,41.763714,-87.691782
349,2019-06-19,HOMICIDE,41.711437,-87.639309
1348,2019-06-18,HOMICIDE,41.776019,-87.61552
2064,2019-06-17,HOMICIDE,41.935855,-87.644258
2138,2019-06-16,HOMICIDE,41.75248,-87.569903


## Save dataframe to file

In [5]:
file = os.path.join('..', 'Resources', 'Crimes_2001_2019.csv')
dataset.to_csv(file, encoding='utf-8', index=False)

 # Load Down Sized Dataset

In [6]:
file = os.path.join('..', 'Resources','Crimes_2001_2019.csv')
dataset = pd.read_csv(file)

## Peek at the Data

In [7]:
# head
dataset.head(5)

Unnamed: 0,Date,Primary Type,Latitude,Longitude
0,2019-06-19,HOMICIDE,41.763714,-87.691782
1,2019-06-19,HOMICIDE,41.711437,-87.639309
2,2019-06-18,HOMICIDE,41.776019,-87.61552
3,2019-06-17,HOMICIDE,41.935855,-87.644258
4,2019-06-16,HOMICIDE,41.75248,-87.569903


In [8]:
# shape and describe
print(dataset.shape)

(6811610, 4)


## Statistical Summary

In [9]:
# descriptions
print(dataset.describe())

           Latitude     Longitude
count  6.811610e+06  6.811610e+06
mean   4.184203e+01 -8.767178e+01
std    8.994493e-02  6.208642e-02
min    3.661945e+01 -9.168657e+01
25%    4.176891e+01 -8.771385e+01
50%    4.185551e+01 -8.766615e+01
75%    4.190682e+01 -8.762835e+01
max    4.202291e+01 -8.752453e+01


## Distribution

In [10]:
# distribution
print(dataset.groupby('Primary Type').size())

Primary Type
ARSON                                  11258
ASSAULT                               426624
BATTERY                              1250565
BURGLARY                              391369
CONCEALED CARRY LICENSE VIOLATION        372
CRIM SEXUAL ASSAULT                    26526
CRIMINAL DAMAGE                       780562
CRIMINAL TRESPASS                     195821
DECEPTIVE PRACTICE                    262080
DOMESTIC VIOLENCE                          1
GAMBLING                               14352
HOMICIDE                                9740
HUMAN TRAFFICKING                         52
INTERFERENCE WITH PUBLIC OFFICER       15745
INTIMIDATION                            3995
KIDNAPPING                              6706
LIQUOR LAW VIOLATION                   14013
MOTOR VEHICLE THEFT                   315862
NARCOTICS                             708649
OBSCENITY                                585
OFFENSE INVOLVING CHILDREN             44259
OTHER NARCOTIC VIOLATION                 1

# DATA EXTRACTION

In [11]:
options = ['BATTERY', 'CRIMINAL DAMAGE', 'NARCOTICS', 'THEFT', 'OTHER OFFENSE', 'BURGLARY']   
# selecting rows based on condition 
df = dataset[dataset['Primary Type'].isin(options)] 

for c in options:
    df.insert(2 + options.index(c), c, 0.0)
    
i = 0  
for c in df['Primary Type']:
    df.iat[i, df.columns.get_loc(c)] = 1.0
    i += 1

df.head()    

Unnamed: 0,Date,Primary Type,BATTERY,CRIMINAL DAMAGE,NARCOTICS,THEFT,OTHER OFFENSE,BURGLARY,Latitude,Longitude
56,2019-05-21,OTHER OFFENSE,0.0,0.0,0.0,0.0,1.0,0.0,41.69511,-87.671597
58,2019-05-21,THEFT,0.0,0.0,0.0,1.0,0.0,0.0,41.88973,-87.73426
59,2019-05-21,NARCOTICS,0.0,0.0,1.0,0.0,0.0,0.0,41.750941,-87.625185
60,2019-05-21,BATTERY,1.0,0.0,0.0,0.0,0.0,0.0,41.668009,-87.625926
62,2019-05-21,CRIMINAL DAMAGE,0.0,1.0,0.0,0.0,0.0,0.0,41.750466,-87.622981


In [12]:
if 'Primary Type' in df:
    del df['Primary Type']
df.columns = [ 'Date', 'battery', 'damage', 'narcotics', 'theft', 'other', 'burglary', 'Latitude', 'Longitude']
df.head() 

Unnamed: 0,Date,battery,damage,narcotics,theft,other,burglary,Latitude,Longitude
56,2019-05-21,0.0,0.0,0.0,0.0,1.0,0.0,41.69511,-87.671597
58,2019-05-21,0.0,0.0,0.0,1.0,0.0,0.0,41.88973,-87.73426
59,2019-05-21,0.0,0.0,1.0,0.0,0.0,0.0,41.750941,-87.625185
60,2019-05-21,1.0,0.0,0.0,0.0,0.0,0.0,41.668009,-87.625926
62,2019-05-21,0.0,1.0,0.0,0.0,0.0,0.0,41.750466,-87.622981


In [13]:
df.battery = df['battery'].astype(numpy.int64)
df.damage = df['damage'].astype(numpy.int64)
df.narcotics = df['narcotics'].astype(numpy.int64)
df.theft = df['theft'].astype(numpy.int64)
df.other = df['other'].astype(numpy.int64)
print(df.dtypes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Date          object
battery        int64
damage         int64
narcotics      int64
theft          int64
other          int64
burglary     float64
Latitude     float64
Longitude    float64
dtype: object


In [14]:
# DATE TIME STAMP FUNCTION
column_1 = pd.to_datetime(pd.Series(df.iloc[:,0]))
#print(type(column_1))
db=pd.DataFrame({"year": column_1.dt.year,
              "month": column_1.dt.month,
              "day": column_1.dt.day,
              "hour": column_1.dt.hour,
              "dayofyear": column_1.dt.dayofyear,
              "week": column_1.dt.week,
              "weekofyear": column_1.dt.weekofyear,
              "dayofweek": column_1.dt.dayofweek,
              "weekday": column_1.dt.weekday,
              "quarter": column_1.dt.quarter,
             })

In [15]:
if 'Date' in df:
    del df['Date']
df=pd.concat([db,df],axis=1)
df = df.loc[:,~df.columns.duplicated()] 
df = df.loc[df['year']==2018, :]
df.head(10000)

Unnamed: 0,year,month,day,hour,dayofyear,week,weekofyear,dayofweek,weekday,quarter,battery,damage,narcotics,theft,other,burglary,Latitude,Longitude
90647,2018,12,31,0,365,1,1,0,0,4,0,1,0,0,0,0.0,41.689079,-87.696064
90649,2018,12,31,0,365,1,1,0,0,4,0,1,0,0,0,0.0,41.740521,-87.647391
90650,2018,12,31,0,365,1,1,0,0,4,1,0,0,0,0,0.0,41.857068,-87.657625
90651,2018,12,31,0,365,1,1,0,0,4,1,0,0,0,0,0.0,41.751914,-87.647717
90652,2018,12,31,0,365,1,1,0,0,4,1,0,0,0,0,0.0,41.875684,-87.760479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104989,2018,12,10,0,344,50,50,0,0,4,0,0,0,0,0,1.0,41.883334,-87.760227
104990,2018,12,10,0,344,50,50,0,0,4,0,0,0,1,0,0.0,41.939651,-87.644386
104992,2018,12,10,0,344,50,50,0,0,4,0,0,0,1,0,0.0,41.899410,-87.624131
104993,2018,12,10,0,344,50,50,0,0,4,0,0,0,1,0,0.0,41.945234,-87.664488


In [16]:
from random import sample
rindex =  numpy.array(sample(range(len(df)), 2500))
df = df.iloc[rindex, :]
df.head()

Unnamed: 0,year,month,day,hour,dayofyear,week,weekofyear,dayofweek,weekday,quarter,battery,damage,narcotics,theft,other,burglary,Latitude,Longitude
163149,2018,9,18,0,261,38,38,1,1,3,0,1,0,0,0,0.0,41.878891,-87.723899
130269,2018,11,2,0,306,44,44,4,4,4,0,0,1,0,0,0.0,41.789842,-87.643822
242452,2018,6,10,0,161,23,23,6,6,2,1,0,0,0,0,0.0,41.939792,-87.662159
130557,2018,11,2,0,306,44,44,4,4,4,0,0,0,1,0,0.0,41.810838,-87.712491
309394,2018,3,11,0,70,10,10,6,6,1,1,0,0,0,0,0.0,41.75109,-87.554095


In [17]:
print(df.dtypes)
df.year = df['year'].astype(float)
df.month = df['month'].astype(float)
df.day = df['day'].astype(float)
df.day = df['hour'].astype(float)
df.dayofyear = df['dayofyear'].astype(float)
df.dayofyear = df['week'].astype(float)
df.weekofyear = df['weekofyear'].astype(float)
df.dayofweek = df['dayofweek'].astype(float)
df.dayofweek = df['weekday'].astype(float)
df.quarter = df['quarter'].astype(float)
print(df.dtypes)

year            int64
month           int64
day             int64
hour            int64
dayofyear       int64
week            int64
weekofyear      int64
dayofweek       int64
weekday         int64
quarter         int64
battery         int64
damage          int64
narcotics       int64
theft           int64
other           int64
burglary      float64
Latitude      float64
Longitude     float64
dtype: object
year          float64
month         float64
day           float64
hour            int64
dayofyear     float64
week            int64
weekofyear    float64
dayofweek     float64
weekday         int64
quarter       float64
battery         int64
damage          int64
narcotics       int64
theft           int64
other           int64
burglary      float64
Latitude      float64
Longitude     float64
dtype: object


In [18]:
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,year,month,day,hour,dayofyear,week,weekofyear,dayofweek,weekday,quarter,battery,damage,narcotics,theft,other,burglary,Latitude,Longitude
0,2018.0,9.0,0.0,0,38.0,38,38.0,1.0,1,3.0,0,1,0,0,0,0.0,41.878891,-87.723899
1,2018.0,11.0,0.0,0,44.0,44,44.0,4.0,4,4.0,0,0,1,0,0,0.0,41.789842,-87.643822
2,2018.0,6.0,0.0,0,23.0,23,23.0,6.0,6,2.0,1,0,0,0,0,0.0,41.939792,-87.662159
3,2018.0,11.0,0.0,0,44.0,44,44.0,4.0,4,4.0,0,0,0,1,0,0.0,41.810838,-87.712491
4,2018.0,3.0,0.0,0,10.0,10,10.0,6.0,6,1.0,1,0,0,0,0,0.0,41.75109,-87.554095


# Save selected dataframe to file

In [19]:
file = os.path.join('..', 'Resources','Crimes_selected.csv')
df.to_csv(file, encoding='utf-8', index=False)