# Crime prediction in St Louis City

In [0]:
import pandas as pd
import datetime
import numpy as np
from datetime import datetime

# **Preprocessing data**

In the available dataset, we are interested in the crime type, location (latitude and longitude), and date and time.

---



In [14]:
# Load the raw tab-separated crime records.
crime = pd.read_csv('stl-crime-data_2008-2015.tsv', delimiter='\t', encoding='utf-8')
# Keep only the columns used by the rest of the notebook.
crime = crime[["UCRCrime", "DateOccured", "Latitude", "Longitude"]]

# Parse the 'MM/DD/YYYY HH:MM' strings in a single vectorized call.
# The previous per-row `crime.iloc[i]['DateOccured']` + strptime loop built a
# full row Series for every record, which is extremely slow on a frame of
# several hundred thousand rows. An explicit format keeps parsing strict.
crime['DateOccured'] = pd.to_datetime(crime['DateOccured'], format='%m/%d/%Y %H:%M')
crime

Unnamed: 0,UCRCrime,DateOccured,Latitude,Longitude
0,Sex Offenses,2000-01-01 20:58:00,38.658845,-90.260116
1,Larceny-theft,2003-01-01 00:01:00,38.576172,-90.233989
2,Fraud,2004-01-01 00:01:00,38.671754,-90.233544
3,Fraud,2006-01-01 10:00:00,38.583504,-90.230145
4,Other assaults,2007-01-01 16:36:00,38.659580,-90.236606
...,...,...,...,...
477636,Larceny-theft,2013-12-01 08:00:00,38.580835,-90.267600
477637,Forcible Rape,2013-12-20 08:00:00,35.801506,-93.265663
477638,Other assaults,2014-12-22 16:00:00,38.599216,-90.271246
477639,Robbery,2014-12-26 21:20:00,35.801506,-93.265663


Replace the date-and-time column with eight separate datetime attribute columns: year, month, day, hour, day of year, week, day of week, and quarter.



In [15]:

# Expand the timestamp into eight integer calendar features using the
# vectorized .dt accessor, instead of a Python-level `crime.iloc[i]` lookup
# for every row and every feature.
occurred = crime['DateOccured']
crime = crime.assign(
    Year=occurred.dt.year,
    Month=occurred.dt.month,
    Day=occurred.dt.day,
    Hour=occurred.dt.hour,
    DayOfYear=occurred.dt.dayofyear,
    # ISO-8601 week number (same value Timestamp.week reports).
    # isocalendar() yields a nullable UInt32 column, so cast it to a plain
    # NumPy integer dtype; otherwise a later `.values` export of the feature
    # columns would fall back to an object array.
    Week=occurred.dt.isocalendar().week.astype('int64'),
    DayOfWeek=occurred.dt.dayofweek,  # Monday=0 ... Sunday=6
    Quarter=occurred.dt.quarter,
)

# The raw timestamp is no longer needed once the features are extracted.
crime = crime.drop(columns='DateOccured')

crime

Unnamed: 0,UCRCrime,Latitude,Longitude,Year,Month,Day,Hour,DayOfYear,Week,DayOfWeek,Quarter
0,Sex Offenses,38.658845,-90.260116,2000,1,1,20,1,52,5,1
1,Larceny-theft,38.576172,-90.233989,2003,1,1,0,1,1,2,1
2,Fraud,38.671754,-90.233544,2004,1,1,0,1,1,3,1
3,Fraud,38.583504,-90.230145,2006,1,1,10,1,52,6,1
4,Other assaults,38.659580,-90.236606,2007,1,1,16,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
477636,Larceny-theft,38.580835,-90.267600,2013,12,1,8,335,48,6,4
477637,Forcible Rape,35.801506,-93.265663,2013,12,20,8,354,51,4,4
477638,Other assaults,38.599216,-90.271246,2014,12,22,16,356,52,0,4
477639,Robbery,35.801506,-93.265663,2014,12,26,21,360,52,4,4


Define the crime categories that serve as the model's output labels.

In [16]:
# Distinct UCR crime types found in the data, in order of first appearance.
types = crime.loc[:, 'UCRCrime'].unique()
types

array(['Sex Offenses', 'Larceny-theft', 'Fraud', 'Other assaults',
       'Forcible Rape', 'Aggravated Assault', 'Other',
       'Motor vehicle theft', 'Forgery and Counterfeiting', 'Burglary',
       'Vandalism', 'Disorderly Conduct', 'Arson', 'Robbery',
       'Drug Abuse Violations', 'DUI', 'Embezzlement', 'Weapons',
       'Liquor Laws', 'Vagrancy', 'Criminal Homicide', 'Stolen Property',
       'Offenses Against the Family and Children',
       'Prostitution and Commercialized Vice', 'Gambling', 'Drunkenness'],
      dtype=object)

In [17]:
# Group the raw UCR crime types into eight broader categories.
# The position of each list in `categories` (1-based) determines the name of
# its one-hot column: cat_1 = violence, cat_2 = murder, ..., cat_8 = other.
violence = ['Forcible Rape', 'Aggravated Assault', 'Sex Offenses', 'Other assaults', 'Weapons',
            'Offenses Against the Family and Children']
murder = ['Criminal Homicide']
substance = ['Drug Abuse Violations', 'Liquor Laws', 'Drunkenness', 'DUI', 'Disorderly Conduct']
properties = ['Larceny-theft', 'Motor vehicle theft', 'Forgery and Counterfeiting',
              'Burglary', 'Arson', 'Robbery', 'Stolen Property', 'Embezzlement', 'Fraud']
vandalism = ['Vandalism']
vagrancy = ['Vagrancy', 'Prostitution and Commercialized Vice']
gambling = ['Gambling']
other = ['Other']
categories = [violence, murder, substance, properties, vandalism, vagrancy, gambling, other]

# Every crime type present in the data must belong to some category,
# otherwise those rows would end up with an all-zero label vector.
# Fail loudly here: the previous count-only comparison silently skipped the
# encoding on a mismatch, which only surfaced later as a KeyError on 'cat_1',
# and a matching count did not guarantee that the type names actually matched.
categorized = {crime_type for cat in categories for crime_type in cat}
uncategorized = set(crime['UCRCrime'].unique()) - categorized
if uncategorized:
    raise ValueError(
        'Crime types without a category: ' + repr(sorted(uncategorized, key=str))
    )

# One float indicator column per category: 1.0 if the row's crime type is in
# the category, 0.0 otherwise. Series.isin is vectorized, replacing the
# per-row, per-type `crime.iloc[j]` comparison loop.
for i, cat in enumerate(categories, start=1):
    crime['cat_' + str(i)] = crime['UCRCrime'].isin(cat).astype(float)
    print('Done with category ' + str(i))

Done with category 1
Done with category 2
Done with category 3
Done with category 4
Done with category 5
Done with category 6
Done with category 7
Done with category 8


In [0]:
# Remove the raw crime-type text column from the frame.
crime = crime.drop('UCRCrime', axis='columns')

In [21]:
# Display the current state of the frame (bare last expression -> rich output).
crime

Unnamed: 0,Latitude,Longitude,Year,Month,Day,Hour,DayOfYear,Week,DayOfWeek,Quarter,cat_1,cat_2,cat_3,cat_4,cat_5,cat_6,cat_7,cat_8
0,38.658845,-90.260116,2000,1,1,20,1,52,5,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,38.576172,-90.233989,2003,1,1,0,1,1,2,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,38.671754,-90.233544,2004,1,1,0,1,1,3,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,38.583504,-90.230145,2006,1,1,10,1,52,6,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,38.659580,-90.236606,2007,1,1,16,1,1,0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
477636,38.580835,-90.267600,2013,12,1,8,335,48,6,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
477637,35.801506,-93.265663,2013,12,20,8,354,51,4,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
477638,38.599216,-90.271246,2014,12,22,16,356,52,0,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
477639,35.801506,-93.265663,2014,12,26,21,360,52,4,4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [0]:
# Feature matrix: location plus the derived calendar columns, in this fixed order.
feature_columns = [
    'Latitude', 'Longitude',
    'Year', 'Month', 'Day', 'Hour',
    'DayOfYear', 'Week', 'DayOfWeek', 'Quarter',
]
attributes = crime.loc[:, feature_columns].values
# Write the array to disk in NumPy's .npy format.
np.save('clean_attributes.npy', attributes)

In [0]:
# Label matrix: the eight category indicator columns, one column per category.
label_columns = [
    'cat_1', 'cat_2', 'cat_3', 'cat_4',
    'cat_5', 'cat_6', 'cat_7', 'cat_8',
]
labels = crime.loc[:, label_columns].values
# Write the array to disk in NumPy's .npy format.
np.save('clean_labels.npy', labels)