## 2012.1.1 - 2017.9.2 Baltimore Victim-Based Crime exploratory data analysis

In [1]:
# Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [2]:
# read in kaggle 2012-2017 dataset
df = pd.read_csv("Resources/2012-2017_BPD_Victim_Based_Crime_Data.csv")
df.head()

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,Total Incidents
0,09/02/2017,23:30:00,3JK,4200 AUDREY AVE,ROBBERY - RESIDENCE,I,KNIFE,913.0,SOUTHERN,Brooklyn,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",ROW/TOWNHO,1
1,09/02/2017,23:00:00,7A,800 NEWINGTON AVE,AUTO THEFT,O,,133.0,CENTRAL,Reservoir Hill,-76.63217,39.3136,"(39.3136000000, -76.6321700000)",STREET,1
2,09/02/2017,22:53:00,9S,600 RADNOR AV,SHOOTING,Outside,FIREARM,524.0,NORTHERN,Winston-Govans,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",Street,1
3,09/02/2017,22:50:00,4C,1800 RAMSAY ST,AGG. ASSAULT,I,OTHER,934.0,SOUTHERN,Carrollton Ridge,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",ROW/TOWNHO,1
4,09/02/2017,22:31:00,4E,100 LIGHT ST,COMMON ASSAULT,O,HANDS,113.0,CENTRAL,Downtown West,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",STREET,1


In [3]:
df.describe()
# look at age in 2021, which means something
# here, it's useless
# what does post mean?

Unnamed: 0,Post,Longitude,Latitude,Total Incidents
count,276305.0,274325.0,274325.0,276529.0
mean,506.256821,-76.617444,39.307399,1.0
std,260.73537,0.042306,0.029556,0.0
min,2.0,-76.71162,39.20041,1.0
25%,243.0,-76.64846,39.28833,1.0
50%,511.0,-76.61396,39.30364,1.0
75%,731.0,-76.58744,39.32781,1.0
max,945.0,-76.5285,39.37293,1.0


## Weapon types

In [4]:
# weapon types
print(
    f"""The number of weapon types is {df.Weapon.nunique()}

{df.Weapon.unique()}

{df.Weapon.value_counts()}
"""
)

The number of weapon types is 4

['KNIFE' nan 'FIREARM' 'OTHER' 'HANDS']

HANDS      48995
FIREARM    22312
OTHER      14620
KNIFE       9650
Name: Weapon, dtype: int64



## Inside / Outside

Creates dataframe that merges "Outside" and "Inside" with "O" and "I", then drops rows with NA values in df["Inside/Outside"]

In [5]:
# indoor / outdoor
in_out_df = df.copy()
in_out_df["Inside/Outside"] = df["Inside/Outside"].replace(
    {"Outside": "O", "Inside": "I"}
)

in_out_df.dropna(subset=["Inside/Outside"], inplace=True)

print(
    f"""The original data's value counts:
-------
{df['Inside/Outside'].value_counts()}

in_out_df:
-------
{in_out_df['Inside/Outside'].value_counts()}
"""
)

The original data's value counts:
-------
I          131999
O          129782
Outside      3837
Inside        632
Name: Inside/Outside, dtype: int64

in_out_df:
-------
O    133619
I    132631
Name: Inside/Outside, dtype: int64



## Time Series

In [6]:
time_df = df.copy()

# convert day to datetime data type
time_df.CrimeDate = pd.to_datetime(df.CrimeDate)

# supposedly converted time to time object data type
time_df.loc[time_df.CrimeTime == "24:00:00", "CrimeTime"] = "00:00:00"
time_df["CrimeTimeTemp"] = pd.to_datetime(time_df.CrimeTime, format="%H:%M:%S")

# create bins and labels for morning, afternoon, evening, late night
bins = [-1, 6, 12, 18, 24]
labels = ["Late Night", "Morning", "Afternoon", "Evening"]

# pd.cut likes ints, so dt.hour extracts the hour for comparison with bins
# this took forever
# why is my computer so mean to me?
time_df["Time of Day"] = pd.cut(time_df.CrimeTimeTemp.dt.hour, bins=bins, labels=labels)
time_df.drop(columns=["CrimeTimeTemp"], inplace=True)

time_df

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,District,Neighborhood,Longitude,Latitude,Location 1,Premise,Total Incidents,Time of Day
0,2017-09-02,23:30:00,3JK,4200 AUDREY AVE,ROBBERY - RESIDENCE,I,KNIFE,913.0,SOUTHERN,Brooklyn,-76.60541,39.22951,"(39.2295100000, -76.6054100000)",ROW/TOWNHO,1,Evening
1,2017-09-02,23:00:00,7A,800 NEWINGTON AVE,AUTO THEFT,O,,133.0,CENTRAL,Reservoir Hill,-76.63217,39.31360,"(39.3136000000, -76.6321700000)",STREET,1,Evening
2,2017-09-02,22:53:00,9S,600 RADNOR AV,SHOOTING,Outside,FIREARM,524.0,NORTHERN,Winston-Govans,-76.60697,39.34768,"(39.3476800000, -76.6069700000)",Street,1,Evening
3,2017-09-02,22:50:00,4C,1800 RAMSAY ST,AGG. ASSAULT,I,OTHER,934.0,SOUTHERN,Carrollton Ridge,-76.64526,39.28315,"(39.2831500000, -76.6452600000)",ROW/TOWNHO,1,Evening
4,2017-09-02,22:31:00,4E,100 LIGHT ST,COMMON ASSAULT,O,HANDS,113.0,CENTRAL,Downtown West,-76.61365,39.28756,"(39.2875600000, -76.6136500000)",STREET,1,Evening
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276524,2012-01-01,00:00:00,6J,1400 JOH AVE,LARCENY,I,,832.0,SOUTHWESTERN,Violetville,-76.67195,39.26132,"(39.2613200000, -76.6719500000)",OTHER - IN,1,Late Night
276525,2012-01-01,00:00:00,6J,5500 SINCLAIR LN,LARCENY,O,,444.0,NORTHEASTERN,Frankford,-76.53829,39.32493,"(39.3249300000, -76.5382900000)",OTHER - OU,1,Late Night
276526,2012-01-01,00:00:00,6E,400 N PATTERSON PK AV,LARCENY,O,,321.0,EASTERN,CARE,-76.58497,39.29573,"(39.2957300000, -76.5849700000)",STREET,1,Late Night
276527,2012-01-01,00:00:00,5A,5800 LILLYAN AV,BURGLARY,I,,425.0,NORTHEASTERN,Glenham-Belhar,-76.54578,39.34701,"(39.3470100000, -76.5457800000)",APT. LOCKE,1,Late Night


In [7]:
time_df["Time of Day"].value_counts()

Afternoon     89215
Evening       74286
Morning       62452
Late Night    50576
Name: Time of Day, dtype: int64

## Crime Description

In [8]:
desc_df = df.copy()

print(
    f"""The original value counts:
-----
{desc_df.Description.value_counts()}      
      
"""
)

# Do we wanna clean that?

The original value counts:
-----
LARCENY                 60528
COMMON ASSAULT          45518
BURGLARY                42538
LARCENY FROM AUTO       36295
AGG. ASSAULT            27513
AUTO THEFT              26838
ROBBERY - STREET        17691
ROBBERY - COMMERCIAL     4141
ASSAULT BY THREAT        3503
SHOOTING                 2910
ROBBERY - RESIDENCE      2866
RAPE                     1637
HOMICIDE                 1559
ROBBERY - CARJACKING     1528
ARSON                    1464
Name: Description, dtype: int64      
      



## Premises Type

In [9]:
building_df = df.copy()

print(
    f"""The original value counts:
-----
{building_df.Premise.value_counts()}      
      
"""
)

# gotta drop some of these
# save to csv to see the entire 123 values before dropping
tmp = building_df.Premise.value_counts()
tmp.to_csv("premises_types.csv")

The original value counts:
-----
STREET        100919
ROW/TOWNHO     60502
PARKING LO     12043
APT/CONDO      12002
OTHER - IN     11459
               ...  
Private Sc         2
Day Care F         1
Church             1
MANUFACTUR         1
Gas Statio         1
Name: Premise, Length: 123, dtype: int64      
      



## Total Incidents

In [10]:
incidents_df = df.copy()

print(
    f"""The original value counts:
-----
{incidents_df["Total Incidents"].value_counts()}      
      
"""
)

# check to see if there are multiple incidents at the same address and time
# which clearly happens in the 2012-2021 df
# or... we just ignore it

The original value counts:
-----
1    276529
Name: Total Incidents, dtype: int64      
      



## Location

In [11]:
loc_df = df.copy()

print(
    f"""The original value counts per district:
-----
{loc_df.District.value_counts()}      
      
The original value counts per neighborhood:
-----
{loc_df.Neighborhood.value_counts()}
"""
)

The original value counts per district:
-----
NORTHEASTERN    43006
SOUTHEASTERN    38291
SOUTHERN        31850
NORTHERN        31665
CENTRAL         31631
NORTHWESTERN    28061
SOUTHWESTERN    26092
EASTERN         23635
WESTERN         22218
Name: District, dtype: int64      
      
The original value counts per neighborhood:
-----
Downtown                   9048
Frankford                  6642
Belair-Edison              5977
Brooklyn                   4516
Cherry Hill                4086
                           ... 
Villages Of Homeland         37
Greenmount Cemetery          23
Mt Pleasant Park             13
Blythewood                    4
Dundalk Marine Terminal       1
Name: Neighborhood, Length: 278, dtype: int64

