## 2012.1.1 - 2021.4.30 Baltimore Victim-Based Crime exploratory data analysis

In [1]:
# Dependencies
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [2]:
# read in cleaned open baltimore 2012-2021 dataset
df = pd.read_csv("Resources/2012-2021_BPD_Victim_Based_Crime_Data_clean.csv", low_memory=False)
df.head()

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,Gender,Age,Race,Ethnicity,District,Neighborhood,Latitude,Longitude,GeoLocation,Premise,Total_Incidents,CrimeDateTime
0,2021-04-30,23:50:00+00,6D,200 SCOTT ST,LARCENY FROM AUTO,,,932.0,M,22.0,UNKNOWN,,SOUTHERN,WASHINGTON VILLAGE/PIGTOWN,39.285056,-76.629022,"(39.285056,-76.629022)",,1,2021/04/30 23:50:00+00
1,2021-04-30,23:50:00+00,6G,1700 THAMES ST,LARCENY,I,,213.0,F,29.0,WHITE,,SOUTHEAST,FELLS POINT,39.281896,-76.592512,"(39.281896,-76.592512)",BAR,1,2021/04/30 23:50:00+00
2,2021-04-30,23:38:00+00,6E,4100 EMMART AVE,LARCENY,O,,631.0,F,52.0,BLACK_OR_AFRICAN_AMERICAN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021/04/30 23:38:00+00
3,2021-04-30,23:38:00+00,6E,4100 EMMART AVE,LARCENY,O,,631.0,M,26.0,UNKNOWN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021/04/30 23:38:00+00
4,2021-04-30,23:38:00+00,6E,4100 EMMART AVE,LARCENY,O,,631.0,,,UNKNOWN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021/04/30 23:38:00+00


In [3]:
df.describe()
# look at age - i'm surprised it's that high
# wonder how that breaks down by type of crime committed

# what does post mean?

Unnamed: 0,Post,Age,Latitude,Longitude,Total_Incidents
count,473711.0,381852.0,480542.0,480542.0,481408.0
mean,503.587244,37.926534,38.773137,-75.576983,1.0
std,261.031554,38.989416,4.549539,8.867924,0.0
min,111.0,-7979.0,0.0,-76.71128,1.0
25%,234.0,26.0,39.287699,-76.648362,1.0
50%,511.0,35.0,39.302473,-76.613526,1.0
75%,724.0,49.0,39.326625,-76.585595,1.0
max,943.0,8251.0,39.372088,0.0,1.0


# Create a number of pandas data frames

they clean up data from a particular column, and drop null or marginal values, for ease of comparing one value with many others


## Weapon types

In [4]:
# weapon types
# cleaning: drop the single digit items?
print(
    f"""The number of weapon types is {df.Weapon.nunique()}

{df.Weapon.unique()}

{df.Weapon.value_counts()}
"""
)

The number of weapon types is 10

[nan 'FIREARM' 'OTHER' 'FIRE' 'KNIFE' 'HANDS' 'PERSONAL_WEAPONS'
 'KNIFE_CUTTING_INSTRUMENT' 'BLUNT_OBJECT' 'HANDGUN' 'UNKNOWN']

FIREARM                     42435
OTHER                       28118
KNIFE                       16995
HANDS                        6865
FIRE                         2233
PERSONAL_WEAPONS               35
BLUNT_OBJECT                    2
KNIFE_CUTTING_INSTRUMENT        1
HANDGUN                         1
UNKNOWN                         1
Name: Weapon, dtype: int64



## Gender

In [5]:
# gender
gender_df = df.copy()

# move 'male' and 'female' into 'm' and 'f'
gender_df.Gender = df.Gender.replace({"Female": "F", "Male": "M"})

# drop all other genders
gender_df = gender_df[gender_df.Gender.isin({"M", "F", "U"})]

print(
f"""The original data's value counts:
-------
{df.Gender.value_counts()}

gender_df's value counts:
-------
{gender_df.Gender.value_counts()}
"""
)

The original data's value counts:
-------
F             199493
M             181765
U              11528
Male            6722
Female           815
B                130
W                 20
N                 11
,                 10
Transgende         7
FB                 6
FW                 2
160                2
O                  2
120                1
165                1
60                 1
8                  1
MB                 1
77                 1
17                 1
042819             1
A                  1
S                  1
FM                 1
T                  1
M\                 1
FF                 1
FU                 1
50                 1
Name: Gender, dtype: int64

gender_df's value counts:
-------
F    200308
M    188487
U     11528
Name: Gender, dtype: int64



## Inside / Outside

Creates dataframe that merges "Outside" and "Inside" with "O" and "I", then drops rows with NA values in df["Inside/Outside"]

In [6]:
# indoor / outdoor
in_out_df = df.copy()

#
in_out_df["Inside/Outside"] = df["Inside/Outside"].replace(
    {"Outside": "O", "Inside": "I"}
)

in_out_df.dropna(subset=["Inside/Outside"], inplace=True)

print(
f"""Original Indoor/Outdoor value counts:
-------
{df['Inside/Outside'].value_counts()}

New Indoor/Outdoor value counts:
-------
{in_out_df['Inside/Outside'].value_counts()}
"""
)

Original Indoor/Outdoor value counts:
-------
I          215730
O          211354
Outside      6689
Inside        955
Name: Inside/Outside, dtype: int64

New Indoor/Outdoor value counts:
-------
O    218043
I    216685
Name: Inside/Outside, dtype: int64



## Time Series

In [12]:
time_df = df.copy()

# convert day to datetime64 data type
time_df.CrimeDate = pd.to_datetime(df.CrimeDate)

# !!!need to strip +00 from CrimeTime!!!
# supposedly converts CrimeTime to time object data type
# time_df.loc[time_df.CrimeTime == "24:00:00", "CrimeTime"] = "00:00:00"
# time_df.CrimeTime = pd.to_datetime(time_df.CrimeTime, format='%H:%M:%S').dt.time

# backup plan:
# converts CrimeDateTime object datetime64 data type
time_df.CrimeDateTime = pd.to_datetime(df.CrimeDateTime)
time_df.CrimeTime = time_df.CrimeDateTime.dt.time

# !!!!! Need to create bins and series for morning, afternoon, evening, late night

time_df = time_df[time_df.CrimeDate >= "2012-01-01"]
time_df

Unnamed: 0,CrimeDate,CrimeTime,CrimeCode,Location,Description,Inside/Outside,Weapon,Post,Gender,Age,Race,Ethnicity,District,Neighborhood,Latitude,Longitude,GeoLocation,Premise,Total_Incidents,CrimeDateTime
0,2021-04-30,23:50:00,6D,200 SCOTT ST,LARCENY FROM AUTO,,,932.0,M,22.0,UNKNOWN,,SOUTHERN,WASHINGTON VILLAGE/PIGTOWN,39.285056,-76.629022,"(39.285056,-76.629022)",,1,2021-04-30 23:50:00+00:00
1,2021-04-30,23:50:00,6G,1700 THAMES ST,LARCENY,I,,213.0,F,29.0,WHITE,,SOUTHEAST,FELLS POINT,39.281896,-76.592512,"(39.281896,-76.592512)",BAR,1,2021-04-30 23:50:00+00:00
2,2021-04-30,23:38:00,6E,4100 EMMART AVE,LARCENY,O,,631.0,F,52.0,BLACK_OR_AFRICAN_AMERICAN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021-04-30 23:38:00+00:00
3,2021-04-30,23:38:00,6E,4100 EMMART AVE,LARCENY,O,,631.0,M,26.0,UNKNOWN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021-04-30 23:38:00+00:00
4,2021-04-30,23:38:00,6E,4100 EMMART AVE,LARCENY,O,,631.0,,,UNKNOWN,,NORTHWEST,REISTERSTOWN STATION,39.349471,-76.693679,"(39.349471,-76.693679)",PARKING LOT-OUTSIDE,1,2021-04-30 23:38:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
432277,2012-01-01,00:00:00,6J,1400 JOH AVE,LARCENY,I,,832.0,,,UNKNOWN,,SOUTHWEST,VIOLETVILLE,39.261314,-76.672007,"(39.261314007121257,-76.672006671865333)",OTHER - INSIDE,1,2012-01-01 00:00:00+00:00
432278,2012-01-01,00:00:00,7C,1600 CLIFTVIEW AVE,AUTO THEFT,I,,312.0,F,44.0,BLACK_OR_AFRICAN_AMERICAN,,EASTERN,DARLEY PARK,39.315297,-76.594681,"(39.315297,-76.594681)",ROW/TOWNHOUSE-OCC,1,2012-01-01 00:00:00+00:00
432279,2012-01-01,00:00:00,5C,2500 ALBION AVE,BURGLARY,I,,421.0,M,71.0,WHITE,,NORTHEAST,LAURAVILLE,39.348835,-76.572889,"(39.348835002786856,-76.572888624594796)",ROW/TOWNHOUSE-OCC,1,2012-01-01 00:00:00+00:00
432280,2012-01-01,00:00:00,6J,5500 SINCLAIR LN,LARCENY,O,,444.0,,,UNKNOWN,,NORTHEAST,FRANKFORD,39.324944,-76.538325,"(39.324944,-76.538325)",OTHER - OUTSIDE,1,2012-01-01 00:00:00+00:00


In [13]:
time_df.dtypes

CrimeDate               datetime64[ns]
CrimeTime                       object
CrimeCode                       object
Location                        object
Description                     object
Inside/Outside                  object
Weapon                          object
Post                           float64
Gender                          object
Age                            float64
Race                            object
Ethnicity                       object
District                        object
Neighborhood                    object
Latitude                       float64
Longitude                      float64
GeoLocation                     object
Premise                         object
Total_Incidents                  int64
CrimeDateTime      datetime64[ns, UTC]
dtype: object

## Crime Description

In [17]:
desc_df = df.copy()

print(f"""The original value counts:
-----
{desc_df.Description.value_counts()}      
      
""")

# Do we wanna clean that?

The original value counts:
-----
LARCENY                 107578
COMMON ASSAULT           84629
BURGLARY                 70857
LARCENY FROM AUTO        64354
AGG. ASSAULT             52841
AUTO THEFT               40508
ROBBERY                  38940
ROBBERY - COMMERCIAL      4974
SHOOTING                  4788
ROBBERY - CARJACKING      3625
RAPE                      3325
HOMICIDE                  2756
ARSON                     2233
Name: Description, dtype: int64      
      



## Premises Type

In [16]:
building_df = df.copy()

print(f"""The original value counts:
-----
{building_df.Premise.value_counts()}      
      
""")

# gotta drop some of these
# save to csv to see the entire 123 values before dropping

The original value counts:
-----
STREET                  160897
ROW/TOWNHOUSE-OCC        97144
APT/CONDO - OCCUPIED     20921
OTHER - INSIDE           17495
PARKING LOT-OUTSIDE      17437
                         ...  
INNER HARBOR-S-KEY           1
DOCTORS OFFICE               1
SCHOOL PLAYGROUND            1
RENTAL/VIDEO STORE           1
FIRE DEPARTMENT              1
Name: Premise, Length: 161, dtype: int64      
      



## Total Incidents

In [20]:
incidents_df = df.copy()

print(f"""The original value counts:
-----
{incidents_df["Total_Incidents"].value_counts()}      
      
""")

# This means nothing. See rows 2, 3, and 4 of the df.head() at the top...
# ...three separate incidents at the same time and address

The original value counts:
-----
1    481408
Name: Total_Incidents, dtype: int64      
      



## Location

In [21]:
loc_df = df.copy()

print(f"""The original value counts per district:
-----
{loc_df.District.value_counts()}      
      
The original value counts per neighborhood:
-----
{loc_df.Neighborhood.value_counts()}
""")

The original value counts per district:
-----
NORTHEAST    72582
SOUTHEAST    65811
CENTRAL      55663
SOUTHERN     53648
NORTHERN     51615
NORTHWEST    47736
SOUTHWEST    47091
EASTERN      41464
WESTERN      38101
Name: District, dtype: int64      
      
The original value counts per neighborhood:
-----
DOWNTOWN                   16533
FRANKFORD                  11274
BELAIR-EDISON              10288
BROOKLYN                    8429
CANTON                      6958
                           ...  
VILLAGES OF HOMELAND          49
GREENMOUNT CEMETERY           34
MT PLEASANT PARK              31
BLYTHEWOOD                    12
DUNDALK MARINE TERMINAL        1
Name: Neighborhood, Length: 278, dtype: int64

