In [1]:
import pandas as pd
import numpy as np

In [80]:
# Import, clean, and filter data as necessary 
df = pd.read_csv('data/chicago_crime_2019_2023.csv')
df.dropna(subset=['community_area'], inplace=True)
df['community_area'] = df['community_area'].astype(int)

In [65]:
df['primary_type'].unique()

array(['CRIMINAL SEXUAL ASSAULT', 'ROBBERY', 'PUBLIC PEACE VIOLATION',
       'WEAPONS VIOLATION', 'PROSTITUTION', 'SEX OFFENSE',
       'OFFENSE INVOLVING CHILDREN', 'NARCOTICS',
       'INTERFERENCE WITH PUBLIC OFFICER', 'OTHER OFFENSE', 'HOMICIDE',
       'BATTERY', 'ASSAULT', 'MOTOR VEHICLE THEFT', 'DECEPTIVE PRACTICE',
       'CRIMINAL TRESPASS', 'THEFT', 'ARSON', 'INTIMIDATION', 'STALKING',
       'CRIMINAL DAMAGE', 'KIDNAPPING', 'BURGLARY',
       'LIQUOR LAW VIOLATION', 'CONCEALED CARRY LICENSE VIOLATION',
       'CRIM SEXUAL ASSAULT', 'OTHER NARCOTIC VIOLATION', 'OBSCENITY',
       'HUMAN TRAFFICKING', 'GAMBLING', 'NON-CRIMINAL',
       'PUBLIC INDECENCY', 'RITUALISM'], dtype=object)

In [81]:
# convert from utc to datetime and remove time zone effect
df['date'] = pd.to_datetime(df['date'], utc=True)
df['date'] = df['date'].dt.tz_localize(None)
df.head()

Unnamed: 0,unique_key,date,block,primary_type,description,ward,community_area
0,12465794,2021-08-27 10:00:00,104XX S MICHIGAN AVE,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER,9.0,49
1,12788122,2022-08-08 11:42:00,007XX E 103RD ST,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,9.0,50
2,12502168,2021-10-04 08:10:00,0000X W 103RD ST,ROBBERY,ATTEMPT ARMED - HANDGUN,34.0,49
3,13001633,2023-03-05 05:15:00,009XX E 104TH ST,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,9.0,50
4,11830976,2019-09-16 11:53:00,009XX E 104TH ST,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,9.0,50


In [82]:
# adding csv for community area numbers
comm_areas = pd.read_csv('data/CommAreas.csv')

# merging community name to df
df1 = pd.merge(df, comm_areas, on='community_area', how='left')
df1.head()

Unnamed: 0,unique_key,date,block,primary_type,description,ward,community_area,community
0,12465794,2021-08-27 10:00:00,104XX S MICHIGAN AVE,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER,9.0,49,ROSELAND
1,12788122,2022-08-08 11:42:00,007XX E 103RD ST,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,9.0,50,PULLMAN
2,12502168,2021-10-04 08:10:00,0000X W 103RD ST,ROBBERY,ATTEMPT ARMED - HANDGUN,34.0,49,ROSELAND
3,13001633,2023-03-05 05:15:00,009XX E 104TH ST,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,9.0,50,PULLMAN
4,11830976,2019-09-16 11:53:00,009XX E 104TH ST,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,9.0,50,PULLMAN


In [None]:
# analysis of most common types and subtypes of crimes over 5 year span

# can do bar graph showing which ones were most common


In [83]:
pd.DataFrame(df['primary_type'].value_counts()).head(10)

Unnamed: 0_level_0,count
primary_type,Unnamed: 1_level_1
THEFT,242353
BATTERY,206458
CRIMINAL DAMAGE,126935
ASSAULT,97394
DECEPTIVE PRACTICE,83666
MOTOR VEHICLE THEFT,73562
OTHER OFFENSE,69693
ROBBERY,40811
WEAPONS VIOLATION,39386
BURGLARY,38242


In [None]:
# analysis by community area (neighborhood/location) / major streets or popular locations?

# map? can compare different community areas / neighborhoods

In [84]:
murder_df = df1[df1['primary_type'] == 'HOMICIDE']
murder_df = murder_df.groupby('community').size().reset_index(name='murders')
murder_df.sort_values(by='murders',ascending=False).head(10)

Unnamed: 0,community,murders
5,AUSTIN,294
51,NORTH LAWNDALE,163
63,SOUTH SHORE,161
69,WEST GARFIELD PARK,153
4,AUBURN GRESHAM,148
32,HUMBOLDT PARK,146
29,GREATER GRAND CROSSING,143
68,WEST ENGLEWOOD,129
23,ENGLEWOOD,125
59,ROSELAND,122


In [85]:
pd.DataFrame(df1['community'].value_counts()).head(10)

Unnamed: 0_level_0,count
community,Unnamed: 1_level_1
AUSTIN,61169
NEAR NORTH SIDE,47274
NEAR WEST SIDE,40137
SOUTH SHORE,38987
LOOP,36044
NORTH LAWNDALE,35165
HUMBOLDT PARK,32004
WEST TOWN,31558
AUBURN GRESHAM,31504
GREATER GRAND CROSSING,29430


In [None]:
# time of year / season / by months

# number of crimes (or violent crimes) per month / season - visualization

In [95]:
df1['month'] = df1['date'].dt.strftime('%B')
df1

Unnamed: 0,unique_key,date,block,primary_type,description,ward,community_area,community,month
0,12465794,2021-08-27 10:00:00,104XX S MICHIGAN AVE,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER,9.0,49,ROSELAND,August
1,12788122,2022-08-08 11:42:00,007XX E 103RD ST,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,9.0,50,PULLMAN,August
2,12502168,2021-10-04 08:10:00,0000X W 103RD ST,ROBBERY,ATTEMPT ARMED - HANDGUN,34.0,49,ROSELAND,October
3,13001633,2023-03-05 05:15:00,009XX E 104TH ST,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,9.0,50,PULLMAN,March
4,11830976,2019-09-16 11:53:00,009XX E 104TH ST,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,9.0,50,PULLMAN,September
...,...,...,...,...,...,...,...,...,...
1120315,12052020,2020-05-15 05:30:00,053XX S DORCHESTER AVE,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,4.0,41,HYDE PARK,May
1120316,12594681,2022-01-17 12:30:00,016XX E 56TH ST,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,5.0,41,HYDE PARK,January
1120317,12099949,2020-06-28 09:00:00,048XX N DAMEN AVE,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,47.0,4,LINCOLN SQUARE,June
1120318,11814121,2019-08-30 12:00:00,052XX W FERDINAND ST,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,37.0,25,AUSTIN,August


In [107]:
season_df = df1.copy()
season_df['season'] = ''

season_map = {
    'December': 'Winter',
    'January': 'Winter',
    'February': 'Winter',
    'March': 'Spring',
    'April': 'Spring',
    'May': 'Spring',
    'June': 'Summer',
    'July': 'Summer',
    'August': 'Summer',
    'September': 'Fall',
    'October': 'Fall',
    'November': 'Fall'
}

# Use the map function to create the 'season' column
season_df['season'] = season_df['month'].map(season_map)
season_df

Unnamed: 0,unique_key,date,block,primary_type,description,ward,community_area,community,month,season
0,12465794,2021-08-27 10:00:00,104XX S MICHIGAN AVE,CRIMINAL SEXUAL ASSAULT,AGGRAVATED - OTHER,9.0,49,ROSELAND,August,Summer
1,12788122,2022-08-08 11:42:00,007XX E 103RD ST,ROBBERY,AGGRAVATED VEHICULAR HIJACKING,9.0,50,PULLMAN,August,Summer
2,12502168,2021-10-04 08:10:00,0000X W 103RD ST,ROBBERY,ATTEMPT ARMED - HANDGUN,34.0,49,ROSELAND,October,Fall
3,13001633,2023-03-05 05:15:00,009XX E 104TH ST,PUBLIC PEACE VIOLATION,RECKLESS CONDUCT,9.0,50,PULLMAN,March,Spring
4,11830976,2019-09-16 11:53:00,009XX E 104TH ST,WEAPONS VIOLATION,UNLAWFUL POSS OF HANDGUN,9.0,50,PULLMAN,September,Fall
...,...,...,...,...,...,...,...,...,...,...
1120315,12052020,2020-05-15 05:30:00,053XX S DORCHESTER AVE,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,4.0,41,HYDE PARK,May,Spring
1120316,12594681,2022-01-17 12:30:00,016XX E 56TH ST,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,5.0,41,HYDE PARK,January,Winter
1120317,12099949,2020-06-28 09:00:00,048XX N DAMEN AVE,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,47.0,4,LINCOLN SQUARE,June,Summer
1120318,11814121,2019-08-30 12:00:00,052XX W FERDINAND ST,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,37.0,25,AUSTIN,August,Summer


In [96]:
pd.DataFrame(df1['month'].value_counts()).head(10)

Unnamed: 0_level_0,count
month,Unnamed: 1_level_1
July,109796
August,109008
September,103883
June,103381
May,101323
January,92546
March,92344
April,88187
October,87129
February,83317


In [102]:
murder1_df = df1[df1['primary_type'] == 'HOMICIDE']
murder1_df = pd.DataFrame(murder1_df['month'].value_counts())
murder1_df

Unnamed: 0_level_0,count
month,Unnamed: 1_level_1
July,394
June,375
September,346
May,337
August,327
April,287
October,250
November,216
January,213
December,205
