## Imports

In [1]:
import os, glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
# Locate the per-year Chicago crime .csv files and merge them into one dataframe.
# The data folder can be overridden with the CHICAGO_CRIME_DATA_DIR environment
# variable so the notebook is not tied to a single machine; when the variable is
# unset the original local folder is used.
data_dir = os.environ.get(
    'CHICAGO_CRIME_DATA_DIR',
    '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago',
)
joined_files = os.path.join(data_dir, 'Chicago-Crime*.csv')
print(joined_files)

# Return list of all files matching the pattern
# (glob returns matches in arbitrary order, so row order follows that order)
joined_list = glob.glob(joined_files)
print(joined_list)

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" error when the pattern matched nothing
if not joined_list:
    raise FileNotFoundError(f'No files matched pattern: {joined_files}')

# Concatenate the per-file dataframes, renumbering rows from 0
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime*.csv
['/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2019.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2018.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2008.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2020.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2021.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2009.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2023.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2022.csv', '/Users/kevinbarnett/Documents/CodingDojo/Project 4 Crime Data/Data/Chicago/Chicago-Crime_2007.cs

In [3]:
# Sanity-check the merged frame: column dtypes / memory first, then the first 5 rows
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7819943 entries, 0 to 7819942
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Date                  object 
 2   Primary Type          object 
 3   Description           object 
 4   Location Description  object 
 5   Arrest                bool   
 6   Domestic              bool   
 7   Beat                  int64  
 8   District              float64
 9   Ward                  float64
 10  Latitude              float64
 11  Longitude             float64
dtypes: bool(2), float64(4), int64(2), object(4)
memory usage: 611.5+ MB


Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,11554161,01/01/2019 01:00:00 AM,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,1812,18.0,43.0,41.922751,-87.644994
1,12172257,01/01/2019 01:00:00 PM,DECEPTIVE PRACTICE,COUNTERFEITING DOCUMENT,RESIDENCE,False,False,1614,16.0,41.0,41.974061,-87.8452
2,11553582,01/01/2019 01:00:00 AM,THEFT,OVER $500,APARTMENT,False,True,1923,19.0,46.0,41.94946,-87.651974
3,11574022,01/01/2019 01:00:00 AM,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE,False,False,1834,18.0,42.0,41.892659,-87.6166
4,11562299,01/01/2019 01:00:00 PM,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,False,False,1932,19.0,43.0,41.925284,-87.658967


In [4]:
# Normalise every column label to lowercase (labels only; the data is untouched)
df.columns = [label.lower() for label in df.columns]

In [5]:
# Remove 'id' column.
# The former `df['id'].value_counts().sum()` line was dropped: its result was
# neither displayed nor stored, so it only cost a full pass over the column.
# errors='ignore' keeps this cell re-runnable: running it again after 'id' is
# already gone is a no-op instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')
df.head()

Unnamed: 0,date,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
0,01/01/2019 01:00:00 AM,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,1812,18.0,43.0,41.922751,-87.644994
1,01/01/2019 01:00:00 PM,DECEPTIVE PRACTICE,COUNTERFEITING DOCUMENT,RESIDENCE,False,False,1614,16.0,41.0,41.974061,-87.8452
2,01/01/2019 01:00:00 AM,THEFT,OVER $500,APARTMENT,False,True,1923,19.0,46.0,41.94946,-87.651974
3,01/01/2019 01:00:00 AM,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE,False,False,1834,18.0,42.0,41.892659,-87.6166
4,01/01/2019 01:00:00 PM,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,False,False,1932,19.0,43.0,41.925284,-87.658967


## Prepare Data for Time Series Analysis

In [6]:
# Work on an independent (deep) copy so the loaded dataframe `df` stays unchanged
df_ts = df.copy(deep=True)

In [7]:
# Parse the 'date' strings and keep ONLY the time-of-day part of each timestamp.
# NOTE: `.dt.time` yields Python `datetime.time` objects, so the column dtype stays
# `object` (not datetime64) and the calendar date is discarded.
# NOTE(review): a following cell overwrites this column with the full parsed
# datetime, so this step looks exploratory — confirm whether it is still needed.
df_ts['date'] = pd.to_datetime(df['date']).dt.time
df_ts.info()
df_ts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7819943 entries, 0 to 7819942
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   date                  object 
 1   primary type          object 
 2   description           object 
 3   location description  object 
 4   arrest                bool   
 5   domestic              bool   
 6   beat                  int64  
 7   district              float64
 8   ward                  float64
 9   latitude              float64
 10  longitude             float64
dtypes: bool(2), float64(4), int64(1), object(4)
memory usage: 551.9+ MB


Unnamed: 0,date,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
0,01:00:00,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,1812,18.0,43.0,41.922751,-87.644994
1,13:00:00,DECEPTIVE PRACTICE,COUNTERFEITING DOCUMENT,RESIDENCE,False,False,1614,16.0,41.0,41.974061,-87.8452
2,01:00:00,THEFT,OVER $500,APARTMENT,False,True,1923,19.0,46.0,41.94946,-87.651974
3,01:00:00,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE,False,False,1834,18.0,42.0,41.892659,-87.6166
4,13:00:00,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,False,False,1932,19.0,43.0,41.925284,-87.658967


In [8]:
# Parse the raw 'date' strings from `df` into datetime64 values.
# An explicit format (matching strings such as '01/01/2019 01:00:00 PM') avoids
# per-value format guessing, which is slow on millions of rows, and makes the
# month-first ordering explicit rather than implied.
df_ts['date'] = pd.to_datetime(df['date'], format='%m/%d/%Y %I:%M:%S %p')

In [9]:
# Display the dtype of each column to confirm 'date' was parsed as datetime
df_ts.dtypes

date                    datetime64[ns]
primary type                    object
description                     object
location description            object
arrest                            bool
domestic                          bool
beat                             int64
district                       float64
ward                           float64
latitude                       float64
longitude                      float64
dtype: object

In [10]:
# Set 'date' column as index.
# Guarded on the index name so the cell can be re-run safely: once 'date' is
# already the index, calling set_index('date') again would raise KeyError.
if df_ts.index.name != 'date':
    df_ts = df_ts.set_index('date')
df_ts.head()

Unnamed: 0_level_0,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-01 01:00:00,THEFT,FROM BUILDING,BAR OR TAVERN,False,False,1812,18.0,43.0,41.922751,-87.644994
2019-01-01 13:00:00,DECEPTIVE PRACTICE,COUNTERFEITING DOCUMENT,RESIDENCE,False,False,1614,16.0,41.0,41.974061,-87.8452
2019-01-01 01:00:00,THEFT,OVER $500,APARTMENT,False,True,1923,19.0,46.0,41.94946,-87.651974
2019-01-01 01:00:00,CRIM SEXUAL ASSAULT,NON-AGGRAVATED,RESIDENCE,False,False,1834,18.0,42.0,41.892659,-87.6166
2019-01-01 13:00:00,DECEPTIVE PRACTICE,"THEFT BY LESSEE,MOTOR VEH",OTHER,False,False,1932,19.0,43.0,41.925284,-87.658967


In [11]:
# Count the missing (NaN) entries in each column
df_ts.isnull().sum()

primary type                 0
description                  0
location description     10609
arrest                       0
domestic                     0
beat                         0
district                    47
ward                    614848
latitude                 87672
longitude                87672
dtype: int64

In [12]:
# Frequency of each 'location description' value, most common first
# (value_counts leaves out missing values by default)
df_ts['location description'].value_counts()

STREET                   2036937
RESIDENCE                1310234
APARTMENT                 885985
SIDEWALK                  730602
OTHER                     270018
                          ...   
POOLROOM                       1
EXPRESSWAY EMBANKMENT          1
CLEANERS/LAUNDROMAT            1
LAGOON                         1
TRUCKING TERMINAL              1
Name: location description, Length: 216, dtype: int64

In [35]:
# Creating a list of dataframe columns
columns = list(df_ts)

# Print the value counts of every object- or bool-typed column.
# The previous test `dtypes == 'object' or 'bool'` was always truthy (the
# non-empty string 'bool' is evaluated on its own), so numeric columns were
# printed as well; the dtype is now checked explicitly. The trailing
# `if ...: pass` branch did nothing and was removed.
for col in columns:
    col_dtype = df_ts[col].dtype
    if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_bool_dtype(col_dtype):
        print('Column: ',col)
        print(df_ts[col].value_counts(),'\n')

Column:  primary type
THEFT                                1649683
BATTERY                              1429194
CRIMINAL DAMAGE                       891283
NARCOTICS                             748277
ASSAULT                               510463
OTHER OFFENSE                         485627
BURGLARY                              425402
MOTOR VEHICLE THEFT                   379255
DECEPTIVE PRACTICE                    347433
ROBBERY                               293539
CRIMINAL TRESPASS                     214924
WEAPONS VIOLATION                     107650
PROSTITUTION                           69857
OFFENSE INVOLVING CHILDREN             56031
PUBLIC PEACE VIOLATION                 52454
SEX OFFENSE                            30885
CRIM SEXUAL ASSAULT                    27611
INTERFERENCE WITH PUBLIC OFFICER       18464
LIQUOR LAW VIOLATION                   14934
GAMBLING                               14619
ARSON                                  13335
HOMICIDE                         