## Imports

In [10]:
import os, glob

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load Data

In [2]:
# Merge .csv files
joined_files = os.path.join('\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago','Chicago-Crime*.csv')
print(joined_files)

# Return list of all joined files
joined_list = glob.glob(joined_files)
print(joined_list)

# # Concatenate dataframe
df = pd.concat(map(pd.read_csv, joined_list), ignore_index=True)

\Users\hamma\Documents\GitHub\Project-4-Crime-Data\Data\Chicago\Chicago-Crime*.csv
['\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2001.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2002.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2003.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2004.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2005.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2006.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2007.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2008.csv', '\\Users\\hamma\\Documents\\GitHub\\Project-4-Crime-Data\\Data\\Chicago\\Chicago-Crime_2009.csv', '\\Users\\hamma\\Documents\\GitHub

In [3]:
# Confirm data loaded correctly
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7819943 entries, 0 to 7819942
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   ID                    int64  
 1   Date                  object 
 2   Primary Type          object 
 3   Description           object 
 4   Location Description  object 
 5   Arrest                bool   
 6   Domestic              bool   
 7   Beat                  int64  
 8   District              float64
 9   Ward                  float64
 10  Latitude              float64
 11  Longitude             float64
dtypes: bool(2), float64(4), int64(2), object(4)
memory usage: 611.5+ MB


Unnamed: 0,ID,Date,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Latitude,Longitude
0,1427622,01/01/2001 01:00:00 PM,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,2521,25.0,,41.929707,-87.739828
1,1316324,01/01/2001 01:00:00 PM,THEFT,OVER $500,STREET,False,False,1513,15.0,,41.869008,-87.773947
2,1319931,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,False,825,8.0,,41.783892,-87.684841
3,1584605,01/01/2001 01:00:00 AM,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,1434,14.0,,41.915639,-87.688701
4,1311503,01/01/2001 01:00:00 AM,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER DANG WEAPON,BAR OR TAVERN,False,False,1933,19.0,,41.931374,-87.648819


In [6]:
# Make column names lowercase
df.columns = df.columns.str.lower()

In [9]:
df['id'].value_counts().sum()

# Remove 'id' column
df = df.drop(columns='id')
df.head()

Unnamed: 0,date,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
0,01/01/2001 01:00:00 PM,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,2521,25.0,,41.929707,-87.739828
1,01/01/2001 01:00:00 PM,THEFT,OVER $500,STREET,False,False,1513,15.0,,41.869008,-87.773947
2,01/01/2001 01:00:00 PM,BATTERY,SIMPLE,RESIDENCE,False,False,825,8.0,,41.783892,-87.684841
3,01/01/2001 01:00:00 AM,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,1434,14.0,,41.915639,-87.688701
4,01/01/2001 01:00:00 AM,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER DANG WEAPON,BAR OR TAVERN,False,False,1933,19.0,,41.931374,-87.648819


## Prepare Data for Time Series Analysis

In [11]:
# Create copy of original dataframe
df_ts = df.copy()

In [34]:
# Convert 'date' column to datetime dtype
df_ts['date'] = pd.to_datetime(df['date']).dt.time
df_ts.info()
df_ts.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7819943 entries, 0 to 7819942
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   date                  object 
 1   primary type          object 
 2   description           object 
 3   location description  object 
 4   arrest                bool   
 5   domestic              bool   
 6   beat                  int64  
 7   district              float64
 8   ward                  float64
 9   latitude              float64
 10  longitude             float64
dtypes: bool(2), float64(4), int64(1), object(4)
memory usage: 551.9+ MB


Unnamed: 0,date,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
0,13:00:00,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,2521,25.0,,41.929707,-87.739828
1,13:00:00,THEFT,OVER $500,STREET,False,False,1513,15.0,,41.869008,-87.773947
2,13:00:00,BATTERY,SIMPLE,RESIDENCE,False,False,825,8.0,,41.783892,-87.684841
3,01:00:00,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,1434,14.0,,41.915639,-87.688701
4,01:00:00,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER DANG WEAPON,BAR OR TAVERN,False,False,1933,19.0,,41.931374,-87.648819


In [None]:
df_ts['date'] = pd.to_datetime(df['date'])

In [39]:
df_ts.dtypes

date                    datetime64[ns]
primary type                    object
description                     object
location description            object
arrest                            bool
domestic                          bool
beat                             int64
district                       float64
ward                           float64
latitude                       float64
longitude                      float64
dtype: object

In [40]:
# Set 'date' column as index
df_ts = df_ts.set_index('date')
df_ts.head()

Unnamed: 0_level_0,primary type,description,location description,arrest,domestic,beat,district,ward,latitude,longitude
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2001-01-01 13:00:00,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,2521,25.0,,41.929707,-87.739828
2001-01-01 13:00:00,THEFT,OVER $500,STREET,False,False,1513,15.0,,41.869008,-87.773947
2001-01-01 13:00:00,BATTERY,SIMPLE,RESIDENCE,False,False,825,8.0,,41.783892,-87.684841
2001-01-01 01:00:00,OTHER OFFENSE,HARASSMENT BY TELEPHONE,RESIDENCE,False,False,1434,14.0,,41.915639,-87.688701
2001-01-01 01:00:00,CRIM SEXUAL ASSAULT,AGGRAVATED: OTHER DANG WEAPON,BAR OR TAVERN,False,False,1933,19.0,,41.931374,-87.648819


In [41]:
# Check for missing values
df_ts.isna().sum()

primary type                 0
description                  0
location description     10609
arrest                       0
domestic                     0
beat                         0
district                    47
ward                    614848
latitude                 87672
longitude                87672
dtype: int64

In [42]:
df_ts['location description'].value_counts()

STREET                    2036937
RESIDENCE                 1310234
APARTMENT                  885985
SIDEWALK                   730602
OTHER                      270018
                           ...   
JUNK YARD/GARBAGE DUMP          1
FUNERAL PARLOR                  1
TRUCKING TERMINAL               1
LIVERY AUTO                     1
ROOF                            1
Name: location description, Length: 216, dtype: int64