In [1]:
import pandas as pd

# Location of the raw crime + weather extract.
# NOTE(review): this is a hard-coded, user-specific path; anyone else running the
# notebook has to edit it.
raw_csv_path = r"\Users\singh\Downloads\crime_with_weather.csv"
crime = pd.read_csv(raw_csv_path)

  crime = pd.read_csv(r"\Users\singh\Downloads\crime_with_weather.csv")


In [2]:
# Display the column labels of the frame that was just loaded.
crime.columns

Index(['object_id', 'primary_key', 'case_number', 'district', 'ucr_desc',
       'time_group', 'reported_date', 'offense_month', 'offense_day',
       'time_block', 'dow_name', 'dow', 'hour_of_day', 'days_ago',
       'offense_date', 'statute', 'city', 'zip', 'stat_desc', 'address_public',
       'std_parcelpin', 'ward', 'census_tract', 'census_tract_geoid',
       'census_block_group', 'census_bg_geoid', 'census_block',
       'census_block_geoid', 'lat', 'lon', 'offense_year', 'geoid', 'date',
       'temp_max', 'temp_min', 'daylight_duration', 'precipitation_sum',
       'precipitation_hours'],
      dtype='object')

In [3]:
# Columns removed from the working frame. Each label appears exactly once;
# the groups below are only a reading aid (grouped by apparent purpose).
cols = [
    # record identifiers
    'object_id', 'primary_key', 'case_number',
    # statute text
    'statute', 'stat_desc',
    # address / administrative geography
    'address_public', 'std_parcelpin', 'city', 'zip', 'ward', 'district',
    # census identifiers other than 'census_block'
    'census_tract', 'census_tract_geoid', 'census_block_group', 'census_bg_geoid',
    'census_block_geoid', 'geoid',
    # date / time fields not used further
    'reported_date', 'date', 'days_ago', 'dow_name', 'time_group', 'time_block',
]

crime = crime.drop(columns=cols)

In [4]:
cols = ['census_block']

# Report, per listed column, how many rows carry the literal value 'Not Located'.
for col in cols:
    count_not_located = crime[col].eq('Not Located').sum()
    print(f"'{col}' has {count_not_located} 'Not Located' entries")

'census_block' has 0 'Not Located' entries


In [5]:
def add_extracted_date_columns(crime):
    """Parse 'offense_date' and derive calendar month/day columns from it.

    Parameters
    ----------
    crime : pandas.DataFrame
        Frame containing an 'offense_date' column that ``pd.to_datetime`` can
        parse; unparseable values are coerced to NaT.

    Returns
    -------
    pandas.DataFrame
        A new frame (the argument itself is not modified) in which
        'offense_date' is datetime-typed, rows whose date could not be parsed
        are removed, and 'extracted_month' / 'extracted_day' hold the month
        and day-of-month taken from 'offense_date'. The original row index is
        kept for the surviving rows.
    """
    # .assign returns a new frame, so the caller's object is never mutated.
    dated = crime.assign(
        offense_date=pd.to_datetime(crime['offense_date'], errors='coerce')
    )
    # Drop rows where 'offense_date' is NaT
    dated = dated.dropna(subset=['offense_date'])
    return dated.assign(
        extracted_month=dated['offense_date'].dt.month,
        extracted_day=dated['offense_date'].dt.day,
    )

def date_discrepancy(crime, month, day):
    """Count rows whose month/day columns disagree with the extracted ones.

    ``month`` and ``day`` are column names in ``crime``; they are compared with
    'extracted_month' and 'extracted_day' respectively. A row counts as a
    discrepancy when either comparison fails. Returns the count as an int.
    """
    month_matches = crime[month] == crime['extracted_month']
    day_matches = crime[day] == crime['extracted_day']
    # A row is consistent only when both parts agree; everything else is a discrepancy.
    consistent = month_matches & day_matches
    return int((~consistent).sum())

# Parse 'offense_date' and add the 'extracted_month' / 'extracted_day' columns.
crime = add_extracted_date_columns(crime)

# Count rows whose supplied 'offense_month' / 'offense_day' disagree with the
# values derived from 'offense_date'.
initial_discrepancies = date_discrepancy(crime, 'offense_month', 'offense_day')
print(f"Initial discrepancies: {initial_discrepancies}")

# From here on only the columns derived from 'offense_date' are kept.
crime = crime.drop(columns=['offense_month', 'offense_day'])

# NOTE(review): this call compares the extracted columns with themselves, so it
# reports 0 whenever those columns contain no missing values; it is not an
# independent validation of the data.
final_discrepancies = date_discrepancy(crime, 'extracted_month', 'extracted_day')
print(f"Final discrepancies: {final_discrepancies}")

Initial discrepancies: 128136
Final discrepancies: 0


In [6]:
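# NOTE: the year filter below is currently disabled (both lines are commented out),
# so every year present in the data is kept.
# Intended filter: only use entries from 2018-2022 (Census data covers those years);
# earlier years do not have enough values.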

#crime = crime[(crime['offense_year'] > 2017) & (crime['offense_year'] < 2023)]
#crime.shape[0]

In [7]:
# Collect every row that contains at least one missing value

has_missing_value = crime.isna().any(axis=1)
rows_with_null = crime[has_missing_value]
rows_with_null

Unnamed: 0,ucr_desc,dow,hour_of_day,offense_date,census_block,lat,lon,offense_year,temp_max,temp_min,daylight_duration,precipitation_sum,precipitation_hours,extracted_month,extracted_day


In [8]:
# Select every row that has an exact copy elsewhere in the frame
# (keep=False marks all members of a duplicate group, not just the repeats).
is_duplicated = crime.duplicated(keep=False)
duplicate_rows = crime.loc[is_duplicated]
print(f"Total exact duplicates found: {len(duplicate_rows)}")

Total exact duplicates found: 48265


In [9]:
# Keep the first occurrence of each exact duplicate, then confirm none remain.
# NOTE(review): identifier columns such as 'case_number' were dropped earlier in
# this notebook, so rows that differed only by identifier are treated as
# duplicates here — confirm that this is intended.
crime = crime.drop_duplicates(keep='first')
duplicate_rows = crime.loc[crime.duplicated(keep=False)]
print(f"Total exact duplicates found: {len(duplicate_rows)}")

Total exact duplicates found: 0


In [10]:
# We now have a "complete" dataset
# (the preceding checks found no rows with missing values and no exact duplicates)

crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 498581 entries, 0 to 524389
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   ucr_desc             498581 non-null  object        
 1   dow                  498581 non-null  int64         
 2   hour_of_day          498581 non-null  int64         
 3   offense_date         498581 non-null  datetime64[ns]
 4   census_block         498581 non-null  object        
 5   lat                  498581 non-null  float64       
 6   lon                  498581 non-null  float64       
 7   offense_year         498581 non-null  int64         
 8   temp_max             498581 non-null  float64       
 9   temp_min             498581 non-null  float64       
 10  daylight_duration    498581 non-null  float64       
 11  precipitation_sum    498581 non-null  float64       
 12  precipitation_hours  498581 non-null  float64       
 13  extracted_month    

In [11]:
import numpy as np

def _cyclical_angle(values, period):
    """Return the angle in radians for zero-based ``values`` on a cycle of length ``period``."""
    return 2 * np.pi * values / period


# One entry per cyclical feature: (column prefix, zero-based values, cycle length).
# 'dow', 'extracted_day' and 'extracted_month' have a minimum value of 1, so 1 is
# subtracted to make them start from 0; 'hour_of_day' is used as-is.
cyclical_features = [
    ('dow', crime['dow'] - 1, 7),
    ('day', crime['extracted_day'] - 1, 31),
    ('month', crime['extracted_month'] - 1, 12),
    ('hour', crime['hour_of_day'], 24),
]

# Encode each feature as a (sin, cos) pair so the end of a cycle sits next to its start.
for prefix, values, period in cyclical_features:
    angle = _cyclical_angle(values, period)
    crime[f'{prefix}_sin'] = np.sin(angle)
    crime[f'{prefix}_cos'] = np.cos(angle)

# The raw columns are replaced by their sin/cos encodings.
crime = crime.drop(columns=['dow', 'hour_of_day', 'extracted_month', 'extracted_day'])

In [12]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the listed columns into '<name>_numeric' columns and remember,
# per column, which original value was assigned which integer code.
cols = ['ucr_desc', 'offense_year', 'census_block']
mappings = {}

for col in cols:
    le = LabelEncoder()
    encoded = le.fit_transform(crime[col])
    crime[f'{col}_numeric'] = encoded

    class_codes = le.transform(le.classes_)
    mappings[col] = {label: code for label, code in zip(le.classes_, class_codes)}

# Print every mapping, each followed by blank lines as a separator.
for col, mapping in mappings.items():
    print(f"{col}: {mapping}", end="\n\n\n")

ucr_desc: {'1C': 0, 'All Other Offenses': 1, 'Arson': 2, 'Assault': 3, 'Burglary': 4, 'Disorderly Conduct': 5, 'Driving Under The Influence': 6, 'Drug Abuse Violations': 7, 'Drunkenness': 8, 'Embezzlement': 9, 'Fel Assault': 10, 'Forgery & Counterfeiting': 11, 'Fraud': 12, 'GTMV': 13, 'Gambling': 14, 'Homicide': 15, 'Liquor Laws': 16, 'Offense Against Family/Children': 17, 'Property Damaged': 18, 'Prostitution': 19, 'Rape': 20, 'Robbery': 21, 'Sex Offenses': 22, 'Stolen Property': 23, 'Theft': 24, 'Traffic Violations': 25, 'Vandalism': 26, 'Weapons': 27}


offense_year: {2018: 0, 2019: 1, 2020: 2, 2021: 3, 2022: 4, 2023: 5, 2024: 6}


census_block: {'Block 1000': 0, 'Block 1001': 1, 'Block 1002': 2, 'Block 1003': 3, 'Block 1004': 4, 'Block 1005': 5, 'Block 1006': 6, 'Block 1007': 7, 'Block 1008': 8, 'Block 1009': 9, 'Block 1010': 10, 'Block 1011': 11, 'Block 1012': 12, 'Block 1013': 13, 'Block 1014': 14, 'Block 1015': 15, 'Block 1016': 16, 'Block 1017': 17, 'Block 1018': 18, 'Block 101

In [14]:
# Drop the last remaining unneeded columns.
# The only remaining column that may be unnecessary is 'offense_date';
# it is kept because a column in datetime format is useful to have.

cols = ['ucr_desc', 'offense_year', 'census_block']

crime = crime.drop(labels=cols, axis='columns')

In [15]:
# Calculate the ISO week of the year from 'offense_date'
crime['week_of_year'] = crime['offense_date'].dt.isocalendar().week

# Create interaction features (plain element-wise products of existing columns)
crime['week_precipitation_interaction'] = crime['week_of_year'] * crime['precipitation_sum']
crime['daylight_precipitation_interaction'] = crime['daylight_duration'] * crime['precipitation_sum']

crime['block_week_interaction'] = crime['census_block_numeric'] * crime['week_of_year']
crime['block_temp_max_interaction'] = crime['census_block_numeric'] * crime['temp_max']

# Daily temperature spread and its interaction with precipitation
crime['temp_range'] = crime['temp_max'] - crime['temp_min']
crime['temp_range_precipitation_interaction'] = crime['temp_range'] * crime['precipitation_sum']

crime['precipitation_sum_hours_interaction'] = crime['precipitation_sum'] * crime['precipitation_hours']

from pandas.tseries.holiday import USFederalHolidayCalendar

# Flag offenses that fall on a US federal holiday (1) versus any other day (0).
cal = USFederalHolidayCalendar()
# The calendar returns holidays as midnight timestamps and discards any that lie
# before `start`. Using midnight-normalised bounds keeps a holiday that falls on
# the first day of the data even when the earliest offense happened later that day.
offense_days = crime['offense_date'].dt.normalize()
holidays = cal.holidays(start=offense_days.min(), end=offense_days.max())
crime['is_holiday'] = offense_days.isin(holidays).astype(int)

In [16]:
# Collect every row that contains at least one missing value after feature engineering

has_missing_value = crime.isna().any(axis=1)
rows_with_null = crime[has_missing_value]
rows_with_null

Unnamed: 0,offense_date,lat,lon,temp_max,temp_min,daylight_duration,precipitation_sum,precipitation_hours,dow_sin,dow_cos,...,census_block_numeric,week_of_year,week_precipitation_interaction,daylight_precipitation_interaction,block_week_interaction,block_temp_max_interaction,temp_range,temp_range_precipitation_interaction,precipitation_sum_hours_interaction,is_holiday


In [17]:
# Re-check for exact duplicate rows now that the engineered columns are in place.
is_duplicated = crime.duplicated(keep=False)
duplicate_rows = crime.loc[is_duplicated]
print(f"Total exact duplicates found: {len(duplicate_rows)}")

Total exact duplicates found: 0


In [18]:
# Display the final set of column labels that will be written out.
crime.columns

Index(['offense_date', 'lat', 'lon', 'temp_max', 'temp_min',
       'daylight_duration', 'precipitation_sum', 'precipitation_hours',
       'dow_sin', 'dow_cos', 'day_sin', 'day_cos', 'month_sin', 'month_cos',
       'hour_sin', 'hour_cos', 'ucr_desc_numeric', 'offense_year_numeric',
       'census_block_numeric', 'week_of_year',
       'week_precipitation_interaction', 'daylight_precipitation_interaction',
       'block_week_interaction', 'block_temp_max_interaction', 'temp_range',
       'temp_range_precipitation_interaction',
       'precipitation_sum_hours_interaction', 'is_holiday'],
      dtype='object')

In [19]:
# Important to note that we do not perform any feature scaling here
# This is because random forests are scale-invariant
# The frame is written in CSV format, without the index, to a relative path.
# NOTE(review): the output name has no ".csv" extension; anything that reads this
# file must use exactly this name.

crime.to_csv("crime_weather_preprocessed", index=False)