<a href="https://colab.research.google.com/github/harshakoneru98/city_watch/blob/main/modelling/Crime_Rate_Forecasting_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import json
import tqdm
import seaborn as sns
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot as plt
import datetime
from sklearn.ensemble import GradientBoostingRegressor
ROOT_PATH = "/content/drive/MyDrive/DSCI 560/Datasets/Cleaned_data/"

In [None]:
crimes = pd.read_csv(f"{ROOT_PATH}/crime_date_aligned.csv")

In [None]:
crime_by_date_zipcode = crimes.groupby(["date_occ", "zip_code"]).agg({ "dr_no": "count"}).reset_index()

In [None]:
import pandas as pd

# Convert the 'date' column to datetime format
crime_by_date_zipcode['date'] = pd.to_datetime(crime_by_date_zipcode['date_occ'])

# Create a new column for the day of the week
crime_by_date_zipcode['day_of_week'] = crime_by_date_zipcode['date'].dt.dayofweek

# Create a new column for the week of the year
crime_by_date_zipcode['week_of_year'] = crime_by_date_zipcode['date'].dt.weekofyear

# Create a new column for the day of the month
crime_by_date_zipcode['day_of_month'] = crime_by_date_zipcode['date'].dt.day

# Create a new column for the month of the year
crime_by_date_zipcode['month_of_year'] = crime_by_date_zipcode['date'].dt.month

# Create a new column for the year
crime_by_date_zipcode['year'] = crime_by_date_zipcode['date'].dt.year

# Define the date offset for the rolling window

# Create a rolling window for the previous 7 days of crimes
crime_by_date_zipcode['previous_7_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].rolling(window="7D").count().values
crime_by_date_zipcode['previous_14_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].rolling(window="14D").count().values
crime_by_date_zipcode['previous_28_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].rolling(window="28D").count().values
# Define the date offset for the exponential moving average window
ema_date_offset = pd.DateOffset(days=1)

# Create exponential moving averages for the previous 7 days of crimes
crime_by_date_zipcode['ema_7_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].ewm(span=7, min_periods=1).mean().values
crime_by_date_zipcode['ema_14_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].ewm(span=14, min_periods=1).mean().values
crime_by_date_zipcode['ema_28_days'] = crime_by_date_zipcode.set_index('date').groupby('zip_code')['dr_no'].ewm(span=28, min_periods=1).mean().values

# Create difference features for the rolling window and exponential moving average
crime_by_date_zipcode['diff_7_days'] = crime_by_date_zipcode.groupby('zip_code')['previous_7_days'].diff()
crime_by_date_zipcode['diff_ema_7_days'] = crime_by_date_zipcode.groupby('zip_code')['ema_7_days'].diff()
crime_by_date_zipcode['diff_14_days'] = crime_by_date_zipcode.groupby('zip_code')['previous_14_days'].diff()
crime_by_date_zipcode['diff_ema_14_days'] = crime_by_date_zipcode.groupby('zip_code')['ema_14_days'].diff()
crime_by_date_zipcode['diff_28_days'] = crime_by_date_zipcode.groupby('zip_code')['previous_14_days'].diff()
crime_by_date_zipcode['diff_ema_28_days'] = crime_by_date_zipcode.groupby('zip_code')['ema_14_days'].diff()

# Drop any rows that have NaN values (i.e. the first 7 rows)
crime_by_date_zipcode = crime_by_date_zipcode.dropna()

  crime_by_date_zipcode['week_of_year'] = crime_by_date_zipcode['date'].dt.weekofyear


In [None]:
crime_by_date_zipcode.head(5)

Unnamed: 0,date_occ,zip_code,dr_no,date,day_of_week,week_of_year,day_of_month,month_of_year,year,previous_7_days,...,previous_28_days,ema_7_days,ema_14_days,ema_28_days,diff_7_days,diff_ema_7_days,diff_14_days,diff_ema_14_days,diff_28_days,diff_ema_28_days
105,2013-01-03,90001,1,2013-01-03,3,1,3,1,2013,2.0,...,6.0,1.160465,1.186353,1.205572,1.0,-0.839535,1.0,-0.813647,1.0,-0.813647
106,2013-01-03,90002,3,2013-01-03,3,1,3,1,2013,1.0,...,2.0,1.120349,1.161506,1.191387,-1.0,-0.308223,-1.0,-0.30278,-1.0,-0.30278
107,2013-01-03,90003,8,2013-01-03,3,1,3,1,2013,1.0,...,2.0,1.090261,1.139972,1.178182,-2.0,-0.152982,-2.0,-0.146955,-2.0,-0.146955
108,2013-01-03,90004,3,2013-01-03,3,1,3,1,2013,1.0,...,2.0,1.067696,1.121309,1.165889,0.0,-0.08659,0.0,-0.077839,0.0,-0.077839
109,2013-01-03,90005,12,2013-01-03,3,1,3,1,2013,2.0,...,3.0,1.050772,1.105134,1.154444,0.0,-0.052941,1.0,-0.042056,1.0,-0.042056


In [None]:
train_data = crime_by_date_zipcode[crime_by_date_zipcode['date'] <= datetime.datetime(2022, 9, 30)]
test_data = crime_by_date_zipcode[crime_by_date_zipcode['date'] > datetime.datetime(2022, 9, 30)]
IGNORED_COLUMNS = ['date', 'zip_code', 'date_occ']
input_columns = [x for x in train_data.columns if x not in IGNORED_COLUMNS + ['diff_7_days']]
train_X = train_data[input_columns]
train_Y = train_data['diff_7_days']
test_X = test_data[input_columns]
test_Y = test_data['diff_7_days']

In [None]:
# univariate ARIMA
# Fit the ARIMA model
model = ARIMA(train_Y, order=(1, 0, 0))
model_fit = model.fit()

# Make predictions for the test set
predictions = model_fit.forecast(steps=len(test_Y))

# Calculate the mean squared error of the predictions
mse = mean_squared_error(test_Y, predictions)

print('Root Mean Squared Error:', np.sqrt(mse))

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(


Root Mean Squared Error: 1.180917526221279


In [None]:
# Random Forest Regressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(train_X, train_Y)

# Generate predictions on the test data
predictions = model.predict(test_X)

# Calculate the root mean squared error of the predictions
rmse = np.sqrt(mean_squared_error(predictions, test_Y))
print("Root Mean Squared Error", rmse)

Root Mean Squared Error 0.5381362836622471


In [None]:
# Define the SVM Regressor model
model = SVR(kernel='rbf', C=10, gamma=0.1)

# Fit the model to the training data
model.fit(train_X, train_Y)

# Generate predictions on the test data
predictions = model.predict(test_X)

# Calculate the root mean squared error of the predictions
rmse = np.sqrt(mean_squared_error(predictions, test_Y))
print(f'RMSE: {rmse}')

RMSE: 0.4747822271448964


In [None]:
# gradient boosting
# Define the Gradient Boosting Regressor model
model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)

# Fit the model to the training data
model.fit(train_X, train_Y)

# Generate predictions on the test data
predictions = model.predict(test_X)

# Calculate the root mean squared error of the predictions
rmse = np.sqrt(mean_squared_error(predictions, test_Y))
print(f'RMSE: {rmse}')


RMSE: 0.5214326891172795
