In [None]:
# All code in these cells was produced by ChatGPT except where modifications are noted

In [10]:
# Code to create a model based on ARIMA
# ChatGPT code modified as noted below to make output more readable
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
import warnings

# load the dataset
df = pd.read_pickle('data/chi_Chat_time_series_most_crime_CBGs_06_19.p')

# convert the date column to a datetime type and set it as the index
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# select the relevant columns
features = ['Sanitation/Sewer Issue Average Time Open', 'Street Light Repairs Average Time Open', 'Street Repairs Average Time Open']

# create a list of unique Census Block Groups
cbgs = df['Census Block Group'].unique()

# loop through each Census Block Group
for cbg in cbgs:
    df_cbg = df[df['Census Block Group'] == cbg]
    X = df_cbg[features]
    y = df_cbg['Total Crime']
    
    # fit the ARIMA model
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = ARIMA(y, order=(1, 1, 1))
        model_fit = model.fit()
    
    # make predictions
    predictions = model_fit.predict(start=len(y) - 5, end=len(y) - 1, dynamic=False)
    
    # compare the predictions to the actual values
    actual = y.iloc[-5:]
    # CODE BELOW MODIFED BY ME TO ENHANCE OUTPUT READABILITY
    print('\npredictions', predictions, '\nactuals', actual)
    accuracy = (predictions == actual).mean()
    
    print("Accuracy for Census Block Group", cbg)
    print(accuracy)



predictions 2022-05-22    5.101342
2022-05-29    5.339171
2022-06-05    5.404364
2022-06-12    4.943405
2022-06-19    4.924922
Freq: W-SUN, dtype: float64 
actuals Date
2022-05-22    6.0
2022-05-29    6.0
2022-06-05    3.0
2022-06-12    4.0
2022-06-19    NaN
Name: Total Crime, dtype: float64
Accuracy for Census Block Group 170313201021
0.0

predictions 2022-05-22    11.252198
2022-05-29    10.562051
2022-06-05    10.558614
2022-06-12    12.459191
2022-06-19    11.510830
Freq: W-SUN, dtype: float64 
actuals Date
2022-05-22     8.0
2022-05-29    10.0
2022-06-05    21.0
2022-06-12     9.0
2022-06-19     NaN
Name: Total Crime, dtype: float64
Accuracy for Census Block Group 170318391001
0.0

predictions 2022-05-22    5.136862
2022-05-29    4.933562
2022-06-05    5.105304
2022-06-12    5.212533
2022-06-19    5.579428
Freq: W-SUN, dtype: float64 
actuals Date
2022-05-22    3.0
2022-05-29    7.0
2022-06-05    6.0
2022-06-12    9.0
2022-06-19    NaN
Name: Total Crime, dtype: float64
Accuracy f

In [None]:
# code to create a VAR-based model
import pandas as pd
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.vector_ar.var_model import VAR
import warnings

# load the dataset
df = pd.read_pickle('data/chi_Chat_time_series_most_crime_CBGs_06_19.p')

# convert the date column to a datetime type and set it as the index
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

# select the relevant columns
features = ['Sanitation/Sewer Issue Average Time Open', 'Street Light Repairs Average Time Open', 'Street Repairs Average Time Open', 'Total Crime']

# create a list of unique Census Block Groups
cbgs = df['Census Block Group'].unique()

# loop through each Census Block Group
for cbg in cbgs:
    df_cbg = df[df['Census Block Group'] == cbg]
    X = df_cbg[features]
    
    # replace NaN values with the mean of each feature
    X.fillna(X.mean(), inplace=True)
    
    # check if the data is stationary
    stationary = True
    for feature in features:
        result = adfuller(X[feature])
        if result[1] > 0.05:
            stationary = False
            break
    
    # if the data is not stationary, difference it
    if not stationary:
        X = X.diff().dropna()
    
    # fit the VAR model
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        model = VAR(X)
        model_fit = model.fit()
    
    # make predictions
    predictions = model_fit.forecast(model_fit.y, steps=5)
    
    # compare the predictions to the actual values
    actual = X.iloc[-5:, -1]
    accuracy = (predictions[:, -1] == actual).mean()
    
    print("Accuracy for Census Block Group", cbg)
    print(accuracy)


In [17]:
# Code for ADF test for stationarity
# NOTE: I modified the ChatGPT code slightly as noted below in order to
# improve the readability of the results
import pandas as pd
import statsmodels.tsa.stattools as ts

# Load the dataset
df = pd.read_pickle("data/chi_Chat_time_series_most_crime_CBGs_06_19_updated.p")

# Use the 'Date' column as the index
df.set_index('Date', inplace=True)

# Test each column for stationarity for each unique 'Census Block Group' value
cbg_values = df['Census Block Group'].unique()
for cbg in cbg_values:
    print('\n', f"Testing Census Block Group: {cbg}") # <-- extra line feed added to ChatGPT code for output readability
    cbg_df = df[df['Census Block Group'] == cbg]
    cbg_df = cbg_df.drop(['Census Block Group'], axis=1)
    
    for col in cbg_df.columns:
        result = ts.adfuller(cbg_df[col])
        if result[1] > 0.05:
            print(f"{col} stationarity test results: {result}", '\n') # <-- extra line feed added to ChatGPT code for output readability


 Testing Census Block Group: 170313201021
Sanitation/Sewer Issue Average Time Open stationarity test results: (-2.4406747590099314, 0.1305837144959538, 6, 166, {'1%': -3.4703698981001665, '5%': -2.8791138497902193, '10%': -2.576139407751488}, 1612.094516049277) 


 Testing Census Block Group: 170318391001
Sanitation/Sewer Issue Average Time Open stationarity test results: (-2.8093354572920313, 0.05695258837999469, 10, 162, {'1%': -3.471374345647024, '5%': -2.8795521079291966, '10%': -2.5763733302850174}, 1879.8306921381936) 

Total Crime stationarity test results: (-2.274271576840883, 0.18040019461492734, 7, 165, {'1%': -3.470616369591229, '5%': -2.8792214018977655, '10%': -2.57619681359045}, 856.8581947811163) 


 Testing Census Block Group: 170313204001
Total Crime stationarity test results: (-2.7972983661127926, 0.05866071472366195, 6, 166, {'1%': -3.4703698981001665, '5%': -2.8791138497902193, '10%': -2.576139407751488}, 763.1510165308146) 


 Testing Census Block Group: 170318424

In [1]:
# Code to create differenced data
import pandas as pd

# Load the dataframe from the file
df = pd.read_pickle("data/chi_Chat_time_series_most_crime_CBGs_06_19_updated.p")

# Group the dataframe by Census Block Group
grouped = df.groupby("Census Block Group")

# Initialize a list to store the differenced dataframes
differenced_dfs = []

# Loop over each group in the grouped dataframe
for name, group in grouped:
    # Difference all columns in the group, except for the Census Block Group column
    differenced_group = group.drop("Census Block Group", axis=1).diff().dropna()
    
    # Add the Census Block Group column back to the differenced data
    differenced_group["Census Block Group"] = name
    
    # Append the differenced group to the list of differenced dataframes
    differenced_dfs.append(differenced_group)

# Concatenate the differenced dataframes into a single dataframe
result = pd.concat(differenced_dfs)

# Set the Date column as the index of the result dataframe
result.set_index("Date", inplace=True)

# Save the result dataframe to a file
result.to_pickle("data/chi_Chat_time_series_most_crime_CBGs_06_19_differenced.p")

In [2]:
result

Unnamed: 0_level_0,Sanitation/Sewer Issue Average Time Open,Street Light Repairs Average Time Open,Street Repairs Average Time Open,Total Crime,Census Block Group
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7 days,0.000000,0.000000,0.000000,0.0,170313201021
7 days,0.840787,-11.592049,-11.478551,4.0,170313201021
7 days,0.000000,0.544873,-0.311848,-1.0,170313201021
7 days,1.850093,57.286738,0.000000,-3.0,170313201021
7 days,-1.827824,-63.988868,5.632411,1.0,170313201021
...,...,...,...,...,...
7 days,-0.076076,-163.031250,-9.655851,-4.0,170318429001
7 days,7.753657,0.000000,36.596047,4.0,170318429001
7 days,0.000000,-0.848252,0.000000,1.0,170318429001
7 days,0.000000,0.000000,0.000000,0.0,170318429001


In [5]:
# Code for Granger Causality test
# NOTE: See comments below for slight changes I made to ChatGPT's code
import pandas as pd
import statsmodels.tsa.stattools as ts

# Load the dataframe from the file
df = pd.read_pickle("data/chi_Chat_time_series_most_crime_CBGs_06_19_differenced.p")

# Group the dataframe by Census Block Group
grouped = df.groupby("Census Block Group")

# Initialize a list to store the results
results = []

# Loop over each group in the grouped dataframe
for name, group in grouped:
    # Select the columns to use for the Granger causality test
    input_cols = ['Sanitation/Sewer Issue Average Time Open', 
                  'Street Light Repairs Average Time Open', 
                  'Street Repairs Average Time Open', 
                  'Total Crime']
    
    # Perform the Granger causality test for each pair of input columns
    for i in range(len(input_cols)):
        for j in range(i+1, len(input_cols)):
            x = input_cols[i]
            y = input_cols[j]
            result = ts.grangercausalitytests(group[[x, y]], maxlag=12, verbose=False) # <-- increased maxlags to 12, or 1 calendar quarter
            p_value = result[2][0]["ssr_ftest"][1]
            results.append((name, x, y, p_value))

# Convert the results to a dataframe
results_df = pd.DataFrame(results, columns=["Census Block Group", "X", "Y", "P-Value"])

# Print the results
# print(results_df) <-- SUBSTITUTED LINE BELOW FOR THIS ChatGPT LINE IN ORDER TO FOCUS ON Total Crime P-VALUES 
results_df[results_df['Y'] == 'Total Crime']

Unnamed: 0,Census Block Group,X,Y,P-Value
2,170313201021,Sanitation/Sewer Issue Average Time Open,Total Crime,0.414585
4,170313201021,Street Light Repairs Average Time Open,Total Crime,0.962305
5,170313201021,Street Repairs Average Time Open,Total Crime,0.699811
8,170313204001,Sanitation/Sewer Issue Average Time Open,Total Crime,0.441792
10,170313204001,Street Light Repairs Average Time Open,Total Crime,0.846007
11,170313204001,Street Repairs Average Time Open,Total Crime,0.98311
14,170318391001,Sanitation/Sewer Issue Average Time Open,Total Crime,0.762241
16,170318391001,Street Light Repairs Average Time Open,Total Crime,0.170808
17,170318391001,Street Repairs Average Time Open,Total Crime,0.430337
20,170318424001,Sanitation/Sewer Issue Average Time Open,Total Crime,0.661549


In [40]:
# code to create a VAR-based model, this time with more explicit instructions given to ChatGPT on
# model parameters to be used, steps to take, etc.
import pandas as pd
import statsmodels.tsa.vector_ar.var_model as vm
import numpy as np

# Load the dataframe from the file
df = pd.read_pickle("data/chi_Chat_time_series_most_crime_CBGs_06_19_differenced.p")

# Group the dataframe by Census Block Group
grouped = df.groupby("Census Block Group")

# Initialize a list to store the results
results = []

# Loop over each group in the grouped dataframe
for name, group in grouped:
    # Select the columns to use for the VAR model
    input_cols = ['Sanitation/Sewer Issue Average Time Open', 
                  'Street Light Repairs Average Time Open', 
                  'Street Repairs Average Time Open', 
                  'Total Crime']
    
    # Split the group into training and test dataframes
    train = group[:-5]
    test = group[-5:]
    
    # Train the VAR model with an order of 1
    model = vm.VAR(train[input_cols])
    result = model.fit(maxlags=1)
    
    # Use the VAR model to predict the values in the test dataframe
    prediction = result.forecast(train[input_cols].values, steps=5)
    prediction_df = pd.DataFrame(prediction, columns=input_cols)
    
    # Calculate the Mean Absolute Percentage Error (MAPE) for each feature
    mape_sewer = np.mean(np.abs((test['Sanitation/Sewer Issue Average Time Open'].values - prediction[:,0]) / test['Sanitation/Sewer Issue Average Time Open'].values)) * 100
    mape_light = np.mean(np.abs((test['Street Light Repairs Average Time Open'].values - prediction[:,1]) / test['Street Light Repairs Average Time Open'].values)) * 100
    mape_street = np.mean(np.abs((test['Street Repairs Average Time Open'].values - prediction[:,2]) / test['Street Repairs Average Time Open'].values)) * 100
    mape_crime = np.mean(np.abs((test['Total Crime'].values - prediction[:,3]) / test['Total Crime'].values)) * 100
    
    results.append((name, mape_sewer, mape_light, mape_street, mape_crime))

# Convert the results to a dataframe
results_df = pd.DataFrame(results, columns=["Census Block Group", "MAPE (Sewer)", "MAPE (Light)", "MAPE (Street)", "MAPE (Crime)"])

# Print the results
print(results_df)

  Census Block Group  MAPE (Sewer)  MAPE (Light)  MAPE (Street)  MAPE (Crime)
0       170313201021           inf           inf            inf           inf
1       170313204001           inf           inf            inf     91.063569
2       170318391001           inf           inf            inf     90.312986
3       170318424001           inf           inf            inf     94.134304
4       170318429001           inf           inf            inf           inf


