### Forecasting the Number of Future Service Requests

As requests vary seasonally, we plan to analyze past data on the frequency and types of requests in previous years or months to identify recurring patterns or anomalies. These forecasts will enable municipalities and service departments to anticipate periods of higher demand and plan accordingly. The outcome variable used would be the number of future service requests in a specific time frame, categorised by request type and origin

### Import Libraries

In [275]:
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline 
import matplotlib.pyplot as plt
import plotly.express as px
import sklearn
import statsmodels
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [276]:
clean_data = pd.read_csv("../Data/CleanedData_311_Dataset.csv")
clean_data.shape

(365567, 16)

In [277]:
clean_data.columns

Index(['num_requests', 'parent_closed', 'status_name', 'status_code', 'dept',
       'request_type_name', 'create_date_et', 'last_action_et',
       'closed_date_et', 'origin', 'city', 'neighborhood', 'census_tract',
       'council_district', 'ward', 'police_zone'],
      dtype='object')

In [278]:
clean_data.head()

Unnamed: 0,num_requests,parent_closed,status_name,status_code,dept,request_type_name,create_date_et,last_action_et,closed_date_et,origin,city,neighborhood,census_tract,council_district,ward,police_zone
0,1,f,open,0,Police - Zones 1-6,Excessive Noise/Disturbances,2022-04-24T09:56:00,2022-04-24T09:56:00,,Website,Pittsburgh,Central Business District,42003020000.0,6.0,2.0,2.0
1,1,f,in progress,3,DPW - Park Maintenance,Field,2021-10-31T09:13:00,2021-11-30T09:21:00,,Website,Pittsburgh,Highland Park,42003980000.0,7.0,11.0,5.0
2,1,f,open,0,Police - AVU,Abandoned Vehicle (parked on street),2022-04-01T07:33:00,2022-04-01T07:33:00,,Website,Pittsburgh,Highland Park,42003980000.0,7.0,11.0,5.0
3,1,f,open,0,Police - Zones 1-6,Excessive Noise/Disturbances,2022-06-05T14:43:00,2022-06-05T14:43:00,,Website,Pittsburgh,Highland Park,42003980000.0,7.0,11.0,5.0
4,1,f,open,0,DOMI - Traffic,"Pavement Marking, New",2021-09-30T11:42:00,2021-09-30T11:42:00,,Website,Pittsburgh,Perry South,42003260000.0,6.0,26.0,1.0


In [279]:
# Convert date columns to DateTime format
clean_data['create_date_et'] = pd.to_datetime(clean_data['create_date_et'])
clean_data['last_action_et'] = pd.to_datetime(clean_data['last_action_et'])
clean_data['closed_date_et'] = pd.to_datetime(clean_data['closed_date_et'])

### Creating Season Column

Winter: December, January, February 
Spring: March, April, May 
Summer: June, July, August
Fall: September, October, November

In [280]:
# Extract the month from 'create_date_et'
clean_data['month'] = clean_data['create_date_et'].dt.month

# Function to categorize the month into seasons
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'

# Create the 'season' column based on the month
clean_data['season'] = clean_data['month'].apply(get_season)

clean_data['season'].value_counts()

season
Summer    110106
Spring     88781
Winter     87522
Fall       79158
Name: count, dtype: int64

#### Data Aggregation (by Seasons and Neighborbood)

In [281]:
# Group by month and request type to get the count of requests
season_requests_req = clean_data.groupby(
    ['season', 'neighborhood', 'request_type_name', 'origin']
).size().reset_index(name='request_count')
season_requests_req.head()

Unnamed: 0,season,neighborhood,request_type_name,origin,request_count
0,Fall,Allegheny Center,Abandoned Vehicle (parked on street),Call Center,6
1,Fall,Allegheny Center,Abandoned Vehicle (parked on street),Report2Gov Android,1
2,Fall,Allegheny Center,Abandoned Vehicle (parked on street),Website,6
3,Fall,Allegheny Center,Americans with Disabilities,Website,1
4,Fall,Allegheny Center,Bicycle Parking,Website,1


In [282]:
season_model_neighborhood = pd.get_dummies(season_requests_req, 
                          columns=['season', 'neighborhood', 'request_type_name', 'origin'], 
                          drop_first=True, 
                          sparse=True)

In [283]:
season_model_neighborhood.head()

Unnamed: 0,request_count,season_Spring,season_Summer,season_Winter,neighborhood_Allegheny West,neighborhood_Allentown,neighborhood_Arlington,neighborhood_Arlington Heights,neighborhood_Banksville,neighborhood_Bedford Dwellings,...,request_type_name_Wires,request_type_name_Zoning Issue,origin_Control Panel,origin_Email,origin_Report2Gov Android,origin_Report2Gov Website,origin_Report2Gov iOS,origin_Text Message,origin_Twitter,origin_Website
0,6,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
2,6,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,1,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


### Modeling

In [284]:
from sklearn.model_selection import train_test_split

X = season_model_neighborhood.drop(columns=['request_count'])
y = season_model_neighborhood['request_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [285]:
# Baseline Model (Mean of y_train)
y_baseline = np.full_like(y_train, y_train.mean())

# RMSE for Baseline Model
baseline_rmse_season_neighborhood = mean_squared_error(y_train, y_baseline)
baseline_rmse_season_neighborhood = np.sqrt(baseline_rmse_season_neighborhood)
print(f"Baseline RMSE (Mean Model): {baseline_rmse_season_neighborhood}")

Baseline RMSE (Mean Model): 18.435248766249266


In [286]:
# Linear Regression Model
from sklearn.linear_model import LinearRegression

pipeline = make_pipeline(LinearRegression())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_linear_neighborhood_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_linear_neighborhood_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_linear_neighborhood_season.mean()}")

RMSE scores for each fold: [13.89085904 14.56128965 19.06429793 17.79502479 20.10367822]
Mean RMSE: 17.08302992492346


In [287]:
# Decision Tree Model
from sklearn.tree import DecisionTreeRegressor

pipeline = make_pipeline(DecisionTreeRegressor())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_dtree_neighborhood_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_dtree_neighborhood_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_dtree_neighborhood_season.mean()}")

RMSE scores for each fold: [32.06125145 27.24904693 25.12550969 31.08499002 27.56311498]
Mean RMSE: 28.61678261447882


In [288]:
# XGboost
from xgboost import XGBRegressor

pipeline = make_pipeline(XGBRegressor())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_xgboost_neighborhood_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_xgboost_neighborhood_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_xgboost_neighborhood_season.mean()}")



RMSE scores for each fold: [15.65020769 16.00345479 16.58520334 21.15487094 19.97690088]
Mean RMSE: 17.87412752692932




#### Data Aggregation (by Seasons and Department)

In [289]:
# Group by month and request type to get the count of requests
season_requests_req = clean_data.groupby(
    ['season', 'dept', 'request_type_name', 'origin']
).size().reset_index(name='request_count')
season_requests_req.head()

Unnamed: 0,season,dept,request_type_name,origin,request_count
0,Fall,311,Abandoned Vehicle (parked on street),Call Center,1
1,Fall,311,Accessible Parking Application,Call Center,52
2,Fall,311,Accessible Parking Application,Control Panel,3
3,Fall,311,Accessible Parking Application,Website,1
4,Fall,311,Building Maintenance,Call Center,1


In [290]:
season_model_dept = pd.get_dummies(season_requests_req, 
                          columns=['season', 'dept', 'request_type_name', 'origin'], 
                          drop_first=True, 
                          sparse=True)

### Modeling

In [291]:
X = season_model_dept.drop(columns=['request_count'])
y = season_model_dept['request_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [292]:
# Baseline Model (Mean of y_train)
y_baseline = np.full_like(y_train, y_train.mean())

# RMSE for Baseline Model
baseline_rmse_season_dept = mean_squared_error(y_train, y_baseline)
baseline_rmse_season_dept = np.sqrt(baseline_rmse_season_dept)
print(f"Baseline RMSE (Mean Model): {baseline_rmse_season_dept}")

Baseline RMSE (Mean Model): 325.1954446255484


In [293]:
# Linear Regression Model

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

pipeline = make_pipeline(LinearRegression())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_linear_dept_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_linear_dept_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_linear_dept_season.mean()}")

RMSE scores for each fold: [459.63598214 369.67049627 218.33335407 213.74372611 187.2153223 ]
Mean RMSE: 289.71977617908425


In [294]:
# Decision Tree Model
from sklearn.tree import DecisionTreeRegressor

pipeline = make_pipeline(DecisionTreeRegressor())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_dtree_dept_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_dtree_dept_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_dtree_dept_season.mean()}")

RMSE scores for each fold: [840.52511739 471.89919732 673.74937611 562.38622988 580.46428112]
Mean RMSE: 625.804840363448


In [295]:
# XGboost
from xgboost import XGBRegressor

pipeline = make_pipeline(XGBRegressor())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_xgboost_dept_season = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_xgboost_dept_season}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_xgboost_dept_season.mean()}")



RMSE scores for each fold: [630.40250039 326.25234673 589.58841576 477.1219544  395.7005852 ]
Mean RMSE: 483.8131604978531


#### Data Aggregation (by Month and Neighborbood)

In [296]:
# Group by month and request type to get the count of requests
month_requests_req = clean_data.groupby(
    ['month', 'neighborhood', 'request_type_name', 'origin']
).size().reset_index(name='request_count')
month_requests_req.head()

Unnamed: 0,month,neighborhood,request_type_name,origin,request_count
0,1,Allegheny Center,Angle Iron,Website,1
1,1,Allegheny Center,Bicycle/Pedestrian Concerns,Call Center,1
2,1,Allegheny Center,Blocked or Closed Sidewalks,Email,1
3,1,Allegheny Center,Blocked or Closed Trails,Email,1
4,1,Allegheny Center,Blue Bin Containers,Call Center,2


In [297]:
month_model_neighborhood = pd.get_dummies(month_requests_req, 
                          columns=['month', 'neighborhood', 'request_type_name', 'origin'], 
                          drop_first=True, 
                          sparse=True)

#### Modelling

In [298]:
X = month_model_neighborhood.drop(columns=['request_count'])
y = month_model_neighborhood['request_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [299]:
# Baseline Model (Mean of y_train)
y_baseline = np.full_like(y_train, y_train.mean())

# RMSE for Baseline Model
baseline_rmse_month_neighborhood = mean_squared_error(y_train, y_baseline)
baseline_rmse_month_neighborhood = np.sqrt(baseline_rmse_month_neighborhood)
print(f"Baseline RMSE (Mean Model): {baseline_rmse_month_neighborhood}")

Baseline RMSE (Mean Model): 8.927085949382109


In [300]:
# Linear Regression Model
pipeline = make_pipeline(LinearRegression())

# Perform 5 fold cross-validation 
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_linear_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_linear_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_linear_dept_month.mean()}")

RMSE scores for each fold: [8.08201986 9.60648654 8.34110057 8.04002238 7.43722379]
Mean RMSE: 8.301370627997565


In [301]:
# Decision Tree 
pipeline = make_pipeline(DecisionTreeRegressor())

# Perform 5 fold cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_dtree_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_dtree_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_dtree_dept_month.mean()}")

RMSE scores for each fold: [11.17159931 13.81579173 13.19050409 12.71515607 14.82313401]
Mean RMSE: 13.143237041095645


In [302]:
# XGboost
pipeline = make_pipeline(XGBRegressor())

# Perform 5 fold cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_xgboost_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_xgboost_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_xgboost_dept_month.mean()}")



RMSE scores for each fold: [6.88848249 8.27206522 7.03012252 6.54248133 6.9078914 ]
Mean RMSE: 7.128208590778025


#### Data Aggregation (by Month and Department)

In [303]:
# Group by month and request type to get the count of requests
month_requests_req = clean_data.groupby(
    ['month', 'dept', 'request_type_name', 'origin']
).size().reset_index(name='request_count')
month_requests_req.head()

Unnamed: 0,month,dept,request_type_name,origin,request_count
0,1,311,Abandoned Vehicle (parked on street),Call Center,1
1,1,311,Accessible Parking Application,Call Center,13
2,1,311,Building Maintenance,Website,1
3,1,311,City Cuts Concern,Call Center,1
4,1,311,City Cuts Concern,Website,2


In [304]:
month_model_dept = pd.get_dummies(month_requests_req, 
                          columns=['month', 'dept', 'request_type_name', 'origin'], 
                          drop_first=True, 
                          sparse=True)

#### Modeling

In [305]:
X = month_model_dept.drop(columns=['request_count'])
y = month_model_dept['request_count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [306]:
# Baseline Model (Mean of y_train)
y_baseline = np.full_like(y_train, y_train.mean())

# RMSE for Baseline Model
baseline_rmse_month_dept = mean_squared_error(y_train, y_baseline)
baseline_rmse_month_dept = np.sqrt(baseline_rmse_month_dept)
print(f"Baseline RMSE (Mean Model): {baseline_rmse_month_dept}")

Baseline RMSE (Mean Model): 121.70518945442578


In [307]:
# Linear Regression Model
pipeline = make_pipeline(LinearRegression())

# Perform 5 fold cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_linear_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_linear_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_linear_dept_month.mean()}")

RMSE scores for each fold: [134.68900869 107.59907001  79.11730687 103.86682553 120.51602487]
Mean RMSE: 109.15764719355548


In [308]:
# Decision Tree Model
pipeline = make_pipeline(DecisionTreeRegressor())

# Perform 5 fold cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_dtree_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_dtree_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_dtree_dept_month.mean()}")

RMSE scores for each fold: [226.975152   220.18418373 217.72293361 211.13726116 187.37564512]
Mean RMSE: 212.67903512489437


In [309]:
# XGboost
pipeline = make_pipeline(XGBRegressor())

# Perform 5 fold cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

rmse_scores_xgboost_dept_month = np.sqrt(-scores)

# Print RMSE for each fold
print(f"RMSE scores for each fold: {rmse_scores_xgboost_dept_month}")
# Print mean RMSE
print(f"Mean RMSE: {rmse_scores_xgboost_dept_month.mean()}")



RMSE scores for each fold: [125.39185066 132.35156042 100.38946619 125.88826581 119.07122671]
Mean RMSE: 120.61847395906086




### Consolidating RMSE

In [310]:
# Comparing RMSE of different models
rmse_scores = {
    'Linear Neighborhood Season': rmse_scores_linear_neighborhood_season,
    'Decision Tree Neighborhood Season': rmse_scores_dtree_neighborhood_season,
    'XGBoost Neighborhood Season': rmse_scores_xgboost_neighborhood_season,
    'Linear Department Season': rmse_scores_linear_dept_season,
    'Decision Tree Department Season': rmse_scores_dtree_dept_season,
    'XGBoost Department Season': rmse_scores_xgboost_dept_season,
    'Linear Neighborhood Month': rmse_scores_linear_dept_month,
    'Decision Tree Neighborhood Month': rmse_scores_dtree_dept_month,
    'XGBoost Neighborhood Month': rmse_scores_xgboost_dept_month,
    'Linear Department Month': rmse_scores_linear_dept_month,
    'Decision Tree Department Month': rmse_scores_dtree_dept_month,
    'XGBoost Department Month': rmse_scores_xgboost_dept_month
}
# Convert to DataFrame for better visualization
rmse_df = pd.DataFrame(rmse_scores)
# Calculate mean RMSE for each model
mean_rmse = {key: np.mean(value) for key, value in rmse_scores.items()}
# Create a DataFrame for better visualization
rmse_summary = pd.DataFrame([
    {'Grouping': 'Season', 'Type': 'Neighborhood', 'Model': 'Linear', 'Mean RMSE': mean_rmse['Linear Neighborhood Season'], 'Better than Baseline': mean_rmse['Linear Neighborhood Season'] < baseline_rmse_season_neighborhood},
    {'Grouping': 'Season', 'Type': 'Neighborhood', 'Model': 'Decision Tree', 'Mean RMSE': mean_rmse['Decision Tree Neighborhood Season'], 'Better than Baseline': mean_rmse['Decision Tree Neighborhood Season'] < baseline_rmse_season_neighborhood},
    {'Grouping': 'Season', 'Type': 'Neighborhood', 'Model': 'XGBoost', 'Mean RMSE': mean_rmse['XGBoost Neighborhood Season'], 'Better than Baseline': mean_rmse['XGBoost Neighborhood Season'] < baseline_rmse_season_neighborhood},
    {'Grouping': 'Season', 'Type': 'Department', 'Model': 'Linear', 'Mean RMSE': mean_rmse['Linear Department Season'], 'Better than Baseline': mean_rmse['Linear Department Season'] < baseline_rmse_season_dept},
    {'Grouping': 'Season', 'Type': 'Department', 'Model': 'Decision Tree', 'Mean RMSE': mean_rmse['Decision Tree Department Season'], 'Better than Baseline': mean_rmse['Decision Tree Department Season'] < baseline_rmse_season_dept},
    {'Grouping': 'Season', 'Type': 'Department', 'Model': 'XGBoost', 'Mean RMSE': mean_rmse['XGBoost Department Season'], 'Better than Baseline': mean_rmse['XGBoost Department Season'] < baseline_rmse_season_dept},
    {'Grouping': 'Month', 'Type': 'Neighborhood', 'Model': 'Linear', 'Mean RMSE': mean_rmse['Linear Neighborhood Month'], 'Better than Baseline': mean_rmse['Linear Neighborhood Month'] < baseline_rmse_month_neighborhood},
    {'Grouping': 'Month', 'Type': 'Neighborhood', 'Model': 'Decision Tree', 'Mean RMSE': mean_rmse['Decision Tree Neighborhood Month'], 'Better than Baseline': mean_rmse['Decision Tree Neighborhood Month'] < baseline_rmse_month_neighborhood},
    {'Grouping': 'Month', 'Type': 'Neighborhood', 'Model': 'XGBoost', 'Mean RMSE': mean_rmse['XGBoost Neighborhood Month'], 'Better than Baseline': mean_rmse['XGBoost Neighborhood Month'] < baseline_rmse_month_neighborhood},
    {'Grouping': 'Month', 'Type': 'Department', 'Model': 'Linear', 'Mean RMSE': mean_rmse['Linear Department Month'], 'Better than Baseline': mean_rmse['Linear Department Month'] < baseline_rmse_month_dept},
    {'Grouping': 'Month', 'Type': 'Department', 'Model': 'Decision Tree', 'Mean RMSE': mean_rmse['Decision Tree Department Month'], 'Better than Baseline': mean_rmse['Decision Tree Department Month'] < baseline_rmse_month_dept},
    {'Grouping': 'Month', 'Type': 'Department', 'Model': 'XGBoost', 'Mean RMSE': mean_rmse['XGBoost Department Month'], 'Better than Baseline': mean_rmse['XGBoost Department Month'] < baseline_rmse_month_dept}
])

rmse_summary

Unnamed: 0,Grouping,Type,Model,Mean RMSE,Better than Baseline
0,Season,Neighborhood,Linear,17.08303,True
1,Season,Neighborhood,Decision Tree,28.616783,False
2,Season,Neighborhood,XGBoost,17.874128,True
3,Season,Department,Linear,289.719776,True
4,Season,Department,Decision Tree,625.80484,False
5,Season,Department,XGBoost,483.81316,False
6,Month,Neighborhood,Linear,109.157647,False
7,Month,Neighborhood,Decision Tree,212.679035,False
8,Month,Neighborhood,XGBoost,120.618474,False
9,Month,Department,Linear,109.157647,True
