In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
# Load the cleaned shooting-incident data set and preview its first rows.
DATA_PATH = 'DSW_Project_Shooting_Cleaned.csv'
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,INCIDENT_KEY,NUM_VIC,OCCUR_DATE,OCCUR_TIME,BORO,LOC_OF_OCCUR_DESC,PRECINCT,JURISDICTION_CODE,LOC_CLASSFCTN_DESC,STATISTICAL_MURDER_FLAG,...,VIC_SEX,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,Borough Boundaries,City Council Districts,Police Precincts,Zip Codes,Community Districts
0,238531159.0,1,01/01/2022,01:12:00,1,1,34.0,0.0,2,N,...,1,1003170.0,248389.0,40.848427,-73.931611,4.0,39.0,22.0,13091.0,47.0
1,238531160.0,1,01/01/2022,05:20:00,1,1,10.0,0.0,2,N,...,1,985059.0,215225.0,40.757421,-73.997079,4.0,10.0,6.0,12080.0,12.0
2,238531161.0,1,01/01/2022,06:59:00,1,1,34.0,0.0,4,N,...,1,1008453.0,257174.0,40.872526,-73.912484,4.0,39.0,22.0,13092.0,47.0
3,238532487.0,1,01/01/2022,05:45:00,1,1,23.0,2.0,1,N,...,1,1001746.0,228590.0,40.794086,-73.936809,4.0,35.0,14.0,12426.0,7.0
4,238533195.0,1,01/01/2022,06:15:00,1,1,25.0,0.0,3,N,...,1,1000581.0,231070.0,40.800896,-73.941011,4.0,36.0,16.0,13093.0,7.0


In [3]:
# Select the features used to predict NUM_VIC (victims per shooting incident)
# and turn every categorical code column into one-hot indicator columns.

selected_cols = ["OCCUR_TIME", "BORO", "LOC_OF_OCCUR_DESC", "PERP_AGE_GROUP", "PERP_SEX", "NUM_VIC"]
num_vic = df['NUM_VIC']  # standalone copy of the target; not referenced by the cells visible here

# Code value whose indicator column is discarded after encoding PERP_SEX and
# PERP_AGE_GROUP.
# NOTE(review): presumably the placeholder for missing/unknown perpetrator
# data -- confirm against the cleaning step that produced the CSV.
PLACEHOLDER_CODE = -1

# Integer code -> readable label, one mapping per categorical column.
BORO_LABELS = {1: "MANHATTAN", 2: "BRONX", 3: "BROOKLYN", 4: "QUEENS", 5: "STATEN ISLAND"}
LOC_LABELS = {1: "OUTSIDE", 2: "INSIDE"}
SEX_LABELS = {1: "MALE", 2: "FEMALE"}
AGE_LABELS = {1: "<18", 2: "18-24", 3: "25-44", 4: "45-64", 5: "65+"}


def one_hot_encode_column(frame, column, labels, drop_placeholder=False):
    """Replace `column` in `frame` with one indicator column per category.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the coded categorical column to encode.
    labels : dict
        Mapping from raw code to the label used as the indicator column name.
        Codes missing from the mapping keep their raw value as column name.
    drop_placeholder : bool, default False
        When True, every column labelled ``PLACEHOLDER_CODE`` is removed from
        the result. A missing placeholder column is not an error.

    Returns
    -------
    pandas.DataFrame
        A new frame: the remaining original columns followed by the
        indicator columns.
    """
    indicators = pd.get_dummies(frame[column].replace(labels))
    encoded = pd.concat([frame.drop(columns=column), indicators], axis=1)
    if drop_placeholder:
        # errors="ignore": a data set without the placeholder code must not
        # crash the cell with a KeyError.
        encoded = encoded.drop(columns=PLACEHOLDER_CODE, errors="ignore")
    return encoded


# Explicit copy so the assignments below never write into a view of `df`.
selected_df = df.loc[:, selected_cols].copy()

# Express the time of day as seconds since midnight.
selected_df['OCCUR_TIME'] = pd.to_timedelta(selected_df['OCCUR_TIME']).dt.total_seconds()

# Encode in the same order as before so the resulting column order is unchanged.
selected_df = one_hot_encode_column(selected_df, "BORO", BORO_LABELS)
selected_df = one_hot_encode_column(selected_df, "LOC_OF_OCCUR_DESC", LOC_LABELS)
selected_df = one_hot_encode_column(selected_df, "PERP_SEX", SEX_LABELS, drop_placeholder=True)
selected_df = one_hot_encode_column(selected_df, "PERP_AGE_GROUP", AGE_LABELS, drop_placeholder=True)

selected_df

Unnamed: 0,OCCUR_TIME,NUM_VIC,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN ISLAND,INSIDE,OUTSIDE,FEMALE,MALE,18-24,25-44,45-64,65+,<18
0,4320.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0
1,19200.0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0
2,25140.0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0
3,20700.0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0
4,22500.0,1,0,0,1,0,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288,75300.0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0
1289,59160.0,1,1,0,0,0,0,0,1,0,1,1,0,0,0,0
1290,61200.0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0
1291,78840.0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [4]:
# Split the data into training, validation and test sets.
# A fixed seed makes both shuffles reproducible, so a fresh
# "Restart Kernel -> Run All" yields the same partitions and metrics.
RANDOM_STATE = 42

# Hold out 30% of all rows as the final test set.
train_validation, test = train_test_split(
    selected_df, test_size=0.3, random_state=RANDOM_STATE
)

# Of the remaining rows, 20% become the validation set and the rest is used for fitting.
train, validation = train_test_split(
    train_validation, test_size=0.2, random_state=RANDOM_STATE
)

train

Unnamed: 0,OCCUR_TIME,NUM_VIC,BRONX,BROOKLYN,MANHATTAN,QUEENS,STATEN ISLAND,INSIDE,OUTSIDE,FEMALE,MALE,18-24,25-44,45-64,65+,<18
797,81600.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
760,1140.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
799,16440.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
207,16680.0,1,0,0,1,0,0,0,1,0,1,0,1,0,0,0
1021,12360.0,4,0,1,0,0,0,0,1,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
895,7080.0,1,0,1,0,0,0,0,1,0,1,0,1,0,0,0
184,60600.0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
913,120.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0
45,4020.0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0


In [5]:
def _regression_scores(actual, predicted):
    """Return the (MSE, MAE, R^2) triple for one set of predictions."""
    return (
        mean_squared_error(actual, predicted),
        mean_absolute_error(actual, predicted),
        r2_score(actual, predicted),
    )


# Separate the NUM_VIC target from the feature columns in each partition.
x_train, y_train = train.drop(columns="NUM_VIC"), train["NUM_VIC"]
x_validation, y_validation = validation.drop(columns="NUM_VIC"), validation["NUM_VIC"]

# Fit the linear model on the training partition only.
lr = LinearRegression()
lr.fit(x_train, y_train)

# Predict on both partitions so training and validation error can be compared.
y_pred_validation = lr.predict(x_validation)
y_pred_train = lr.predict(x_train)

mse_validation, mae_validation, r2_validation = _regression_scores(y_validation, y_pred_validation)
mse_train, mae_train, r2_train = _regression_scores(y_train, y_pred_train)

print("validation: ", mse_validation, mae_validation, r2_validation)
print("train: ", mse_train, mae_train, r2_train)

validation:  0.6898532030598694 0.5225132468634937 -0.046018734862740995
train:  0.8308863945115882 0.5287270934528447 0.03350278560252273


In [6]:
# The scores above suggest the model underfits, which is expected for a plain linear regression.
# Next, evaluate the already-fitted model on the held-out test partition (no refitting happens here).

x_test = test.drop(columns="NUM_VIC")
y_test = test["NUM_VIC"]

y_pred_test = lr.predict(x_test)

# Same three metrics as for the train/validation partitions: MSE, MAE, R^2.
mse_test, mae_test, r2_test = (
    metric(y_test, y_pred_test)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(mse_test, mae_test, r2_test)

0.7577089958746163 0.49668861961245847 -0.03638376830706003


In [7]:
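# Underfitting persists on the test set: R^2 is again slightly negative (about -0.04, similar to validation),
# so the model does no better than always predicting the mean NUM_VIC.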