In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import xgboost as xgb
from sklearn.model_selection import train_test_split
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/

In [2]:
path1 = 'https://gen10datafund2202.blob.core.windows.net/jedscontainer/911_Calls_For_Service.csv'
path2 = 'https://opendata.arcgis.com/api/v3/datasets/4f49eb825f564efa9a23cd103c4ba13b_0/downloads/data?format=csv&spatialRefId=4326'
start = time.perf_counter()
detroit_911 = pd.read_csv(path1, thousands = ",", nrows = 100000)
print(time.perf_counter() - start)

169.20148369999998


## Clean the Data

In [3]:
# Make a copy of the dataframe so that I don't have to redownload it every time I want to refresh.
detroit_df = detroit_911.copy()

# Count all the null values in the columns
null_values = detroit_911.isnull().sum()
#print('Null values prior to dropping columns.')
#print(null_values)

# Drop columns that will not be used in algorithm
detroit_df.drop(
    columns = [
        'incident_id',
        'zip_code',
        'oid',
        'respondingunit',
        'agency',
        'X',
        'Y',
        'longitude',
        'latitude',
        'incident_address',
        'block_id',
        'category'
    ], inplace = True)

#print('\nNull values after dropping columns')
#print(detroit_df.isnull().sum())


mapped_priorities = detroit_df.groupby(by = ['priority'])['calldescription'].apply(lambda x: ', '.join(x)).reset_index()
mapped_priorities['calldescription'] = mapped_priorities['calldescription'].apply(lambda x: ", ".join(_.strip() for _ in set(x.split(","))))

### Fix Missing Priorities

In [4]:
priority_dict = {}
for priority in mapped_priorities['priority'].unique():
    priority_dict[priority] = mapped_priorities[(mapped_priorities.priority == priority)]['calldescription'].values.tolist()[0]

del priority_dict[" "]

def get_priority(current_priority, call_descrip):
    if current_priority != " ":
        return current_priority
    for key in priority_dict.keys():
        if call_descrip in priority_dict[key]:
            return key
        
    return 'FAIL'

detroit_df['new_priority'] = detroit_df.apply(lambda row: get_priority(row.priority, row.calldescription), axis = 1)

detroit_df.drop(columns = ['priority'], inplace = True)
detroit_df.rename(columns = {'new_priority': 'priority'}, inplace = True)


In [5]:
# for column in detroit_df.columns:
#     print(f'{column}\nNumber of Values: {len(detroit_df[column].unique())}\n')

In [6]:
detroit_df.drop(columns = ['totaltime','totalresponsetime'], inplace = True)


### Convert timestamp to weekday

In [7]:
def return_weekday(time_stamp):
    time = dt.datetime.strptime(time_stamp, '%Y/%m/%d %H:%M:%S+00').date()
    return dt.datetime.strftime(time, '%A')
    
detroit_df['weekday'] = detroit_df['call_timestamp'].apply(lambda x: return_weekday(x))

In [8]:
detroit_df.drop(columns = ['callcode','calldescription','call_timestamp'], inplace = True)
detroit_df.drop(columns = ['precinct_sca','neighborhood'], inplace = True)
detroit_df.dropna(subset = ['time_on_scene'], inplace = True)

In [10]:
detroit_df.head()

Unnamed: 0,officerinitiated,intaketime,dispatchtime,traveltime,time_on_scene,council_district,priority,weekday
0,No,2.2,528.6,15.5,120.5,4.0,3,Tuesday
1,No,2.7,5.2,6.4,59.1,2.0,2,Tuesday
3,No,1.4,3.4,7.6,41.7,7.0,3,Tuesday
5,No,3.5,9.9,5.5,6.4,5.0,2,Tuesday
6,No,1.0,152.8,5.2,42.4,2.0,2,Tuesday


In [11]:
detroit_dummied = pd.get_dummies(detroit_df, drop_first = True)
detroit_dummied.shape

(84931, 18)

In [12]:
subset = detroit_dummied.head(1000)
subset.shape

(1000, 18)

In [35]:
print(len(subset['time_on_scene'].unique()))

569


In [17]:
## converting to numpy array
subset_np_target = subset[['time_on_scene']].to_numpy()
subset_np_data = subset.drop(columns = ['time_on_scene']).to_numpy()

In [19]:
X = subset_np_data
y = subset_np_target

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [21]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [44]:
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    #'objective': 'multi:softmax',  # error evaluation for multiclass training
    #'num_class': 569  # the number of classes that exist in this datset
}

In [45]:
bst = xgb.train(param, dtrain, 20)

Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [46]:
preds = bst.predict(dtest)
preds

array([ 25.208767,  46.47731 ,  31.656036,  42.800297,  43.659897,
        26.321358,  28.570234,  45.215878,  32.818287,  44.93826 ,
        34.76383 ,  31.656036,  55.503864,  36.140423,  24.361734,
        38.636513,  28.983158,  36.692875,  53.906277,  32.87214 ,
        37.288097,  33.527187,  42.949345,  40.23509 ,  44.098785,
        54.26494 ,  21.575718,  23.090122,  62.063953,  44.245724,
        31.58895 ,  43.659897,  45.44834 ,  37.288097,  37.288097,
        58.23614 ,  31.518864,  42.708286,  42.63111 ,  31.679808,
        54.26494 ,  28.825096,  61.67812 ,  29.281134,  46.710503,
        53.569233,  51.082302,  32.34802 ,  49.601707,  35.631653,
        32.088028,  43.659897,  47.128265,  42.042915, 141.98466 ,
        38.718666,  21.261738,  29.83158 ,  47.128265,  40.636   ,
        31.656036,  41.471157,  48.031086,   8.053561,  86.47774 ,
        36.194466,  52.547436,  34.20521 ,  56.360683,  20.536179,
        44.24915 ,  22.252169,  44.098785,  45.41593 ,  41.520

In [49]:
prediction_frame = pd.DataFrame(y_test, columns = ['ground_truth'])

In [52]:
prediction_frame['prediction'] = preds

In [54]:
prediction_frame

Unnamed: 0,ground_truth,prediction
0,18.9,25.208767
1,158.5,46.477310
2,1.3,31.656036
3,55.0,42.800297
4,30.7,43.659897
...,...,...
195,34.5,42.476906
196,4.1,40.235088
197,5.4,46.477310
198,40.6,41.520542
