In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

## Load San Francisco data

In [2]:
import pandas as pd
import pyarrow.parquet as pq

# Load the San Francisco safety-event records from the parquet file, using the
# pyarrow engine explicitly.
df_san_francisco = pd.read_parquet("safety-SanFrancisco-2.parquet", engine="pyarrow")

<IPython.core.display.Javascript object>

In [3]:
df_san_francisco.head()

Unnamed: 0,requestId,dataSubtype,dateTime,category,subcategory,status,address,latitude,longitude,source,Date,year,weekday,month,hour,month_name,neighborhood
0,1,911_Fire,2018-10-02 11:54:01,Potentially Life-Threatening,Medical Incident,,ELM ST/FRANKLIN ST,37.781286,-122.422187,,2018-10-02,2018,1,10,11,October,Tenderloin
1,2,311_All,2018-07-08 15:00:27,Street and Sidewalk Cleaning,Bulky Items,Closed,"1536 SACRAMENTO ST, SAN FRANCISCO, CA, 94109",37.791887,-122.418188,Mobile/Open311,2018-07-08,2018,6,7,15,July,Nob Hill
2,3,311_All,2016-06-28 13:12:28,General Request - COUNTY CLERK,customer_callback,Closed,Not associated with a specific address,0.0,0.0,Phone,2016-06-28,2016,1,6,13,June,
3,7,311_All,2017-03-03 09:34:49,Temporary Sign Request,Temporary Sign Request for Other_Event_Type,Closed,"2190 NORTH POINT ST, SAN FRANCISCO, CA, 94123",37.802853,-122.443245,Phone,2017-03-03,2017,4,3,9,March,Marina
4,10,911_Fire,2016-10-25 18:33:20,Potentially Life-Threatening,Medical Incident,,200 Block of DALEWOOD WAY,37.737953,-122.456498,,2016-10-25,2016,1,10,18,October,Portola


<IPython.core.display.Javascript object>

In [21]:
df_san_francisco.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3221561 entries, 0 to 3958888
Data columns (total 17 columns):
 #   Column        Dtype         
---  ------        -----         
 0   requestId     int64         
 1   dataSubtype   category      
 2   dateTime      datetime64[ns]
 3   category      category      
 4   subcategory   category      
 5   status        category      
 6   address       object        
 7   latitude      float64       
 8   longitude     float64       
 9   source        category      
 10  Date          datetime64[ns]
 11  year          int64         
 12  weekday       int64         
 13  month         int64         
 14  hour          int64         
 15  month_name    object        
 16  neighborhood  object        
dtypes: category(5), datetime64[ns](2), float64(2), int64(5), object(3)
memory usage: 338.0+ MB


<IPython.core.display.Javascript object>

## Feature Engineering

### We will train a model on the number of safety events in the “Potentially Life-Threatening” category, per neighborhood and per 6-hour interval. Only the 5 neighborhoods with the highest number of events are considered.

In [6]:
# Count "Potentially Life-Threatening" events per neighborhood and show the
# names of the five neighborhoods with the most events.
life_threatening_mask = df_san_francisco["category"] == "Potentially Life-Threatening"
neighborhood_counts = df_san_francisco.loc[
    life_threatening_mask, "neighborhood"
].value_counts()
neighborhood_counts.nlargest(5).index

Index(['Tenderloin', 'SoMa', 'Nob Hill', 'Mission', 'Bayview'], dtype='object')

<IPython.core.display.Javascript object>

In [35]:
# Keep only the rows whose category is "Potentially Life-Threatening".
df_sf_pt_life_threatening = df_san_francisco.loc[
    df_san_francisco["category"] == "Potentially Life-Threatening"
]

<IPython.core.display.Javascript object>

In [36]:
df_sf_pt_life_threatening["category"].unique()

['Potentially Life-Threatening']
Categories (1, object): ['Potentially Life-Threatening']

<IPython.core.display.Javascript object>

In [37]:
df_sf_pt_life_threatening["year"].unique()

array([2018, 2016, 2019, 2017, 2020], dtype=int64)

<IPython.core.display.Javascript object>

In [38]:
# Restrict the life-threatening events to the five neighborhoods that have the
# largest number of such events.
top_neighborhoods = (
    df_sf_pt_life_threatening["neighborhood"].value_counts().nlargest(5).index
)
in_top_neighborhood = df_sf_pt_life_threatening["neighborhood"].isin(top_neighborhoods)
df_new = df_sf_pt_life_threatening[in_top_neighborhood]

<IPython.core.display.Javascript object>

In [39]:
df_new.neighborhood.unique()

array(['Tenderloin', 'Mission', 'SoMa', 'Bayview', 'Nob Hill'],
      dtype=object)

<IPython.core.display.Javascript object>

In [40]:
# Aggregate the events into one row per (6-hour window, neighborhood) with the
# number of requests in that window stored in a "count" column.
# NOTE(review): (window, neighborhood) pairs without any event appear to be
# absent from the result rather than present with count 0 - confirm whether
# zero-count rows should be added before training.
six_hour_window = pd.Grouper(key="dateTime", freq="6H")
events_per_window = df_new.groupby([six_hour_window, "neighborhood"])[
    "requestId"
].count()
df_new = events_per_window.reset_index(name="count")

<IPython.core.display.Javascript object>

In [41]:
df_new.head(10)

Unnamed: 0,dateTime,neighborhood,count
0,2016-01-01 00:00:00,Bayview,2
1,2016-01-01 00:00:00,Mission,10
2,2016-01-01 00:00:00,Nob Hill,6
3,2016-01-01 00:00:00,SoMa,13
4,2016-01-01 00:00:00,Tenderloin,15
5,2016-01-01 06:00:00,Bayview,1
6,2016-01-01 06:00:00,Mission,2
7,2016-01-01 06:00:00,Nob Hill,4
8,2016-01-01 06:00:00,SoMa,5
9,2016-01-01 06:00:00,Tenderloin,6


<IPython.core.display.Javascript object>

### Add year, month, day of week and hour to the new dataset

In [42]:
# Derive calendar features from the start timestamp of each 6-hour window.
# Columns are appended in the order: year, weekday, month, hour.
window_start = df_new["dateTime"].dt
df_new = df_new.assign(
    year=window_start.year,
    weekday=window_start.weekday,
    month=window_start.month,
    hour=window_start.hour,
)

<IPython.core.display.Javascript object>

In [43]:
df_new.head(10)

Unnamed: 0,dateTime,neighborhood,count,year,weekday,month,hour
0,2016-01-01 00:00:00,Bayview,2,2016,4,1,0
1,2016-01-01 00:00:00,Mission,10,2016,4,1,0
2,2016-01-01 00:00:00,Nob Hill,6,2016,4,1,0
3,2016-01-01 00:00:00,SoMa,13,2016,4,1,0
4,2016-01-01 00:00:00,Tenderloin,15,2016,4,1,0
5,2016-01-01 06:00:00,Bayview,1,2016,4,1,6
6,2016-01-01 06:00:00,Mission,2,2016,4,1,6
7,2016-01-01 06:00:00,Nob Hill,4,2016,4,1,6
8,2016-01-01 06:00:00,SoMa,5,2016,4,1,6
9,2016-01-01 06:00:00,Tenderloin,6,2016,4,1,6


<IPython.core.display.Javascript object>

### Label encode neighborhood

In [48]:
from sklearn.preprocessing import LabelEncoder


# Fit a LabelEncoder on the neighborhood names and store the resulting integer
# codes in a new "neighborhood_LE" column; the original text column is kept.
# NOTE(review): integer codes give a nominal feature an arbitrary numeric order,
# which a linear model will treat as a magnitude - consider one-hot encoding.
labelencoder = LabelEncoder()

df_new["neighborhood_LE"] = labelencoder.fit_transform(df_new["neighborhood"])

<IPython.core.display.Javascript object>

### Model Training

In [45]:
from sklearn.linear_model import Ridge

<IPython.core.display.Javascript object>

In [78]:
# Build the modelling frame: drop the raw text/timestamp columns (their
# information is carried by neighborhood_LE and the calendar columns) and
# exclude every row from 2020.
X = df_new.drop(columns=["neighborhood", "dateTime"]).loc[
    lambda frame: frame["year"] != 2020
]

<IPython.core.display.Javascript object>

In [79]:
# Fraction of all rows that fall in June-December 2019.
is_late_2019 = (X["year"] == 2019) & (X["month"] > 5)
len(X[is_late_2019]) / len(X)

0.14771054049200935

<IPython.core.display.Javascript object>

#### Since the dataset has a time component, a random train_test_split from scikit-learn is not appropriate. The data must instead be split by time interval.

In [81]:
# Chronological hold-out split: June-December 2019 (about 15% of the rows) is
# reserved for testing; every earlier row is used for training.
is_test_period = (X["year"] == 2019) & (X["month"] > 5)
X_test = X[is_test_period]
X_train = X[~is_test_period]

# The target is the event count per (6-hour window, neighborhood).
# X already excludes 2020, so no additional year filter is needed here.
y_test = X_test["count"]
y_train = X_train["count"]

# Remove the target from the feature matrices so it cannot leak into the model.
X_test = X_test.drop(columns=["count"])
X_train = X_train.drop(columns=["count"])

<IPython.core.display.Javascript object>

In [82]:
X_test.shape

(4113, 5)

<IPython.core.display.Javascript object>

In [83]:
X_train.shape

(23732, 5)

<IPython.core.display.Javascript object>

In [85]:
y_test.shape

(4113,)

<IPython.core.display.Javascript object>

In [86]:
y_train.shape

(23732,)

<IPython.core.display.Javascript object>

### Train simple Ridge Regression

In [95]:
# Fit a Ridge regressor with its default hyperparameters on the training split.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
ridge_regr = Ridge()
ridge_regr.fit(X_train, y_train)

Ridge()

<IPython.core.display.Javascript object>

### Evaluate Ridge Regression model

In [96]:
from sklearn.metrics import max_error, mean_absolute_error, r2_score

<IPython.core.display.Javascript object>

In [97]:
# Predict event counts for the held-out test rows; the metric cells below
# compare these predictions against y_test.
y_pred = ridge_regr.predict(X_test)

<IPython.core.display.Javascript object>

In [98]:
mean_absolute_error(y_test, y_pred)

2.065011500157309

<IPython.core.display.Javascript object>

In [99]:
r2_score(y_test, y_pred)

0.342625006074569

<IPython.core.display.Javascript object>

In [100]:
max_error(y_test, y_pred)

21.610053374772093

<IPython.core.display.Javascript object>