In [22]:
import pandas as pd
import random

def read_sample_from(fname, n=1000):
    with open(fname,'r') as f:
        f_len = sum(1 for line in f) - 1
        
    skip = sorted(random.sample(range(1,f_len),f_len-n))
    
    return pd.read_csv(
        fname, 
        low_memory=False, usecols=range(27),
        skiprows=skip
    )

# df = read_sample_from('./data/311_Service_Requests_from_2011.csv', 10000)
df = pd.read_csv('./data/311_Service_Requests_from_2011.csv',  low_memory=False, usecols=range(27) )
print("File read complete: ", len(df), 'rows')

File read complete:  1917212 rows


In [None]:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import numpy as np

df.columns = ['Unique Key', 'Created Date', 'Closed Date', 'Agency', 'Agency Name',
       'Complaint Type', 'Descriptor', 'Location Type', 'Incident Zip',
       'Incident Address', 'Street Name', 'Cross Street 1', 'Cross Street 2',
       'Intersection Street 1', 'Intersection Street 2', 'Address Type',
       'City', 'Landmark', 'Facility Type', 'Status', 'Due Date',
       'Resolution Description', 'Resolution Action Updated Date',
       'Community Board', 'Borough', 'X', 'Y']

A = df[df.Status == 'Closed']
A = A[A['Closed Date'].notnull()]
A.fillna(0, inplace=True)
A['Closed Date'] = pd.to_datetime(A['Closed Date'], format='%m/%d/%Y %H:%M:%S %p')
A['Created Date'] = pd.to_datetime(A['Created Date'], format='%m/%d/%Y %H:%M:%S %p')
A['Created Month'] = A['Created Date'].apply( lambda x: x.month)

A['Response Time'] = (A['Closed Date'] - A['Created Date']) / np.timedelta64(1, 'h')

A = A[['Agency', 'Complaint Type', 'Location Type', 
       'City', 'Borough', 'Facility Type', 'Created Month', 'X', 'Y', 'Response Time']]

for col in A.columns:
    if A[col].dtype not in ['float64', 'int', 'int64']:
        dummies = pd.get_dummies(A[col])
        A[dummies.columns] = dummies

A = A[A['Response Time'] != 0]
A = A[A.columns[8:]]

train, test = train_test_split(A,test_size=0.2)
print("Number of training observations: ", len(train))
print("Number of features: ", len(train.columns))

Number of training observations:  1328493
Number of features:  1053


## XGboost time

First, we tune the model using GridSearch
```python
import xgboost as xgb
from sklearn.grid_search import GridSearchCV

cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': 0.1, 'n_estimators': 100, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'reg:linear'}

gbm = GridSearchCV(xgb.XGBRegressor(**ind_params), cv_params, scoring = 'r2', cv=5)
optimized_gbm = gbm.fit( train[train.columns[:-1]].values,  train['Response Time'].values)
print(optimized_gbm.grid_scores_)
```

From this, we find that the optimal depth is 3, and child_weight is 3

In [None]:
import xgboost as xgb
gbm = xgb.XGBRegressor(learning_rate=0.05, n_estimators=100, max_depth=3, min_child_weight=3)
gbm.fit( train[train.columns[:-1]].values,  train['Response Time'].values)
gbm.score(test[test.columns[:-1]].values, test['Response Time'].values)