In [1]:
#import libraries
%matplotlib inline
import pandas as pd
import numpy as np
import re
from geopy.geocoders import Nominatim
from sklearn import preprocessing, cross_validation, linear_model, tree, ensemble, metrics, feature_extraction

In [2]:
df = pd.read_csv("train_911.csv", index_col='id')
df.head()

Unnamed: 0_level_0,lat,lng,desc,zip,timeStamp,twp,addr,e,emergencytype,emergencysubtype
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,40.297876,-75.581294,REINDEER CT & DEAD END; NEW HANOVER; Station ...,19525.0,2015-12-10 17:40:00,NEW HANOVER,REINDEER CT & DEAD END,1,EMS,BACK PAINS/INJURY
1,40.258061,-75.26468,BRIAR PATH & WHITEMARSH LN; HATFIELD TOWNSHIP...,19446.0,2015-12-10 17:40:00,HATFIELD TOWNSHIP,BRIAR PATH & WHITEMARSH LN,1,EMS,DIABETIC EMERGENCY
2,40.121182,-75.351975,HAWS AVE; NORRISTOWN; 2015-12-10 @ 14:39:21-St...,19401.0,2015-12-10 17:40:00,NORRISTOWN,HAWS AVE,1,Fire,GAS-ODOR/LEAK
3,40.116153,-75.343513,AIRY ST & SWEDE ST; NORRISTOWN; Station 308A;...,19401.0,2015-12-10 17:40:01,NORRISTOWN,AIRY ST & SWEDE ST,1,EMS,CARDIAC EMERGENCY
4,40.251492,-75.60335,CHERRYWOOD CT & DEAD END; LOWER POTTSGROVE; S...,,2015-12-10 17:40:01,LOWER POTTSGROVE,CHERRYWOOD CT & DEAD END,1,EMS,DIZZINESS


In [3]:
#we have to delete the desc column as it includes information on the emergency type
del df['desc']

In [4]:
#check for NANs
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 77691 entries, 0 to 77690
Data columns (total 9 columns):
lat                 77691 non-null float64
lng                 77691 non-null float64
zip                 67675 non-null float64
timeStamp           77691 non-null object
twp                 77657 non-null object
addr                77277 non-null object
e                   77691 non-null int64
emergencytype       77691 non-null object
emergencysubtype    77691 non-null object
dtypes: float64(3), int64(1), object(5)
memory usage: 5.9+ MB


In [5]:
#We have NANs in ZIP, twp and addr

In [6]:
df.groupby('twp')['zip'].value_counts(dropna=False)

twp              zip    
ABINGTON         19046.0    1382
                 19001.0    1131
                 19038.0     749
                 19090.0     521
                 19006.0     348
                 19027.0     241
                 NaN         237
                 19095.0      23
                 19111.0      16
                 19025.0       7
                 19040.0       5
                 19002.0       3
                 19012.0       3
                 19115.0       3
AMBLER           19002.0     459
                 NaN          17
                 19031.0       1
BERKS COUNTY     NaN          89
                 19512.0      74
                 19518.0      44
                 19504.0      10
                 19505.0       4
                 18056.0       3
                 19503.0       3
                 18092.0       2
                 19464.0       1
BRIDGEPORT       19405.0     298
                 NaN          91
                 19406.0      41
                 1

In [7]:
#This returns the full address from a pair of coordinates - we can use this to get the missing zip/twp data since we have lat/lng for all

def get_address(lat, lng):
    
    geolocator = Nominatim()
    location = geolocator.reverse([lat,lng])
    return location.address

In [8]:
#example:
get_address(df.lat[0], df.lng[0])

u'2904, Lutheran Road, New Hanover Township, Montgomery County, Pennsylvania, 19525, United States of America'

In [9]:
#Before we worry about the missing data let's quickly check the test file
df_test = pd.read_csv("test_911.csv", index_col='id')
df_test.head()

Unnamed: 0_level_0,lat,lng,desc,zip,timeStamp,twp,addr,e
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
77691,40.095171,-75.414699,1ST AVE & FREEDOM DR; UPPER MERION; Station 3...,19406.0,2016-06-29 14:37:02,UPPER MERION,1ST AVE & FREEDOM DR,1
77692,40.130383,-75.482545,LONGFORD RD & PORT PROVIDENCE RD; UPPER PROVID...,19460.0,2016-06-29 14:41:00,UPPER PROVIDENCE,LONGFORD RD & PORT PROVIDENCE RD,1
77693,40.244012,-75.61548,BEECH ST & HIGH ST; POTTSTOWN; 2016-06-29 @ 14...,19464.0,2016-06-29 14:41:01,POTTSTOWN,BEECH ST & HIGH ST,1
77694,40.106577,-75.314605,MANOR AVE & ZOAR RD; PLYMOUTH; Station 308; 2...,19462.0,2016-06-29 14:42:00,PLYMOUTH,MANOR AVE & ZOAR RD,1
77695,40.182374,-75.104914,YORK RD & VILLAGE PL; HATBORO; 2016-06-29 @ 14...,19040.0,2016-06-29 14:42:01,HATBORO,YORK RD & VILLAGE PL,1


In [10]:
#we have to delete the desc column as it includes information on the emergency type
del df_test['desc']
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19423 entries, 77691 to 97113
Data columns (total 7 columns):
lat          19423 non-null float64
lng          19423 non-null float64
zip          16923 non-null float64
timeStamp    19423 non-null object
twp          19416 non-null object
addr         19333 non-null object
e            19423 non-null int64
dtypes: float64(3), int64(1), object(3)
memory usage: 1.2+ MB


In [11]:
#so we also have missing zip, twp and addr data

In [12]:
#Convert datetime
df.timeStamp = pd.to_datetime(df.timeStamp)
df_test.timeStamp = pd.to_datetime(df_test.timeStamp)

In [13]:
#Let's try model just using lat/lng and timestamp since none of this is missing
X = df[['lat','lng',]]

#let's try changing the timestamp into hour and day and month as separate features
X = pd.concat([X, df.timeStamp.dt.dayofweek, df.timeStamp.dt.hour, df.timeStamp.dt.month], axis=1)

In [14]:
X.head()

Unnamed: 0_level_0,lat,lng,timeStamp,timeStamp,timeStamp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,40.297876,-75.581294,3,17,12
1,40.258061,-75.26468,3,17,12
2,40.121182,-75.351975,3,17,12
3,40.116153,-75.343513,3,17,12
4,40.251492,-75.60335,3,17,12


In [15]:
#emergencytype is our target
y = df.emergencytype

In [16]:
#Train test split
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)

In [17]:
#let's try logistic first
log_reg = linear_model.LogisticRegression()

In [18]:
# sample weights (we mostly have EMS in our data)
weights = dict(y_train.value_counts()/y_train.count())
weights

{'EMS': 0.48888209550778738,
 'Fire': 0.14815291543313167,
 'Traffic': 0.36296498905908098}

In [19]:
#fit and predict
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)

In [20]:
#metrics
print metrics.classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        EMS       0.49      0.99      0.66      7673
       Fire       0.00      0.00      0.00      2250
    Traffic       0.29      0.01      0.02      5616

avg / total       0.35      0.49      0.33     15539



  'precision', 'predicted', average, warn_for)


In [21]:
#decision tree?
dec_tree = tree.DecisionTreeClassifier()

In [22]:
#fit and predict
dec_tree.fit(X_train, y_train)
y_pred = dec_tree.predict(X_test)

In [23]:
#metrics
print metrics.classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        EMS       0.61      0.61      0.61      7673
       Fire       0.20      0.22      0.21      2250
    Traffic       0.56      0.54      0.55      5616

avg / total       0.53      0.53      0.53     15539



In [24]:
#Better!

In [25]:
Ada_boost = ensemble.AdaBoostClassifier(base_estimator=dec_tree)

In [26]:
#fit and predict
Ada_boost.fit(X_train, y_train)
y_pred = Ada_boost.predict(X_test)

In [27]:
#metrics
print metrics.classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        EMS       0.59      0.68      0.63      7673
       Fire       0.20      0.11      0.14      2250
    Traffic       0.53      0.51      0.52      5616

avg / total       0.51      0.54      0.52     15539



In [28]:
bag_class = ensemble.bagging.BaggingClassifier(base_estimator=dec_tree)

In [29]:
#fit and predict
bag_class.fit(X_train, y_train)
y_pred = bag_class.predict(X_test)

In [30]:
#metrics
print metrics.classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        EMS       0.62      0.70      0.66      7673
       Fire       0.21      0.14      0.17      2250
    Traffic       0.58      0.55      0.57      5616

avg / total       0.55      0.57      0.55     15539



## We can use get_address to find the missing data - let's look at this

In [31]:
#we should get the missing data first but I've not done that yet

In [32]:
#Let's try get features from the addr column
vectorizer = feature_extraction.text.TfidfVectorizer(min_df=3,  max_features=5000, 
        strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
        ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1,
        stop_words = 'english')

In [33]:
#vectorize addr column
addr = vectorizer.fit_transform(df.addr.astype('str'))

In [34]:
X.shape

(77691, 5)

In [35]:
addr = addr.todense()
addr.shape

(77691L, 5000L)

In [36]:
New_X = pd.concat([X, pd.DataFrame(addr)], axis=1)
New_X.shape

(77691, 5005)

In [39]:
#Train test split again
X_train, X_test, y_train, y_test = cross_validation.train_test_split(New_X, y, test_size=0.2)

In [40]:
#fit and predict
dec_tree.fit(X_train, y_train)
y_pred = dec_tree.predict(X_test)

In [41]:
#metrics
print metrics.classification_report(y_test, y_pred)

             precision    recall  f1-score   support

        EMS       0.65      0.68      0.66      7589
       Fire       0.21      0.22      0.21      2223
    Traffic       0.67      0.62      0.65      5727

avg / total       0.59      0.59      0.59     15539



## If we are ready to make a submission:

In [None]:
#use df_test and your best model to make predictions

In [42]:
#We need the same X features for the test data
df_test_X = df_test[['lat','lng',]]
df_test_X = pd.concat([df_test_X, df_test.timeStamp.dt.dayofweek, df_test.timeStamp.dt.hour, df_test.timeStamp.dt.month], axis=1)

In [43]:
#addr.transposee addr column - DO NOT FIT TRANSFORM AS WE ALREADY FIT THIS
df_test_addr = vectorizer.transform(df_test.addr.astype('str'))

In [44]:
df_test_addr = df_test_addr.todense()
df_test_addr = pd.DataFrame(df_test_addr, index=df_test_X.index)

In [45]:
df_test_New_X = pd.concat([df_test_X, df_test_addr], axis=1)

### Submission format needs to be id & emergencytype

In [46]:
#make your predictions
df_test_pred = dec_tree.predict(df_test_New_X)

In [50]:
submission = pd.DataFrame(df_test_pred, index=df_test.index, columns=['emergencytype'])

In [51]:
submission.head()

Unnamed: 0_level_0,emergencytype
id,Unnamed: 1_level_1
77691,Fire
77692,Fire
77693,Traffic
77694,EMS
77695,EMS


In [52]:
submission.to_csv("team_911_submission.csv")