In [None]:
## Declare functions

In [None]:
"""Declare functions that import, normalize, and split data; train random forests model;
return a dataframe of addresses, latitudes, longitudes, and probabilities of a fire.
For now, functions require a single CSV of model inputs. 
In the future, we can transition to a SQL call to our database."""

In [1]:
import pandas as pd
import pyglet
import geoplotlib

In [2]:
def XY_data(input_file=None,multiclass=False,reference_year=2016):
    """Load the model-input CSV and split it into features and an encoded target.

    Parameters
    ----------
    input_file : str
        Path of the CSV to read. It must contain the columns
        'Incident_Cat' (the target) and 'Yr_Property_Built'.
    multiclass : bool
        False (default): every non-null 'Incident_Cat' collapses to the single
        class '1 Incident'. True: each distinct 'Incident_Cat' value is kept as
        its own class. In both modes a missing value becomes '0 No incident'.
    reference_year : int
        Year that property age is measured from (age = reference_year - year built).

    Returns
    -------
    x : pandas.DataFrame
        Every column of the CSV except 'Incident_Cat', plus a derived 'age' column.
    y : pandas.Series
        Integer class index for each row; the index is the position of the
        row's label in `unique`.
    unique : list
        Sorted class labels actually present in the data.
    """
    no_incident_label = '0 No incident'
    incident_label = '1 Incident'

    k = pd.read_csv(input_file, low_memory=False)

    # The target is removed from the frame so that what is left is the feature set.
    raw_target = k.pop('Incident_Cat')

    if multiclass:
        # Keep each incident category as its own class; only fill the gaps.
        labels = raw_target.where(raw_target.notnull(), no_incident_label)
    else:
        # Binary: the only thing that matters is whether an incident was recorded.
        labels = raw_target.isnull().map({True: no_incident_label,
                                          False: incident_label})

    # Store the class labels and substitute each label with its index in that list.
    unique = sorted(labels.unique())
    class_index = dict((label, position) for position, label in enumerate(unique))
    y = labels.map(class_index)

    # Everything that remains is feature data; add the derived property age.
    x = k
    x['age'] = reference_year - x.Yr_Property_Built

    return x, y, unique

In [3]:
def Data_normalized(input_file=None,multiclass=False):
    """Build the model matrix: one-hot categorical features plus scaled numeric features.

    Parameters
    ----------
    input_file : str
        Path of the model-input CSV, passed straight through to XY_data.
    multiclass : bool
        Passed straight through to XY_data.

    Returns
    -------
    x_all : pandas.DataFrame
        One-hot columns for 'Building_Cat' and 'Neighborhood' followed by the
        mean-centred, range-scaled quantitative columns.
    y : pandas.Series
        Encoded target as returned by XY_data.
    unique : list
        Class labels as returned by XY_data.
    x_ids : pandas.DataFrame
        The identifying columns 'EAS', 'Address' and 'Location_y', unscaled.
    """
    categorical_columns = ['Building_Cat', 'Neighborhood']
    quantitative_columns = [
        'age', 'Num_Bathrooms', 'Num_Bedrooms',
        'Num_Rooms', 'Num_Stories', 'Num_Units', 'Land_Value',
        'Property_Area', 'Assessed_Improvement_Val', 'Tot_Rooms', 'Perc_Ownership',
        'count potential fire control', 'count all complaints',
        'count all complaints not corrected',
        'count potential fire control not corrected',
        'count fire emergency safety', 'count potential fire cause',
        'count fire emergency safety not corrected',
        'count potential fire cause not corrected',
    ]
    id_columns = ['EAS', 'Address', 'Location_y']

    x, y, unique = XY_data(input_file=input_file, multiclass=multiclass)

    x_dummies = pd.get_dummies(data=x[categorical_columns])
    x_quantitative = x[quantitative_columns]
    x_ids = x[id_columns]

    # Mean normalization: (value - column mean) / (column max - column min).
    # A constant column has a zero range, which would turn the whole column into
    # NaN (0/0); dividing by 1 instead leaves such a column at exactly 0.
    # Missing values already present in the inputs are not filled here.
    value_range = x_quantitative.max() - x_quantitative.min()
    value_range = value_range.replace(0, 1)
    x_scaled = (x_quantitative - x_quantitative.mean()) / value_range

    # Combine the one-hot columns and the scaled columns side by side.
    x_all = pd.concat([x_dummies, x_scaled], axis=1)

    return x_all, y, unique, x_ids

In [4]:
def classifier(train=True,x=None,y=None,target_names=None,class_weight=None,multiclass=False,plot=False,cross_val=False,random_state=None):
    """Fit a random forest on a training split and score every row of `x`.

    Parameters
    ----------
    train : bool
        Must be True. Loading a previously saved model is not implemented, so
        train=False raises NotImplementedError.
    x, y : feature matrix and encoded target of equal length.
    target_names, plot, cross_val :
        Accepted for interface compatibility; currently unused.
    class_weight :
        Forwarded to RandomForestClassifier.
    multiclass : bool
        True wraps the forest in OneVsRestClassifier (n_jobs=3); False uses the
        plain RandomForestClassifier.
    random_state : int or None
        Seed forwarded to both the train/test split and the forest. None
        (default) keeps the previous unseeded behaviour.

    Returns
    -------
    Class probabilities from predict_proba for every row of `x`. Note that this
    includes the rows the model was trained on.
    """
    if not train:
        # There is no saved model to load, so nothing could produce predictions.
        raise NotImplementedError(
            'classifier(train=False) is not supported: no saved model is loaded')

    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.multiclass import OneVsRestClassifier

    forest = RandomForestClassifier(verbose=0, class_weight=class_weight,
                                    random_state=random_state)
    if multiclass:
        rf_model = OneVsRestClassifier(forest, n_jobs=3)
    else:
        rf_model = forest

    # Sanity check that features and target line up; prints the same text on
    # Python 2 and Python 3.
    print('%d %d' % (len(x), len(y)))

    # Only the training part of the split is used; the held-out third is not
    # evaluated here.
    xtrain, _, ytrain, _ = train_test_split(x, y, test_size=.33,
                                            random_state=random_state)

    rf_model.fit(xtrain, ytrain)
    predictions = rf_model.predict_proba(x)

    return predictions

In [5]:
def predict_out(prediction_array=None,original_file=None,output_file='model_output_120517.csv'):
    """Append the predicted probabilities to the original inputs and save them.

    Parameters
    ----------
    prediction_array : array-like
        One row of class probabilities per row of `original_file`; each class
        becomes a column named by its integer position (0, 1, ...).
    original_file : str
        Path of the CSV the predictions were generated from.
    output_file : str
        Path the combined table is written to.

    Returns
    -------
    pandas.DataFrame
        The columns of `original_file` followed by the probability columns.
        Rows are matched on the default integer index.
    """
    source = pd.read_csv(original_file, low_memory=False)
    predictions = pd.DataFrame(prediction_array)
    combined = pd.concat([source, predictions], axis=1)
    # The keyword is lowercase `index`; the row index is not written to the file.
    combined.to_csv(path_or_buf=output_file, header=True, index=False)
    return combined

In [6]:
def output(input_file=None):
    """Run the whole binary (fire / no fire) pipeline on one input CSV.

    Normalizes the inputs with Data_normalized, trains the classifier and
    scores every row, then hands the probabilities to predict_out, which joins
    them to the original file and writes the result to disk.

    Parameters
    ----------
    input_file : str
        Path of the model-input CSV.

    Returns
    -------
    The dataframe returned by predict_out.
    """
    # No `__name__ == '__main__'` check here: inside a function it would only
    # make the call return None when this code runs under another module name.
    multiclass = False
    x, y, target_names, x_ids = Data_normalized(input_file=input_file,
                                                multiclass=multiclass)
    predictions = classifier(train=True, x=x, y=y, target_names=target_names,
                             class_weight=None, multiclass=multiclass,
                             plot=False, cross_val=False)
    return predict_out(prediction_array=predictions, original_file=input_file)

In [None]:
## Call data on most recent model inputs file (incl. census tracts)

In [7]:
# Run the full pipeline on the most recent model-inputs CSV and keep the result.
dataset = output(
    input_file='masterdf_inc_census_tract.csv'
)

195308 195308


In [8]:
    # visual check of the x field (longitude; it is renamed to 'lon' in the next cell)
dataset.x[0:3]

0   -122.480327
1   -122.418358
2   -122.418358
Name: x, dtype: float64

In [9]:
    # rename fields for mapping
# Give the coordinate columns and the two class-probability columns (named 0
# and 1 by the prediction step) descriptive names, then keep only what the
# map needs.
dataset = dataset.rename(
    columns={
        'x': 'lon',
        'y': 'lat',
        0: 'probability no fire',
        1: 'probability fire',
    }
)
mapslice = dataset[['lat', 'lon', 'probability fire']]

In [25]:
## A first pass with Geoplotlib

In [13]:
from geoplotlib.utils import epoch_to_str, BoundingBox

In [14]:
    # set window and call data on mapslice; use I to zoom in, D to pan right, and W to pan up

# Set the initial map window, then draw one dot per row of mapslice.
# `north` is the larger latitude and `south` the smaller one, so the box is not
# inverted.
# NOTE(review): the cell comment says to pan up after opening, so the data may
# lie north of this window -- confirm the intended latitude range.
geoplotlib.set_bbox(BoundingBox(north=37.7,
                               south=37.58,
                               east=-122.281780,
                               west= -123.02))
geoplotlib.dot(mapslice)
geoplotlib.show()

In [36]:
""" Map Requirements

    1. A single dot per building -- no duplicates
    2. Dot color based on probability of fire (0-1)
    3. The ability to search a property: zoom in on a specific data point
    4. Interactive labels when you hover over a dot
    
    Conclusion: geoplotlib is pretty and would probably let us build
    these features ourselves in the nicest way, but it does not have enough
    built-in functionality to get us to an MVP quickly.
    
    Stick with the Carto API!
    
""""