In [84]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import datetime

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, train_test_split

In [50]:
# Predict whether a call is for a fire 

In [51]:
df_data_09_11 = pd.read_csv('LFB incident data 1 Jan 2009 to 31 Dec 2011.csv', index_col=0)
df_data_12_15 = pd.read_csv('LFB incident data 1 Jan 2012 to 31 Aug 2015.csv', index_col=0)

In [52]:
# Concatenate them into a single dataframe
df_incident_data = pd.concat([df_data_09_11, df_data_12_15])

In [85]:
df_incident_data.head()

Unnamed: 0_level_0,DateOfCall,TimeOfCall,IncidentGroup,StopCodeDescription,SpecialServiceType,PropertyCategory,PropertyType,AddressQualifier,Postcode_full,Postcode_district,...,FRS,IncidentStationGround,FirstPumpArriving_AttendanceTime,FirstPumpArriving_DeployedFromStation,SecondPumpArriving_AttendanceTime,SecondPumpArriving_DeployedFromStation,NumStationsWithPumpsAttending,NumPumpsAttending,is_fire,Month
IncidentNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
235138081,01-Jan-09,00:00:37,Special Service,Special Service,RTC,Road Vehicle,Car,In street close to,SW11 4LB,SW11,...,London,Battersea,319.0,Battersea,342.0,Clapham,2.0,2.0,Other,Jan
1091,01-Jan-09,00:00:46,Special Service,Special Service,Assist other agencies,Outdoor,Lake/pond/reservoir,Open land/water - nearest address to access,SE1 7SG,SE1,...,London,Lambeth,0.0,0,0.0,0,0.0,0.0,Other,Jan
2091,01-Jan-09,00:03:00,Fire,Secondary Fire,0,Outdoor,Road surface/pavement,In street outside,N9 9EL,N9,...,London,Edmonton,308.0,Edmonton,0.0,0,1.0,1.0,Fire,Jan
3091,01-Jan-09,00:04:27,Fire,Secondary Fire,0,Outdoor,Domestic garden (vegetation not equipment),On land associated with building,UB10 0DG,UB10,...,London,Hillingdon,210.0,Hillingdon,0.0,0,1.0,1.0,Fire,Jan
5091,01-Jan-09,00:05:39,Fire,Secondary Fire,0,Outdoor,Cycle path/public footpath/bridleway,In street outside,N7 8HG,N7,...,London,Holloway,233.0,Holloway,250.0,Holloway,1.0,2.0,Fire,Jan


In [92]:
df_incident_data.columns

Index(['DateOfCall', 'TimeOfCall', 'IncidentGroup', 'StopCodeDescription',
       'SpecialServiceType', 'PropertyCategory', 'PropertyType',
       'AddressQualifier', 'Postcode_full', 'Postcode_district',
       'IncGeo_BoroughCode', 'IncGeo_BoroughName', 'IncGeo_WardCode',
       'IncGeo_WardName', 'Easting_m', 'Northing_m', 'Easting_rounded',
       'Northing_rounded', 'FRS', 'IncidentStationGround',
       'FirstPumpArriving_AttendanceTime',
       'FirstPumpArriving_DeployedFromStation',
       'SecondPumpArriving_AttendanceTime',
       'SecondPumpArriving_DeployedFromStation',
       'NumStationsWithPumpsAttending', 'NumPumpsAttending', 'is_fire',
       'Month'],
      dtype='object')

In [53]:
# What are the possible calls that the fire service get?
df_incident_data.IncidentGroup.unique()

array(['Special Service', 'Fire', 'False Alarm'], dtype=object)

In [54]:
# Target variable, IncidentGroup == Fire

In [55]:
df_incident_data['is_fire'] = np.where(df_incident_data.IncidentGroup == 'Fire', 'Fire', 'Other')

In [56]:
#df_incident_data.head()

In [57]:
#df_incident_data.TimeOfCall

In [None]:
# TODO: Data Cleansing
# more specific cleansing by column, i.e 

In [None]:
def data_cleansing(dataframe):
    #imputation, set all nan to zero for now, can be more specific and sort strategy for individual columns
    dataframe.fillna(value=0, inplace=True)

In [None]:
df_incident_data.isnull().values.any()

In [None]:
data_cleansing(df_incident_data)

In [None]:
df_incident_data.isnull().values.any()

In [58]:
# TODO: Feature engineering
# Segment PropertyType into broader categories
# Day of Week

In [87]:
df_incident_data.PropertyCategory.unique()

array(['Road Vehicle', 'Outdoor', 'Dwelling', 'Outdoor Structure',
       'Other Residential', 'Non Residential', 'Aircraft', 0,
       'Rail Vehicle', 'Boat'], dtype=object)

In [None]:
#Not too broad

In [88]:
df_incident_data.PropertyType.unique()

array(['Car ', 'Lake/pond/reservoir ', 'Road surface/pavement ',
       'Domestic garden (vegetation not equipment) ',
       'Cycle path/public footpath/bridleway ',
       'Purpose Built Flats/Maisonettes - Up to 3 storeys ',
       'Refuse/rubbish tip ', 'Small refuse/rubbish container',
       'Student Hall of Residence ', 'House - single occupancy ',
       'Converted Flat/Maisonette - Up to 2 storeys ', 'Loose refuse ',
       'Department Store ',
       'Purpose Built Flats/Maisonettes - 4 to 9 storeys ', 'Park ',
       'Laundrette ',
       'Purpose Built Flats/Maisonettes - 10 or more storeys ',
       'Hotel/motel ', 'Restaurant/cafe', 'Other outdoor location ',
       'Large refuse/rubbish container (eg skip, paladin)', 'Museum ',
       'House in Multiple Occupation - 3 or more storeys (not known if licensed)',
       'Post box ', 'Converted Flat/Maisonettes - 3 or more storeys',
       'Casino ', 'Kiosk ', 'Club/night club ', 'Nursing/Care Home',
       'Hostel (e.g. for 

In [None]:
# needs aggregating

In [89]:
df_incident_data.AddressQualifier.unique()

array(['In street close to',
       'Open land/water - nearest address to access', 'In street outside',
       'On land associated with building', 'Correct incident address',
       'On motorway / elevated road',
       'In street remote from postal address', 'Within same building',
       'Nearby address - street not listed',
       'Nearby address - no building in street',
       'Railway land or rolling stock'], dtype=object)

In [None]:
def aggregate_address_qualifiers(dataframe):
    ['Car ', 'Lake/pond/reservoir ', 'Road surface/pavement ',
       'Domestic garden (vegetation not equipment) ',
       'Cycle path/public footpath/bridleway ',
       'Purpose Built Flats/Maisonettes - Up to 3 storeys ',
       'Refuse/rubbish tip ', 'Small refuse/rubbish container',
       'Student Hall of Residence ', 'House - single occupancy ',
       'Converted Flat/Maisonette - Up to 2 storeys ', 'Loose refuse ',
       'Department Store ',
       'Purpose Built Flats/Maisonettes - 4 to 9 storeys ', 'Park ',
       'Laundrette ',
       'Purpose Built Flats/Maisonettes - 10 or more storeys ',
       'Hotel/motel ', 'Restaurant/cafe', 'Other outdoor location ',
       'Large refuse/rubbish container (eg skip, paladin)', 'Museum ',
       'House in Multiple Occupation - 3 or more storeys (not known if licensed)',
       'Post box ', 'Converted Flat/Maisonettes - 3 or more storeys',
       'Casino ', 'Kiosk ', 'Club/night club ', 'Nursing/Care Home',
       'Hostel (e.g. for homeless people) ',
       "Nurses'/Doctors' accommodation ",
       'Self contained Sheltered Housing ', 'Shelter ',
       'Unlicensed House in Multiple Occupation - Up to 2 storeys ',
       'Licensed House in Multiple Occupation - Up to 2 storeys ',
       'Multi-Storey car park', 'Other bulk storage',
       'Multiple Vehicles ',
       'Roadside furniture (eg lamp posts, road signs, telegraph poles, speed cameras)',
       'Monastery/convent ', 'Purpose built office ', 'Hospital ',
       'Other building/use not known ', 'Airport - terminal ', 'Railway ',
       'Railings', 'Tenement Building ', 'Local Government Office',
       'Other retail warehouse ', 'Single shop ', 'Factory ',
       'Shopping Centre ', 'Retirement/Old Persons Home', 'Tree scrub ',
       'Sheltered Housing : not self contained ', 'Large supermarket ',
       'Bridge',
       'House in Multiple Occupation - Up to 2 storeys (not known if licensed) ',
       'Train station - platform (below ground) ',
       'Grassland, pasture, grazing etc ',
       'Other medical establishment (including surgery) ',
       'Other road vehicle', 'DIY Warehouse ', 'Prison ',
       'Pub/wine bar/bar ', 'Bank/Building Society ',
       'Boarding House/B&B for homeless/asylum seekers ',
       'Takeaway, fast food ', 'Gym ',
       'Airport building (not terminal or hangar) ',
       'Bungalow - single occupancy ', 'Scrub land ', 'Van ',
       'Sports/Social club ', 'Converted office ',
       'Other Residential Home ', 'Motorcycle ', 'Exhibition Centre ',
       'Lorry/HGV ', 'College/University ', 'Bus/coach ', 'Library ',
       'Infant/Primary school', 'Police station ', 'Call Centre ',
       'Other office/call centre type building',
       'Train station - elsewhere ', 'Health Centre ',
       'Railway trackside vegetation ',
       'Caravan/Mobile home (permanent dwelling)', 'Pre School/nursery ',
       'Licensed House in Multiple Occupation - 3 or more storeys ',
       'Theatre ', 'Manufacturing assembly plant', 'Church/Chapel ',
       'Community centre/Hall ',
       'Unlicensed House in Multiple Occupation - 3 or more storeys ',
       'Football stadium ', 'Printing works', 'Roadside vegetation ',
       'Electricity power station ', 'Temple ', 'Electrical warehouse ',
       'Pipe or drain ', 'Warehouse ', 'Light aircraft ',
       'Other outdoor structures ', 'Leisure Centre ', 'Hairdresser ',
       'Vehicle Repair Workshop', 'Other outdoor equipment/machinery ',
       'Cemetery ', 'Art Gallery ', 'Underground car park',
       'Outdoor storage ', 'Private Garden Shed ', 'Wasteland ',
       'Other public building ', 'Secondary school', 'Bingo Hall ',
       'Mill ', 'Train station - concourse ', 'TV/film/music/art studio ',
       'River/canal ', 'Other Dwelling ', 'Bus/coach station/garage ',
       'False Alarm - Property not found', 'Telephone box ',
       'Fire station ', 'Synagogue ', 'Houseboat (permanent dwelling) ',
       'Furniture warehouse ', 'Conference Centre ',
       'Train station - platform (at ground level or elevated) ',
       'Town Hall ', 'Law Courts ', 'Cinema ', 'Airport - hangar ',
       'Intensive Farming Sheds (chickens, pigs etc) ', 'Swimming Pool ',
       'Other industrial processing plant', 'Other retail  ',
       'Laboratory/research Establishment ', 'Central Government Office',
       'Engineering manufacturing plant', 'Ice rink ',
       'Other car park structure', 'Other education establishment',
       'Woodland/forest - broadleaf/hardwood ', 'Petrol station ',
       'Tunnel, subway ', 'Post office (purpose built) ', 'Youth hostel ',
       'Motor Home ', "Children's Home", 'Gas works ',
       'Nurseries, market garden ', 'Public toilets ', 'Recycling plant',
       'Cathedral ', 'Other public utility works',
       'Other private non-residential building ', 'Indoor Market ',
       'Cables ', 'Military/barracks ', 'Other agricultural building ',
       'Heathland ', 'Other industrial manufacturing facility',
       'Cricket ground ', 'Other animal boarding/breeding establishment',
       'Recycling collection point, bottle bank',
       'Other Religious Use Building', 'Private garage ',
       'Animal products processing plant',
       'Post office (within other shop/premises) ', 'Passenger plane ',
       'Garden equipment ', 'Temporary office (eg portacabin) ',
       'Other transport building ', 'Chemical plant', 'Caravan on tow ',
       'Other outdoor sporting venue ',
       'Towing caravan/Camper van on site', 'Estate Agent ',
       'Train on Tube network', 'Vehicle sales building',
       'Sports pavilion/shower block/changing facility ',
       'Other indoor sporting venue ', 'Young offenders unit ',
       'Bulk waste storage', 'Ambulance station ', 'Tram ',
       'Tennis Courts ', 'Water works ', 'Boarding School accommodation ',
       'Boarding House/B&B other ', 'Other cultural venue ',
       'Other entertainment venue ', 'Barge ', 'Beach ', 'Barbeque ',
       'Trains - engine shed ', 'Sports Hall ',
       'Bulk hazardous materials storage', 'Minibus ',
       'Passenger Train (national rail network) ',
       'Underground train : Other system ', 'Railway building - other',
       'Rugby Stadium ', 'Canal/riverbank vegetation ',
       'Agricultural vehicle',
       'Animal boarding/breeding establishment - dogs', 'Oil refinery ',
       'Mosque ', 'Airport - fuel storage ', 'Concert Hall ',
       'Other vessel ', 'Bulk gas storage', 'Bulk oil storage',
       'Straw/stubble burning ', 'Boat - Tanker', 'Airfield/runway ',
       'Landfill site ', 'Woodland/forest - conifers/softwood ',
       'Mine or quarry building above ground', 'Greyhound stadium ',
       'Tractor Shed ', 'Theme Park ',
       'Mine or quarry (not above ground building)', 'Indoor stadium ',
       'Road Tanker ', 'Docks ', 'Private greenhouse ', 'Motor yacht ',
       'Railway goods yard ', 'Stacked/baled crop ', 'Travel Agent ',
       'Private Summer house ', 'Barn ', 'Standing crop ',
       'Camping tent ', 'Agricultural equipment ', 'Zoo ',
       'Naval vessel ', 'Freight Train ', 'Helicopter ', 'Other aircraft',
       'Large passenger vessel ',
       'Other holiday residence (cottage, flat, chalet) ',
       'Other tent/marquee ', 'Sea ', 'Milking Parlour ', 'Sewage works ',
       'Animal boarding/breeding establishment - cats',
       'Athletics Stadium ', 'Other merchant vessel ',
       'Motor racing circuit ', 'Freight plane ', 'Fishing boat ',
       'Telephone exchange', 'Greenhouse (commercial) polytunnel ',
       'Ferry terminal ', 'Military helicopter ',
       'Greenhouse (commercial) glass ', 'Nursing/Care Home/Hospice',
       'Day care/Drop in centre', 'Common external bin storage area',
       'Hedge', 'Human harm outdoors', 'Fence', 'Medical/health centre',
       'Bicycle', 'Dental surgery', 'Doctors surgery',
       'Ministry of Defence office', 'Golf clubhouse',
       'Playground/Recreation area (not equipment)',
       'Food and drink processing',
       'Towing caravan (not on tow or on site)', 'Health spa/farm',
       'Trailer (not attached to tractor unit)', 'Animal harm outdoors',
       'Distillery plant', 'Bakery', 'Veterinary surgery',
       'Golf course (not building on course)',
       'Stately Home (part not open to public)',
       'Royal Palace (part not open to public)', 'Racecourse ', 'Silo ',
       'Wheelie bin (domestic size)', 'Castle (part not open to public)']
    dataframe['']

In [59]:
def feature_engineering(dataframe):
    #dataframe['is_early_morning'] = np.where(df_incident_data.TimeOfCall <= datetime0600:00, "Yes", "No")
    dataframe['Month'] = dataframe.DateOfCall.str.split("-", expand=True)[1]
    aggregate_address_qualifiers(dataframe)

In [62]:
feature_engineering(df_incident_data)

In [63]:
#df_incident_data.Month.isnull().values.any()

In [None]:
# Data processing for features
# encode categorical variables to numerical

In [None]:
# Model training

In [77]:
def split_dataset(dataframe, train_percentage=0.7):
    """
    Split the dataset with train_percentage
    """
    y = df_incident_data['is_fire']
    x = df_incident_data.drop(['IncidentGroup', 'is_fire'], 1)
    # need to drop IncidentGroup as this is the actual target but transformed to allow for binary classification
 
    # Split dataset into train and test dataset
    train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=train_percentage)
    
    return train_x, test_x, train_y, test_y

In [78]:
train_x, test_x, train_y, test_y = split_dataset(df_incident_data)



In [81]:
def random_forest_classifier(features, target):
    random_forest = RandomForestClassifier()
    random_forest.fit(features, target)
    return random_forest

In [82]:
random_forest = random_forest_classifier(train_x, train_y)

ValueError: could not convert string to float: 'Nov'

In [None]:
#TODO: Classifier then look at interractions between variables and only use the most important