# 1. Introduction

# 2. Literature review

# 3. Research question

# 4. Presentation of data

In [1]:
# Packages for data manipulation and processing
import math
import numpy as np
import pandas as pd
import os
import geopandas as gpd

# Packages for visualisation
import matplotlib.pyplot as plt # primary plotting package
import matplotlib.cm as cm
import matplotlib.ticker as ticker # plt ticker manipulation
import seaborn as sns; sns.set(style="ticks", color_codes=True) # for specialised plots

# Notebook settings
random_state = 42 # for reproducability
np.random.seed(random_state) # setting numpy random seed, as that is used for sklearn

In [None]:
# Reading in fire incident data
incidentData = pd.read_csv('https://data.london.gov.uk/download/london-fire-brigade-incident-records/73728cf4-b70e-48e2-9b97-4e4341a2110d/LFB%20Incident%20data%20-%20Datastore%20-%20with%20notional%20cost%20and%20UPRN%20from%20January%202009.zip',low_memory=False, 
                           infer_datetime_format=True,
                           parse_dates=['TimeOfCall', 'DateOfCall'], )

incidentData['DateOfCall'] = incidentData['DateOfCall'].dt.date
incidentData.head(5)

In [None]:
# Reading in house pricing data
housePriceData = pd.read_csv('https://data.london.gov.uk/download/average-house-prices/bdf8eee7-41e1-4d24-90ce-93fe5cf040ae/land-registry-house-prices-MSOA.csv',low_memory=False)
housePriceData = housePriceData.loc[housePriceData['Year'] == 'Year ending Dec 2017']
housePriceData = housePriceData.drop(columns = ['Year','Measure'])
housePriceData.head()

In [None]:
# Reading in income data
incomeData = pd.read_csv('https://raw.githubusercontent.com/kav-sekar/dsss_data/main/net%20annual%20household%20income.csv', low_memory=False)
incomeData = incomeData.drop(columns = ['Local authority code','Local authority name','Region code','Region name','Upper confidence limit (£)','Lower confidence limit (£)','Confidence interval (£)'])
incomeData.head()

In [None]:
# Combining income and housing price to create dataframe
soceconomicData = pd.merge(housePriceData, incomeData, left_on="Code", right_on="MSOA code",how="right")
soceconomicData.rename(columns = {'Value':'MedianHousePrice', 'Net annual income (£)':'NetAnnualIncome'}, inplace = True)
soceconomicData.replace(',','', regex=True, inplace=True)
soceconomicData.drop(columns = ['Code','Area','MSOA name'], inplace = True)
soceconomicData['NetAnnualIncome'] = soceconomicData['NetAnnualIncome'].apply(pd.to_numeric,errors='coerce')
soceconomicData.head()

In [None]:
incidentData.info()

The key variables in this data set that are being used in this analysis are as follows:
- <b>DateOfCall</b>: Date on which the incident call was made
- <b>CalYear</b>: Year in which the call was made
- <b>TimeOfCall</b>: Date and time when the call was made
- <b>IncidentGroup</b>: All incidents are classified into three categories namely Fire, False Alarm, Special Service
- <b>StopCodeDescription</b>: Detailed incident category description with further detail for special services incident categories. These include AFA (Automatic Fire Alarm), Chimney Fire, False alarm - Good intent, False alarm – Malicious, Flood call attended - Batch mobilised, Late Call, Primary Fire, Secondary Fire, Special Service and Use of Special Operations Room.
- <b>PropertyCategory</b>: High level property descriptor
- <b>PropertyType</b>: Detailed property descriptor
- <b>AddressQualifier</b>: Qualifies location of actual incident relevant to category above
- <b>Latitude</b>: Geographic data
- <b>Longitude</b>: Geographic data
- <b>IncidentStationGround</b>: LFB Station ground

The selected variables include the following; data that will be available for every instance of a future fire incident call received by LBF prior to dispatching help, labelled data required for the classification process using supervised learning methods, time series data and geographic data for each incident. The geographic data used is rounded to the closes 50 value for privacy reasons so as to not explicitly identify dwellings within the dataset. 

The geographic data is used to combine the incident data with the income and housing data that is available at the MSOA (Medium Super Output Area) level.


In [None]:
# Load the MSOA GeoData
url_msoas = "https://github.com/kav-sekar/fsds_data_brief/blob/main/clean_data/London_MSOAs.gpkg?raw=true"
msoas = gpd.read_file(url_msoas, driver='GPKG')
msoas = msoas.drop(columns=['OBJECTID','MSOA11NM','BNG_E','BNG_N','Borough','msoa11hclnm'])
msoas.head()

In [None]:
pdf = incidentData[['DateOfCall','TimeOfCall','IncidentGroup','StopCodeDescription','PropertyCategory','PropertyType','AddressQualifier','Latitude','Longitude','IncidentStationGround']].copy()
pdf.drop(pdf[((pdf.Latitude.isna())|(pdf.Longitude.isna()))].index, axis=0, inplace=True)
pdf.drop(pdf[((pdf.Latitude < 40)|(pdf.Longitude > 1))].index, axis=0, inplace=True)

In [None]:
# Creating geodata frame with airbnb listings
gdf = gpd.GeoDataFrame(pdf, geometry=gpd.points_from_xy(pdf['Longitude'], pdf['Latitude'], crs='epsg:4326'))
gdf = gdf.to_crs('epsg:27700')
#gdf.plot(column='CalYear', cmap='plasma', scheme='quantiles', k=10, markersize=1, figsize=(8,6));

In [None]:
# Aggregate incidents by MSOA
geoIncident = gpd.sjoin(gdf, msoas, op='within').drop(columns=['index_right'])
geoIncident.head(5)

In [None]:
fireData = pd.merge(geoIncident, soceconomicData, left_on="MSOA11CD", right_on="MSOA code",how="inner")
fireData = fireData.drop(columns = "MSOA11CD",)
fireData = fireData.set_geometry('geometry')
fireData.head(5)

In [None]:
fireDataNoGeo = fireData.drop(columns = 'geometry')
fireDataNoGeo.profile_report()

# 5. Methodology

## 5.1. Data validation and cleaning

## 5.2. Data pre-processing

## 5.3. Comprehensive analysis

# Results and Discussion

# Conclusion