Possible final dataset

In [20]:
import pandas as pd
import numpy as np
import julian
import datetime as dt
import sqlite3

In [21]:
#Take a look at all the fires.  This may take some time.
con = sqlite3.connect("kaggle_dataset.sqlite")
df = pd.read_sql_query("SELECT * from Fires", con)
con.close();


In [22]:
size_cols = ['FIRE_SIZE_CLASS', 'DISCOVERY_DATE', 'DISCOVERY_TIME', 'CONT_DATE', 'CONT_TIME']
df = df.dropna(subset=size_cols) # remove rows where both of these are missing
df.shape

(892007, 39)

In [23]:
#Build up datasets 

In [24]:
#To make these dates and times easier to manage, let's convert them to datetime. We can add new columns DISCOVERY_DATETIME and CONTAINMENT_DATETIME.
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATE'];
df['CONT_DATETIME'] = df['CONT_DATE'];

In [25]:
#To populate those two rows, let's convert them into datetime.
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATETIME'].apply(lambda x: julian.from_jd(x, fmt="jd"))
df['CONT_DATETIME'] = df['CONT_DATETIME'].apply(lambda x:julian.from_jd(x, fmt="jd"))


In [26]:
#Let's also add the time 
temp_df = pd.DataFrame();
temp_df['dt'] = df['DISCOVERY_TIME'].apply(lambda x: dt.timedelta(hours=int(x[0:2]), minutes=int(x[2:5])))
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATETIME'] + temp_df['dt']
df['DISCOVERY_DATETIME'].head()

0   2005-02-02 13:00:00
1   2004-05-12 08:45:00
2   2004-05-31 19:21:00
3   2004-06-28 16:00:00
4   2004-06-28 16:00:00
Name: DISCOVERY_DATETIME, dtype: datetime64[ns]

In [27]:
#Do the same thing for CONT_DATETIME
temp_df = pd.DataFrame();
temp_df['dt'] = df['CONT_TIME'].apply(lambda x: dt.timedelta(hours=int(x[0:2]), minutes=int(x[2:5])))
df['CONT_DATETIME'] = df['CONT_DATETIME'] + temp_df['dt']
df['CONT_DATETIME'].head()

0   2005-02-02 17:30:00
1   2004-05-12 15:30:00
2   2004-05-31 20:24:00
3   2004-07-03 14:00:00
4   2004-07-03 12:00:00
Name: CONT_DATETIME, dtype: datetime64[ns]

In [28]:
df.to_csv("data_with_target.csv")
#Checkpoint to save the data with the above columns before reloading

In [32]:
df = pd.read_csv("data_with_target.csv")

In [33]:
df['CONT_DATETIME'] = df['CONT_DATETIME'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df['DISCOVERY_DATETIME'] = df['DISCOVERY_DATETIME'].apply(lambda x: dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
df['TIME_TO_CONT'] = 0;
df['TIME_TO_CONT'] = df['CONT_DATETIME'] - df['DISCOVERY_DATETIME']

In [34]:
#Save intermediate copy
df.to_pickle("data_with_target.pkl")

In [35]:
df.columns.values

array(['Unnamed: 0', 'OBJECTID', 'FOD_ID', 'FPA_ID', 'SOURCE_SYSTEM_TYPE',
       'SOURCE_SYSTEM', 'NWCG_REPORTING_AGENCY', 'NWCG_REPORTING_UNIT_ID',
       'NWCG_REPORTING_UNIT_NAME', 'SOURCE_REPORTING_UNIT',
       'SOURCE_REPORTING_UNIT_NAME', 'LOCAL_FIRE_REPORT_ID',
       'LOCAL_INCIDENT_ID', 'FIRE_CODE', 'FIRE_NAME',
       'ICS_209_INCIDENT_NUMBER', 'ICS_209_NAME', 'MTBS_ID',
       'MTBS_FIRE_NAME', 'COMPLEX_NAME', 'FIRE_YEAR', 'DISCOVERY_DATE',
       'DISCOVERY_DOY', 'DISCOVERY_TIME', 'STAT_CAUSE_CODE',
       'STAT_CAUSE_DESCR', 'CONT_DATE', 'CONT_DOY', 'CONT_TIME',
       'FIRE_SIZE', 'FIRE_SIZE_CLASS', 'LATITUDE', 'LONGITUDE',
       'OWNER_CODE', 'OWNER_DESCR', 'STATE', 'COUNTY', 'FIPS_CODE',
       'FIPS_NAME', 'Shape', 'DISCOVERY_DATETIME', 'CONT_DATETIME',
       'TIME_TO_CONT'], dtype=object)

In [40]:
#Let's take out only the features we can use.  Last column is the labels.

In [41]:
x = df[['DISCOVERY_DATETIME', 'STATE', 'TIME_TO_CONT', 'FIRE_YEAR', 'STAT_CAUSE_CODE', 'LATITUDE', 'LONGITUDE', 'FIRE_SIZE_CLASS']]

In [42]:
x.head()

Unnamed: 0,DISCOVERY_DATETIME,STATE,TIME_TO_CONT,FIRE_YEAR,STAT_CAUSE_CODE,LATITUDE,LONGITUDE,FIRE_SIZE_CLASS
0,2005-02-02 13:00:00,CA,0 days 04:30:00,2005,9.0,40.036944,-121.005833,A
1,2004-05-12 08:45:00,CA,0 days 06:45:00,2004,1.0,38.933056,-120.404444,A
2,2004-05-31 19:21:00,CA,0 days 01:03:00,2004,5.0,38.984167,-120.735556,A
3,2004-06-28 16:00:00,CA,4 days 22:00:00,2004,1.0,38.559167,-119.913333,A
4,2004-06-28 16:00:00,CA,4 days 20:00:00,2004,1.0,38.559167,-119.933056,A


In [43]:
#Convert discovery datetime to month of year
x['DISCOVERY_DATETIME'] = x['DISCOVERY_DATETIME'].apply(lambda x: x.month)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [45]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
st =  enc.fit_transform(np.array(x['STATE']).reshape(-1,1))
x['STATE'] = pd.DataFrame(st)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [46]:
x.head()

Unnamed: 0,DISCOVERY_DATETIME,STATE,TIME_TO_CONT,FIRE_YEAR,STAT_CAUSE_CODE,LATITUDE,LONGITUDE,FIRE_SIZE_CLASS
0,2,4.0,0 days 04:30:00,2005,9.0,40.036944,-121.005833,A
1,5,4.0,0 days 06:45:00,2004,1.0,38.933056,-120.404444,A
2,5,4.0,0 days 01:03:00,2004,5.0,38.984167,-120.735556,A
3,6,4.0,4 days 22:00:00,2004,1.0,38.559167,-119.913333,A
4,6,4.0,4 days 20:00:00,2004,1.0,38.559167,-119.933056,A


In [49]:
x['TIME_TO_CONT'] = x['TIME_TO_CONT'].apply(lambda x: x.total_seconds()/3600)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [50]:
x.head()

Unnamed: 0,DISCOVERY_DATETIME,STATE,TIME_TO_CONT,FIRE_YEAR,STAT_CAUSE_CODE,LATITUDE,LONGITUDE,FIRE_SIZE_CLASS
0,2,4.0,4.5,2005,9.0,40.036944,-121.005833,A
1,5,4.0,6.75,2004,1.0,38.933056,-120.404444,A
2,5,4.0,1.05,2004,5.0,38.984167,-120.735556,A
3,6,4.0,118.0,2004,1.0,38.559167,-119.913333,A
4,6,4.0,116.0,2004,1.0,38.559167,-119.933056,A


In [None]:
di = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7}
y = y.map(di)

Final dataset from this file: