In [2]:
import numpy as np
import pandas as pd
from datetime import date, datetime

In [3]:
YEAR = 1992
COLS_NEEDED = [
    "FIRE_YEAR",
    "DISCOVERY_DATE",
    "DISCOVERY_DOY",
    "DISCOVERY_TIME",
    "STAT_CAUSE_CODE",
    "STAT_CAUSE_DESCR",
    "CONT_DATE",
    "CONT_DOY",
    "CONT_TIME",
    "FIRE_SIZE",
    "FIRE_SIZE_CLASS",
    "LATITUDE",
    "LONGITUDE",
    "STATE",
    "COUNTY"
]

In [4]:
with open('./wildfires_%d.csv' % YEAR) as csvfile:
    df = pd.read_csv(csvfile)
    df = df[COLS_NEEDED]

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
def get_season(day, Y):
    seasons = [('winter', (date(Y,  1,  1),  date(Y,  3, 20))),
              ('spring', (date(Y,  3, 21),  date(Y,  6, 20))),
              ('summer', (date(Y,  6, 21),  date(Y,  9, 22))),
              ('autumn', (date(Y,  9, 23),  date(Y, 12, 20))),
              ('winter', (date(Y, 12, 21),  date(Y, 12, 31)))]
    for season in seasons:
        name = season[0]
        start, end = season[1]
        if (start.timetuple().tm_yday <= day <= end.timetuple().tm_yday):
            return name
    return None

def julian_to_datetime(jd_series):
    # https://www.kaggle.com/rtatman/188-million-us-wildfires/discussion/39627#222290
    epoch = pd.to_datetime(0, unit='s').to_julian_date()
    return pd.to_datetime(jd_series - epoch, unit='D')

def difference_in_seconds():
    start_times = df['DISCOVERY_TIME']
    end_times = df['CONT_TIME']
    return pd.to_datetime(end_times) - pd.to_datetime(start_times)

def num_to_time(num_time):
    # num_time is in 24-hour format - so 0 to 2359
    chars = list(str(int(num_time)))
    res = []
    for i in range(len(chars)-1, -1, -1):
      res.append(chars[i])
      if len(res) == 2:
        res.append(":")
    result = ''.join(res[::-1])
    if len(result) == 1: result = "00:0" + result
    if result[0] == ":": result = "00" + result
    return result

In [6]:
# Clean
df = df.dropna(subset=['CONT_DATE', 'DISCOVERY_DATE', 'CONT_TIME', 'DISCOVERY_TIME'])
df['CONT_TIME'] = df.apply(lambda row: num_to_time(row.CONT_TIME), axis=1)
df['DISCOVERY_TIME'] = df.apply(lambda row: num_to_time(row.DISCOVERY_TIME), axis=1)
df['CONT_DATE'] = df.apply(lambda row: julian_to_datetime(row.CONT_DATE), axis=1)
df['DISCOVERY_DATE'] = df.apply(lambda row: julian_to_datetime(row.DISCOVERY_DATE),axis=1)
df['season'] = df.apply(lambda row: get_season(row.DISCOVERY_DOY,YEAR), axis=1)

In [7]:
full_disc_date = pd.to_datetime(df['DISCOVERY_DATE'].dt.strftime("%Y-%m-%d") + ' ' + df['DISCOVERY_TIME'])
full_cont_date = pd.to_datetime(df['CONT_DATE'].dt.strftime("%Y-%m-%d") + ' ' + df['CONT_TIME'])
df['duration'] = full_cont_date - full_disc_date

In [46]:
US_STATE_CODES = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States
SEASONS = ['winter','spring','summer','autumn']
STAT_CAUSE_CODES = list(range(1,14))

REGIONS = {
    'northeast': ['CT','ME','MA','NH','RI','VT','NJ','NY','PA'],
    'midwest': ['IL','IN','MI','OH','WI','IA','KS','MN','MO','NE','ND','SD'],
    'west': ['AZ','CO','ID','MT','NV','NM','UT','WY','AK','CA','HI','OR','WA'],
    'south': ['DE','FL','GA','MD','NC','SC','VA','DC','WV','AL','KY','MS','TN','AR','LA','OK','TX']
}

def get_region(state_code):
    for region in REGIONS:
        if state_code in REGIONS[region]: return region
    return None

df['region'] = df.apply(lambda row: get_region(row.STATE), axis=1)

In [52]:
# TODO: Create classifier
# - extract features for each row
# - create labels for each row (this should just be getting the fire class size and mapping it to an integer)
# - split data into train + test sets (shuffle data first?)
# - train data using a model (need to look this up)
# - test for accuracy

label_names = [chr(ord('A') + i) for i in range(ord('G') - ord('A') + 1)]

# features: 'state', 'season', 'cause', 'duration' (maybe)

def extract_features(row):
    regions_list = list(REGIONS.keys())
    return [SEASONS.index(row.season), int(row.STAT_CAUSE_CODE), regions_list.index(row.region)]

In [70]:
feature_df = df[['STAT_CAUSE_CODE', 'season', 'region']]
X = [extract_features(row) for _,row in feature_df.iterrows()]
y = df['FIRE_SIZE_CLASS'].map(lambda fire_class: ord(fire_class) - ord('A'))

# https://stackoverflow.com/a/4602224/8109239
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return np.asarray(a)[p], np.asarray(b)[p]

X,y = unison_shuffled_copies(X,y)

In [71]:
# Split data into training and test sets
train_size = 15000
test_size = len(X) - train_size
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size+1:], y[train_size+1:]

In [72]:
# Use naive bayes because features are independent of each other
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(metrics.accuracy_score(y_test,y_pred))

0.595054683785


In [56]:
print(df['duration'])

0       0 days 02:40:00
1       0 days 23:00:00
2       2 days 02:30:00
3       0 days 00:10:00
4       0 days 00:30:00
5       0 days 03:00:00
6       0 days 01:57:00
7       0 days 01:30:00
8       0 days 18:00:00
9       1 days 01:29:00
10      0 days 07:59:00
11      0 days 05:00:00
12      0 days 03:00:00
13      0 days 06:15:00
14      0 days 03:29:00
15      0 days 04:58:00
16      0 days 01:20:00
17      0 days 19:00:00
18      0 days 06:59:00
19      0 days 03:30:00
20      0 days 23:00:00
21      0 days 02:30:00
22      1 days 00:30:00
23      0 days 02:00:00
24      0 days 01:45:00
25      0 days 01:25:00
26      1 days 03:45:00
27      0 days 12:00:00
28      0 days 15:00:00
29      0 days 07:30:00
              ...      
67868   0 days 00:19:00
67869   0 days 00:20:00
67870   0 days 00:45:00
67871   0 days 00:28:00
67872   0 days 01:04:00
67873   0 days 00:35:00
67874   0 days 00:14:00
67875   0 days 00:30:00
67876   0 days 05:30:00
67877   0 days 00:45:00
67878   0 days 0