In [55]:
import numpy as np
import pandas as pd
from datetime import date, datetime

In [70]:
# Year of the wildfire CSV loaded below; also passed to get_season() so the
# day-of-year season boundaries are computed for the matching calendar year.
YEAR = 1992

# Wider column selection. NOTE(review): no cell visible in this notebook reads
# COLS_NEEDED; only COLS_NEEDED_FOR_CLASSIFICATION is used -- confirm before removing.
COLS_NEEDED = [
    "FIRE_YEAR",
    "DISCOVERY_DATE",
    "DISCOVERY_DOY",
    "DISCOVERY_TIME",
    "STAT_CAUSE_CODE",
    "STAT_CAUSE_DESCR",
    "CONT_DATE",
    "CONT_DOY",
    "CONT_TIME",
    "FIRE_SIZE",
    "FIRE_SIZE_CLASS",
    "LATITUDE",
    "LONGITUDE",
    "STATE",
    "COUNTY"
]

# Columns kept when the yearly CSV is loaded for the classifier cells below.
COLS_NEEDED_FOR_CLASSIFICATION = [
    "DISCOVERY_DATE",
    "DISCOVERY_DOY",
    "DISCOVERY_TIME",
    "STAT_CAUSE_CODE",
    "CONT_DATE",
    "CONT_TIME",
    "FIRE_SIZE_CLASS",
    "STATE"
]

In [71]:
# Parse only the columns the classifier uses instead of loading every column
# and discarding most of them afterwards. low_memory=False makes pandas infer
# each column's dtype from the whole file rather than chunk by chunk.
with open('./wildfires_%d.csv' % YEAR) as csvfile:
    df = pd.read_csv(csvfile, usecols=COLS_NEEDED_FOR_CLASSIFICATION, low_memory=False)
# usecols keeps the file's column order, so re-select to get the listed order;
# .copy() detaches the result so later column assignments act on an independent frame.
df = df[COLS_NEEDED_FOR_CLASSIFICATION].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [72]:
def get_season(day, Y):
    """Return the season name for a day-of-year number within year ``Y``.

    ``day`` is compared against the day-of-year of fixed calendar boundaries
    (Mar 21, Jun 21, Sep 23, Dec 21) evaluated for year ``Y``, so leap years
    shift the numeric boundaries by one. Winter covers both the start and the
    end of the year. Returns ``None`` when ``day`` falls in no range.
    """
    def doy(month, day_of_month):
        # Day-of-year number of the given calendar date in year Y.
        return date(Y, month, day_of_month).timetuple().tm_yday

    spans = (
        ('winter', doy(1, 1), doy(3, 20)),
        ('spring', doy(3, 21), doy(6, 20)),
        ('summer', doy(6, 21), doy(9, 22)),
        ('autumn', doy(9, 23), doy(12, 20)),
        ('winter', doy(12, 21), doy(12, 31)),
    )
    matches = (label for label, first, last in spans if first <= day <= last)
    return next(matches, None)

def julian_to_datetime(jd_series):
    """Convert Julian date number(s) to pandas datetimes.

    The Julian date of the Unix epoch is subtracted so the remainder is a
    count of days since 1970-01-01, which ``pd.to_datetime`` can interpret
    with ``unit='D'``. Accepts a scalar or a Series.
    """
    # https://www.kaggle.com/rtatman/188-million-us-wildfires/discussion/39627#222290
    unix_epoch = pd.to_datetime(0, unit='s')
    days_since_epoch = jd_series - unix_epoch.to_julian_date()
    return pd.to_datetime(days_since_epoch, unit='D')

def difference_in_seconds():
    """Return containment time minus discovery time for the global ``df``.

    Both columns are parsed with ``pd.to_datetime`` and subtracted, so the
    result is a timedelta Series; despite the function name it is not
    converted to a number of seconds.
    """
    discovered, contained = df['DISCOVERY_TIME'], df['CONT_TIME']
    elapsed = pd.to_datetime(contained) - pd.to_datetime(discovered)
    return elapsed

def num_to_time(num_time):
    """Convert a numeric 24-hour clock reading into an ``"HH:MM"`` string.

    Args:
        num_time: Clock reading encoded as HHMM (0 to 2359); may be an int or
            a float such as ``1430.0``. It is truncated with ``int()``.

    Returns:
        The time as a string with hours and minutes both zero-padded to two
        digits, e.g. ``5 -> "00:05"``, ``130 -> "01:30"``, ``1430 -> "14:30"``.

    Raises:
        ValueError: if ``num_time`` is NaN (``int()`` cannot convert it).
    """
    # The last two decimal digits are the minutes, everything above is hours.
    # Values outside 0-2359 are not validated; they are formatted as-is.
    hours, minutes = divmod(int(num_time), 100)
    return "%02d:%02d" % (hours, minutes)

In [73]:
# Clean
# Drop rows missing any field the derived columns depend on. .copy() gives an
# independent frame so the assignments below do not write into a slice.
df = df.dropna(subset=['CONT_DATE', 'DISCOVERY_DATE', 'CONT_TIME', 'DISCOVERY_TIME', 'DISCOVERY_DOY', 'STAT_CAUSE_CODE']).copy()
# Each transform reads a single column, so map over that column instead of
# building a row object per record with DataFrame.apply(axis=1).
df['CONT_TIME'] = df['CONT_TIME'].map(num_to_time)
df['DISCOVERY_TIME'] = df['DISCOVERY_TIME'].map(num_to_time)
# julian_to_datetime works on a whole Series, so convert each column in one call.
df['CONT_DATE'] = julian_to_datetime(df['CONT_DATE'])
df['DISCOVERY_DATE'] = julian_to_datetime(df['DISCOVERY_DATE'])
df['season'] = df['DISCOVERY_DOY'].map(lambda doy: get_season(doy, YEAR))

In [74]:
# Join each calendar date with its "HH:MM" clock string, parse the result as a
# full timestamp, and take containment minus discovery as the fire's duration.
full_disc_date = pd.to_datetime(
    df['DISCOVERY_DATE'].dt.strftime("%Y-%m-%d").str.cat(df['DISCOVERY_TIME'], sep=' ')
)
full_cont_date = pd.to_datetime(
    df['CONT_DATE'].dt.strftime("%Y-%m-%d").str.cat(df['CONT_TIME'], sep=' ')
)
df['duration'] = full_cont_date - full_disc_date

# with open("./wildfires_dates.csv", "w") as csvfile:
#     df.to_csv(csvfile,
#               index=False, 
#               columns=['FIRE_SIZE', 'DISCOVERY_DATE', 'LATITUDE', 'LONGITUDE', 'duration'])

In [75]:
# Two-letter codes accepted in the STATE column (the list includes DC and PR).
US_STATE_CODES = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

# Season names; a season's position in this list is its integer feature value.
SEASONS = ['winter','spring','summer','autumn']
# Cause codes 1 through 13 inclusive.
STAT_CAUSE_CODES = list(range(1,14))

# Region name -> state codes belonging to that region; used by get_region().
# https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States
REGIONS = {
    'northeast': ['CT','ME','MA','NH','RI','VT','NJ','NY','PA'],
    'midwest': ['IL','IN','MI','OH','WI','IA','KS','MN','MO','NE','ND','SD'],
    'west': ['AZ','CO','ID','MT','NV','NM','UT','WY','AK','CA','HI','OR','WA'],
    'south': ['DE','FL','GA','MD','NC','SC','VA','DC','WV','AL','KY','MS','TN','AR','LA','OK','TX', 'PR']
}

def get_region(state_code):
    """Return the REGIONS key whose state list contains ``state_code``.

    A code found in no region is printed (so unmapped values show up in the
    cell output) and ``None`` is returned.
    """
    for region_name, member_states in REGIONS.items():
        if state_code in member_states:
            return region_name
    print(state_code)
    return None

# Only STATE is needed, so map over that column instead of materialising a
# row object per record with DataFrame.apply(axis=1).
df['region'] = df['STATE'].map(get_region)

In [99]:
# TODO: Create classifier
# - extract features for each row
# - create labels for each row (this should just be getting the fire class size and mapping it to an integer)
# - split data into train + test sets (shuffle data first?)
# - train data using a model (need to look this up)
# - test for accuracy

# Fire size class letters 'A' through 'G'
label_names = [chr(code) for code in range(ord('A'), ord('G') + 1)]

# features encoded per row: season, cause code, region
# ('duration' is selected into feature_df but not encoded yet)

def extract_features(row):
    """Encode one row as ``[season index, cause code, region index]``.

    Season and region are replaced by their position in SEASONS and in the
    key order of REGIONS; the cause code is cast to ``int``.
    """
    season_idx = SEASONS.index(row.season)
    cause_code = int(row.STAT_CAUSE_CODE)
    region_idx = list(REGIONS.keys()).index(row.region)
    return [season_idx, cause_code, region_idx]

feature_df = df[['STAT_CAUSE_CODE', 'season', 'region', 'duration']]
X = [extract_features(record) for _idx, record in feature_df.iterrows()]
# Labels: size class letter -> 0-based integer ('A' -> 0, 'B' -> 1, ...)
y = df['FIRE_SIZE_CLASS'].map(lambda letter: ord(letter) - ord('A'))

In [105]:
# https://stackoverflow.com/a/4602224/8109239
def unison_shuffled_copies(a, b, seed=None):
    """Shuffle two equal-length sequences with the same random permutation.

    Args:
        a, b: Sequences of equal length (anything ``np.asarray`` accepts).
        seed: Optional integer seed. When given, the permutation comes from a
            dedicated ``np.random.RandomState(seed)`` so the shuffle (and the
            train/test split built on it) is reproducible. When ``None`` the
            global NumPy random state is used, as before.

    Returns:
        ``(a_shuffled, b_shuffled)`` as NumPy arrays; element ``i`` of one
        still corresponds to element ``i`` of the other.

    Raises:
        AssertionError: if the two inputs differ in length.
    """
    assert len(a) == len(b), "a and b must have the same length"
    rng = np.random if seed is None else np.random.RandomState(seed)
    p = rng.permutation(len(a))
    return np.asarray(a)[p], np.asarray(b)[p]

# Rebinds X and y to shuffled NumPy arrays; re-running this cell shuffles the
# already-shuffled data again, so the split below changes on every run.
X,y = unison_shuffled_copies(X,y)

In [106]:
# Split data into training and test sets
# The first TRAIN_FRAC of the (already shuffled) rows train the model,
# the remainder is held out for scoring.
TRAIN_FRAC = 0.7
TRAIN_SIZE = round(len(X) * TRAIN_FRAC)
X_train, X_test = X[:TRAIN_SIZE], X[TRAIN_SIZE:]
y_train, y_test = y[:TRAIN_SIZE], y[TRAIN_SIZE:]

# Naive Bayes is chosen on the assumption that the features are independent
# NOTE(review): the features are category indices, not counts -- confirm that
# MultinomialNB is the intended variant.
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.582755111481


In [None]:
# Save relevant data used for classification
# for year in range(1992,2016):
#     with open("./wildfires_%d.csv" % year) as csvtoread:
#         df = pd.read_csv(csvtoread)
#         with open("./csvsForClassification/trimmed_wildfires_%d.csv" % year, "w") as csvtowrite:
#             df.to_csv(csvtowrite, index=False)


In [56]:
class WildfireClassifier(object):
    """Naive Bayes fire-size classifier for one year's wildfire CSV.

    Construct with a CSV path and the year it covers, call ``clean()`` to
    derive the ``season``, ``natural_cause``, ``duration`` and ``region``
    columns, then call ``test``, ``test2`` or ``test3`` to fit a
    ``MultinomialNB`` model on a shuffled train/test split and get its accuracy.
    The ``test*`` methods and ``save_cleaned_data`` run ``clean()`` themselves
    if it has not been run yet.
    """

    # Columns kept from the raw CSV; clean() drops rows missing any of them.
    COLS = [
        "DISCOVERY_DATE",
        "DISCOVERY_DOY",
        "DISCOVERY_TIME",
        "STAT_CAUSE_CODE",
        "STAT_CAUSE_DESCR",
        "CONT_DATE",
        "CONT_TIME",
        "FIRE_SIZE_CLASS",
        "FIRE_SIZE",
        "STATE"
    ]

    # Region name -> state codes belonging to that region.
    # https://en.wikipedia.org/wiki/List_of_regions_of_the_United_States
    REGIONS = {
        'northeast': ['CT','ME','MA','NH','RI','VT','NJ','NY','PA'],
        'midwest': ['IL','IN','MI','OH','WI','IA','KS','MN','MO','NE','ND','SD'],
        'west': ['AZ','CO','ID','MT','NV','NM','UT','WY','AK','CA','HI','OR','WA'],
        'south': ['DE','FL','GA','MD','NC','SC','VA','DC','WV','AL','KY','MS','TN','AR','LA','OK','TX', 'PR']
    }

    US_STATE_CODES = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'PR', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']
    # A season's position in this list is its integer feature value.
    SEASONS = ['winter','spring','summer','autumn']
    STAT_CAUSE_CODES = list(range(1,14))

    # Columns of the cleaned frame handed to the per-row feature encoder.
    _FEATURE_COLS = ['FIRE_SIZE_CLASS', 'season', 'region', 'duration',
                     'STAT_CAUSE_CODE', 'natural_cause', 'STATE']

    def __init__(self, fname, year):
        """Load ``fname`` and keep only the columns listed in ``COLS``.

        Args:
            fname: Path of the wildfire CSV to read.
            year: Calendar year the file covers; used by ``clean()`` to place
                the season boundaries.
        """
        self.cleaned = False
        self.year = year
        with open(fname) as csvfile:
            self.df = pd.read_csv(csvfile)
            self.df = self.df[WildfireClassifier.COLS]

    def get_season(self, day, Y):
        """Return the season name for day-of-year ``day`` in year ``Y``.

        Boundaries are fixed calendar dates converted to day-of-year numbers
        for ``Y`` (so leap years shift them). Returns ``None`` if ``day``
        falls in no range.
        """
        seasons = [('winter', (date(Y,  1,  1),  date(Y,  3, 20))),
                   ('spring', (date(Y,  3, 21),  date(Y,  6, 20))),
                   ('summer', (date(Y,  6, 21),  date(Y,  9, 22))),
                   ('autumn', (date(Y,  9, 23),  date(Y, 12, 20))),
                   ('winter', (date(Y, 12, 21),  date(Y, 12, 31)))]
        for name, (start, end) in seasons:
            if start.timetuple().tm_yday <= day <= end.timetuple().tm_yday:
                return name
        return None

    def julian_to_datetime(self, jd_series):
        """Convert Julian date number(s) (scalar or Series) to datetimes."""
        # https://www.kaggle.com/rtatman/188-million-us-wildfires/discussion/39627#222290
        epoch = pd.to_datetime(0, unit='s').to_julian_date()
        return pd.to_datetime(jd_series - epoch, unit='D')

    def difference_in_seconds(self):
        """Return parsed CONT_TIME minus parsed DISCOVERY_TIME.

        The result is a timedelta Series; despite the name it is not
        converted to a number of seconds.
        """
        start_times = self.df['DISCOVERY_TIME']
        end_times = self.df['CONT_TIME']
        return pd.to_datetime(end_times) - pd.to_datetime(start_times)

    def num_to_time(self, num_time):
        """Convert an HHMM number (0 to 2359) to a zero-padded "HH:MM" string.

        Raises:
            ValueError: if ``num_time`` is NaN.
        """
        # Last two decimal digits are minutes, the rest is hours.
        hours, minutes = divmod(int(num_time), 100)
        return "%02d:%02d" % (hours, minutes)

    @staticmethod
    def _get_region(state_code):
        """Return the region containing ``state_code``; print it and return None if unmapped."""
        for region, states in WildfireClassifier.REGIONS.items():
            if state_code in states:
                return region
        print(state_code)
        return None

    def clean(self):
        """Drop incomplete rows and add the derived columns (runs once).

        Adds ``season``, ``natural_cause``, ``duration`` and ``region`` and
        converts the date/time columns; sets ``self.cleaned`` so later calls
        are no-ops.
        """
        if self.cleaned:
            return
        # .copy() detaches the filtered frame so the assignments below act on
        # an independent object rather than a slice of the raw data.
        df = self.df.dropna(subset=WildfireClassifier.COLS).copy()
        # Each transform reads one column, so map over the column directly
        # instead of building a row object per record.
        df['CONT_TIME'] = df['CONT_TIME'].map(self.num_to_time)
        df['DISCOVERY_TIME'] = df['DISCOVERY_TIME'].map(self.num_to_time)
        df['CONT_DATE'] = self.julian_to_datetime(df['CONT_DATE'])
        df['DISCOVERY_DATE'] = self.julian_to_datetime(df['DISCOVERY_DATE'])
        df['season'] = df['DISCOVERY_DOY'].map(lambda doy: self.get_season(doy, self.year))
        df['natural_cause'] = df['STAT_CAUSE_DESCR'] == 'Lightning'
        full_disc_date = pd.to_datetime(df['DISCOVERY_DATE'].dt.strftime("%Y-%m-%d") + ' ' + df['DISCOVERY_TIME'])
        full_cont_date = pd.to_datetime(df['CONT_DATE'].dt.strftime("%Y-%m-%d") + ' ' + df['CONT_TIME'])
        df['duration'] = full_cont_date - full_disc_date
        df['region'] = df['STATE'].map(WildfireClassifier._get_region)

        self.df = df
        self.cleaned = True

    @staticmethod
    def _size_class_index(fire_class):
        """Map a size class letter to a 0-based index ('A' -> 0, 'B' -> 1, ...)."""
        return ord(fire_class) - ord('A')

    @staticmethod
    def _size_class_bucket(fire_class):
        """Map a size class letter to a coarse bucket: A-C -> 0, D-F -> 1, above F -> 2."""
        if ord(fire_class) <= ord('C'):
            return 0
        if ord(fire_class) <= ord('F'):
            return 1
        return 2

    @staticmethod
    def _encode_row(row, cause_field, region_names):
        """Encode one row as [season index, value of ``cause_field``, region index]."""
        return [WildfireClassifier.SEASONS.index(row.season),
                row[cause_field],
                region_names.index(row.region)]

    def _fit_and_predict(self, train_frac, cause_field, encode_label):
        """Shared pipeline behind test/test2/test3.

        Encodes features and labels, shuffles both with one permutation, uses
        the first ``train_frac`` of the rows to fit a ``MultinomialNB`` and
        predicts the rest. Prints the training-set size.

        Args:
            train_frac: Fraction of rows used for training.
            cause_field: Column supplying the cause feature.
            encode_label: Callable mapping a FIRE_SIZE_CLASS letter to an int label.

        Returns:
            ``(y_test, y_pred)`` for the held-out rows.
        """
        from sklearn.naive_bayes import MultinomialNB

        # The derived columns only exist after clean().
        if not self.cleaned:
            self.clean()

        region_names = list(WildfireClassifier.REGIONS.keys())
        feature_df = self.df[WildfireClassifier._FEATURE_COLS]
        X = [self._encode_row(row, cause_field, region_names)
             for _, row in feature_df.iterrows()]
        y = self.df['FIRE_SIZE_CLASS'].map(encode_label)

        # Shuffle features and labels in unison.
        # https://stackoverflow.com/a/4602224/8109239
        assert len(X) == len(y)
        order = np.random.permutation(len(X))
        X, y = np.asarray(X)[order], np.asarray(y)[order]

        # Split data into training and test sets
        train_size = round(train_frac * len(X))
        print("TRAIN_SIZE", train_size)
        X_train, y_train = X[:train_size], y[:train_size]
        X_test, y_test = X[train_size:], y[train_size:]

        # Use naive bayes because features are independent of each other
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
        return y_test, clf.predict(X_test)

    def test(self, train_frac):
        """Features: season, cause code, region. Labels: size class index.

        Prints the confusion matrix and returns the accuracy on the held-out rows.
        """
        from sklearn import metrics
        y_test, y_pred = self._fit_and_predict(
            train_frac, 'STAT_CAUSE_CODE', self._size_class_index)
        print(metrics.confusion_matrix(y_test, y_pred))
        return metrics.accuracy_score(y_test, y_pred)

    def test2(self, train_frac):
        """Like ``test`` but the cause feature is the boolean ``natural_cause`` flag."""
        from sklearn import metrics
        y_test, y_pred = self._fit_and_predict(
            train_frac, 'natural_cause', self._size_class_index)
        print(metrics.confusion_matrix(y_test, y_pred))
        return metrics.accuracy_score(y_test, y_pred)

    def test3(self, train_frac):
        """Like ``test`` but labels are the three coarse size buckets.

        Prints the cause code/description pairs instead of a confusion matrix
        and returns the accuracy on the held-out rows.
        """
        from sklearn import metrics
        y_test, y_pred = self._fit_and_predict(
            train_frac, 'STAT_CAUSE_CODE', self._size_class_bucket)
        self.get_cause_labels()
        return metrics.accuracy_score(y_test, y_pred)

    def save_cleaned_data(self, path):
        """Write the season, region, cause code and duration columns to ``path`` as CSV."""
        if not self.cleaned:
            self.clean()
        # A list of labels selects columns; a bare tuple would be looked up as
        # one (non-existent) column key.
        df = self.df[['season', 'region', 'STAT_CAUSE_CODE', 'duration']]
        df.to_csv(path, index=False)

    def get_cause_labels(self):
        """Print each distinct (STAT_CAUSE_CODE, STAT_CAUSE_DESCR) pair."""
        df = self.df[['STAT_CAUSE_CODE', 'STAT_CAUSE_DESCR']]
        # DataFrame has no unique(); drop_duplicates() yields the distinct rows.
        print(df.drop_duplicates())
    
# The year argument positions the season boundaries in clean(), so it should
# match the year of the CSV being loaded (1992, not 2000).
clf_1992 = WildfireClassifier('./wildfires_1992.csv', 1992)
clf_1992.clean()

  if self.run_code(code, result):


In [16]:
# Train on 60% of the shuffled rows. test() itself prints TRAIN_SIZE and the
# confusion matrix; the print here shows the accuracy it returns.
print(clf_1992.test(.6))
# print(clf_1992.test2(.6))
# print(clf_1992.test3(.6))

TRAIN_SIZE 21619
[[4027 2068    0    0    0    0    0]
 [1862 4442    0    0    0    0    0]
 [ 369 1281    0    0    0    0    0]
 [  63  124    0    0    0    0    0]
 [  50   53    0    0    0    0    0]
 [  37   18    0    0    0    0    0]
 [  13    5    0    0    0    0    0]]
0.587635303913


In [23]:
# Show each distinct (description, code) pairing present in the cleaned 1992 data.
clf_1992.df[['STAT_CAUSE_DESCR','STAT_CAUSE_CODE']].drop_duplicates()

Unnamed: 0,STAT_CAUSE_DESCR,STAT_CAUSE_CODE
0,Lightning,1.0
2,Campfire,4.0
16,Equipment Use,2.0
21,Debris Burning,5.0
23,Smoking,3.0
24,Arson,7.0
96,Miscellaneous,9.0
683,Railroad,6.0
759,Children,8.0
11452,Structure,12.0


In [57]:
# Fit and score one classifier per yearly CSV (1992 through 2015 inclusive),
# collecting [year, accuracy] pairs. Each test() call also prints its
# TRAIN_SIZE and confusion matrix.
# Note: `clf` here rebinds the name that the earlier train/score cell uses for
# its MultinomialNB model.
result = []
for year in range(1992,2016):
    clf = WildfireClassifier('./wildfires_%d.csv' % year, year)
#     clf.train()
    clf.clean()
    # 80% of each year's rows are used for training.
    result.append([year, clf.test(0.8)])
print(result)

    

  if self.run_code(code, result):


TRAIN_SIZE 28825
[[2014  969    0    0    0    0    0]
 [1016 2230    0    0    0    0    0]
 [ 169  630    0    0    0    0    0]
 [  35   63    0    0    0    0    0]
 [  24   19    0    0    0    0    0]
 [  14   12    0    0    0    0    0]
 [   9    2    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 26813
[[ 885 1529    0    0    0    0    0]
 [ 594 2720    0    0    0    0    0]
 [  97  708    0    0    0    0    0]
 [  14   79    0    0    0    0    0]
 [   7   29    0    0    0    0    0]
 [  17   10    0    0    0    0    0]
 [   5    9    0    0    0    0    0]]
TRAIN_SIZE 32772
[[2165 1176    0    0    0    0    0]
 [1289 2281    0    0    0    0    0]
 [ 269  667    0    0    0    0    0]
 [  84   80    0    0    0    0    0]
 [  64   35    0    0    0    0    0]
 [  43   15    0    0    0    0    0]
 [  23    2    0    0    0    0    0]]
TRAIN_SIZE 29474
[[1283 1313    0    0    0    0    0]
 [ 886 2722    0    0    0    0    0]
 [ 219  718    0    0    0    0    0]
 [  38   76    0    0    0    0    0]
 [  25   39    0    0    0    0    0]
 [  21   14    0    0    0    0    0]
 [  10    5    0    0    0    0    0]]
TRAIN_SIZE 33466
[[1886 1234    0    0    0    0    0]
 [1218 2655    0    0    0    0    0]
 [ 227  791    0    0    0    0    0]
 [  62  108    0 

  if self.run_code(code, result):


TRAIN_SIZE 20697
[[1408  821    0    0    0    0    0]
 [ 887 1422    0    0    0    0    0]
 [ 199  326    0    0    0    0    0]
 [  27   34    0    0    0    0    0]
 [  20   12    0    0    0    0    0]
 [   8    4    0    0    0    0    0]
 [   4    2    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 19427
[[1425  744    0    0    0    0    0]
 [ 855 1187    0    0    0    0    0]
 [ 199  287    0    0    0    0    0]
 [  41   36    0    0    0    0    0]
 [  34   10    0    0    0    0    0]
 [  21    8    0    0    0    0    0]
 [   8    2    0    0    0    0    0]]
TRAIN_SIZE 23589
[[1532  899    0    0    0    0    0]
 [1069 1428    0    0    0    0    0]
 [ 289  406    0    0    0    0    0]
 [  64   57    0    0    0    0    0]
 [  55   27    0    0    0    0    0]
 [  38   13    0    0    0    0    0]
 [  17    3    0    0    0    0    0]]
TRAIN_SIZE 30087
[[2060 1213    0    0    0    0    0]
 [1381 1947    0    0    0    0    0]
 [ 248  371    0    0    0    0    0]
 [  64   57    0    0    0    0    0]
 [  52   29    0    0    0    0    0]
 [  39   13    0    0    0    0    0]
 [  37   11    0    0    0    0    0]]
TRAIN_SIZE 29607
[[2138 1215    0    0    0    0    0]
 [1444 1870    0    0    0    0    0]
 [ 234  286    0    0    0    0    0]
 [  61   39    0 

  if self.run_code(code, result):


TRAIN_SIZE 28286
[[1942 1312    0    0    0    0    0]
 [1121 1936    0    0    0    0    0]
 [ 212  355    0    0    0    0    0]
 [  40   47    0    0    0    0    0]
 [  24   21    0    0    0    0    0]
 [  20   14    0    0    0    0    0]
 [  25    3    0    0    0    0    0]]
TRAIN_SIZE 25471
[[2405  924    0    0    0    0    0]
 [1176 1192    0    0    0    0    0]
 [ 250  194    0    0    0    0    0]
 [  54   47    0    0    0    0    0]
 [  38   22    0    0    0    0    0]
 [  34   12    0    0    0    0    0]
 [  18    2    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 22231
[[2259  753    0    0    0    0    0]
 [ 998  938    0    0    0    0    0]
 [ 198  272    0    0    0    0    0]
 [  30   33    0    0    0    0    0]
 [  15   19    0    0    0    0    0]
 [  17    3    0    0    0    0    0]
 [  21    2    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 26642
[[2539  438    0    0    0    0    0]
 [1802  950    0    0    0    0    0]
 [ 326  297    0    0    0    0    0]
 [  90   55    0    0    0    0    0]
 [  56   23    0    0    0    0    0]
 [  37    9    0    0    0    0    0]
 [  36    2    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 30328
[[2229 1241    0    0    0    0    0]
 [1355 1621    0    0    0    0    0]
 [ 394  376    0    0    0    0    0]
 [  89   63    0    0    0    0    0]
 [  54   40    0    0    0    0    0]
 [  55   24    0    0    0    0    0]
 [  32    9    0    0    0    0    0]]
TRAIN_SIZE 28488
[[2725  590    0    0    0    0    0]
 [1839  967    0    0    0    0    0]
 [ 310  420    0    0    0    0    0]
 [  64   64    0    0    0    0    0]
 [  46   22    0    0    0    0    0]
 [  32   13    0    0    0    0    0]
 [  29    1    0    0    0    0    0]]
TRAIN_SIZE 23106
[[1858  786    0    0    0    0    0]
 [1246 1134    0    0    0    0    0]
 [ 274  264    0    0    0    0    0]
 [  42   42    0    0    0    0    0]
 [  25   20    0    0    0    0    0]
 [  29   25    0    0    0    0    0]
 [  21   10    0    0    0    0    0]]


  if self.run_code(code, result):


TRAIN_SIZE 26587
[[2281  763    0    0    0    0    0]
 [1267 1321    0    0    0    0    0]
 [ 230  482    0    0    0    0    0]
 [  62   81    0    0    0    0    0]
 [  45   43    0    0    0    0    0]
 [  23   26    0    0    0    0    0]
 [  15    8    0    0    0    0    0]]
TRAIN_SIZE 23578
[[1315 1141    0    0    0    0    0]
 [ 894 1615    0    0    0    0    0]
 [ 261  428    0    0    0    0    0]
 [  52   68    0    0    0    0    0]
 [  24   42    0    0    0    0    0]
 [  26   17    0    0    0    0    0]
 [  11    1    0    0    0    0    0]]
TRAIN_SIZE 46797
[[1028 2969    0    0    0    0    0]
 [ 617 5039    0    0    0    0    0]
 [ 100 1414    0    0    0    0    0]
 [  14  225    0    0    0    0    0]
 [   5  134    0    0    0    0    0]
 [   3  100    0    0    0    0    0]
 [   1   50    0    0    0    0    0]]
TRAIN_SIZE 40771
[[1517 2588    0    0    0    0    0]
 [1295 3291    0    0    0    0    0]
 [ 264  821    0    0    0    0    0]
 [  46  119    0 

In [48]:
# Count fires per (cause, size class) pair over the full dataset and export it.
with open('./wildfires.csv') as csvfile:
    df = pd.read_csv(csvfile)

# One grouped pass replaces a full-frame boolean scan per distinct pair.
# sort=False keeps the pairs in order of first appearance, which is the row
# order the previous drop_duplicates()-driven loop produced.
# NOTE: rows with a missing cause or size class are left out by groupby; the
# loop emitted such pairs with a count of 0 because NaN never compares equal.
new_df = (
    df.groupby(['STAT_CAUSE_DESCR', 'FIRE_SIZE_CLASS'], sort=False)
      .size()
      .reset_index(name='count')
      .rename(columns={'STAT_CAUSE_DESCR': 'cause', 'FIRE_SIZE_CLASS': 'size_class'})
)
with open('./cause_to_size.csv', "w") as csvtowrite:
    new_df.to_csv(csvtowrite, index=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [49]:
# Count fires per (size class, state) pair over the full dataset and export it.
with open('./wildfires.csv') as csvfile:
    df = pd.read_csv(csvfile)

# One grouped pass replaces a full-frame boolean scan per distinct pair.
# sort=False keeps the pairs in order of first appearance, which is the row
# order the previous drop_duplicates()-driven loop produced.
# NOTE: rows with a missing size class or state are left out by groupby; the
# loop emitted such pairs with a count of 0 because NaN never compares equal.
new_df = (
    df.groupby(['FIRE_SIZE_CLASS', 'STATE'], sort=False)
      .size()
      .reset_index(name='count')
      .rename(columns={'FIRE_SIZE_CLASS': 'size_class', 'STATE': 'state'})
)
with open('./size_to_state.csv', "w") as csvtowrite:
    new_df.to_csv(csvtowrite, index=False)

  interactivity=interactivity, compiler=compiler, result=result)
