In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mgrs
import sqlite3
import math
import mgrs
from itertools import product
import statistics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# filter out winter months where fires not likely
MONTHS_USED = ['April', 'May', 'June', 'July', 'August', 'September', 'October','November']

# limit to only the MGRS locations in Southern CA
MGRS_100KM_USED = ['11SMT','11SNT','11SNS','11SMS','11SLT']

# filter out class A fires, which are less than 1/4 acre
CLASSES_USED = ['B','C','D','E','F','G']

def missingByCol(df): #count missing values by column
    for col in df.columns:
        if df[col].isna().sum() != 0:
            print(col, df[col].isna().sum())

'''
Run classification on data set with chosen clfs for each combination of hypers
Store results in a sorted list
Print all results
'''
def clfRun(a_clf, data, clf_hyper={}):
  M, L, n_folds = data # unpack data container
  kf = KFold(n_splits=n_folds) # Establish the cross validation
  ret = {} # classic explication of results
 
  for ids, (train_index, test_index) in enumerate(kf.split(M, L)):
       
    clf = a_clf(**clf_hyper) # unpack parameters into clf if they exist
 
    clf.fit(M[train_index], L[train_index])
 
    pred = clf.predict(M[test_index])
 
    ret[ids]= {'clf': clf,
              'train_index': train_index,
              'test_index': test_index,
              'accuracy': accuracy_score(L[test_index], pred)}
 
  return ret
 
def clfHypers(clfsList):
    ret_hyper = dict()
    for clf in clfsList:
        clfString = str(clf)  # Check if values in clfsList are in clf_hyperict
        for k1, v1 in clf_hyper.items():  # go through first level of clf_hyperict
            if k1 in clfString:            # if clfString1 matches first level
                ret_hyper[clf] = [dict(zip(v1, s))
                                  for s in product(*v1.values())]
    return ret_hyper
 
def clfGridSearch(X, y, n_folds):
      # suppress future warnings
      with warnings.catch_warnings():
            # ignore all caught warnings
            warnings.filterwarnings("ignore")
 
      # n_folds = 5
 
      data = (X, y, n_folds)


      hyper_param_dict = clfHypers(clfsList) 
 
      # run grid search to find best hyper params
      clfsAccuracyDict = {}
      results = {}
      for clfs in clfsList:
            for i in hyper_param_dict[clfs]:
                 
                  clf_hyper = i
 
                  results = clfRun(clfs, data, clf_hyper)
 
                  for key in results:
                        k = results[key]['clf']
                        v = results[key]['accuracy']
 
                        kTest = str(k)
 
                        kTest = kTest.replace('         ', ' ')
                        kTest = kTest.replace('        ', ' ')
 
                        if kTest in clfsAccuracyDict:
                             
                              clfsAccuracyDict[kTest].append(v)
                        else:
                              clfsAccuracyDict[kTest] = [v]
           
      clfsAccuracyList_sorted = sorted(clfsAccuracyDict.items(), key = lambda item: statistics.mean(item[1]), reverse = True)
 
      for i in clfsAccuracyList_sorted:
            print('\nClassifier with Parameters:', i[0], '\nMeanAccuracy', statistics.mean(i[1]))

def clfResults(clf, X_train, y_train, X_test, y_test):
    X_test_scaled = scaler.transform(X_test)
    clfTmp = clf.fit(X_train, y_train) #fit model on training data
    y_pred = clfTmp.predict(X_test_scaled) #produce predictions (0/1) on test data
    y_pred_prob = clfTmp.predict_proba(X_test_scaled) #produce prediction probabilites (0 <= p <= 1) on test data
    y_pred_prob = y_pred_prob[:, 1] # only take second column.  first column is inverse prob
    tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel() #get confusion matrix on test data
    total_obs = (tp + tn + fp + fn)
   # print(tn, fp, fn, tp)
 
 
    clfAcc = (tp + tn) / (tp + tn + fp + fn) # accuracy of entire model
    clfPrecision = tp / (tp + fp) # precision is the ability of the classifier not to label as positive a sample that is negative
    clfRecall = tp / (tp + fn) # recall is the ability of the classifier to find all the positive samples
 
    # return(clfAcc, clfPrecision, clfRecall)
    print('-----------------------Confusion Matrix------------------------'
        , '\n  Predicted|--------------------Actual------------------------'
        , '\n           |        Yes            |         No                |   Total   '
        , '\n--------------------------------------------------------------'
        , '\n        Yes|        '+ str(tp)+'            |        '+str(fp)+'               |    '+str(tp+fp)
        , '\n--------------------------------------------------------------'
        , '\n         No|        '+ str(fn)+'             |        '+str(tn)+'              |    '+str(fn+tn)
        , '\n--------------------------------------------------------------'
        , '\n      Total|        '+ str(tp+fn)+'            |        '+str(fp+tn)+'              |    '+str(total_obs)
        , '\n'
        )
    # print('Test Total Obs: ', total_obs)
    print('Accuracy: ', np.round(clfAcc,6), '\nPrecision: ', np.round(clfPrecision,6), '\nRecall: ', np.round(clfRecall,6))

Import Data: Need to copy and paste sqllite file from Kaggle into local drive path.
FPA_FOD_20170508.sqlite https://www.kaggle.com/rtatman/188-million-us-wildfires             
        
Metadata:   
Fires - Target data - Kaggle        
Time - Calendar data - Omits Dec through March (non fire season)      
Location - MGRS - Military Grid Ref. Sys. with lat/lon                      
Weather - MeteoMatics - Precip, Temp, sunshine, and drought data

Get Fire Data

In [4]:
conn = sqlite3.connect('C:/Users/balso/Downloads/FPA_FOD_20170508.sqlite')
ca_fires = pd.read_sql_query('select * from fires where State = "CA";', conn) # 

m = mgrs.MGRS() #mgrs API
to_mgrs_10km = []  #precision 1
to_mgrs_100km = []  #precision 0
to_mgrs_lat_10km = []
to_mgrs_lon_10km = []

ca_fires_data = pd.DataFrame()

ca_fires_data['object_id'] = ca_fires['OBJECTID']
ca_fires_data['reporting_unit'] = ca_fires['NWCG_REPORTING_UNIT_NAME']
ca_fires_data['fire_date_num'] = ca_fires['DISCOVERY_DATE']
ca_fires_data['fire_date_cal'] = pd.to_datetime(ca_fires['DISCOVERY_DATE'], unit='D', origin='julian') #convert numeric to calendar date
ca_fires_data['fire_year'] = ca_fires['FIRE_YEAR']
ca_fires_data['fire_month'] = ca_fires_data['fire_date_cal'].dt.month_name()
ca_fires_data['fire_day_of_week'] = ca_fires_data['fire_date_cal'].dt.day_name()
ca_fires_data['fire_class'] = ca_fires['FIRE_SIZE_CLASS']
ca_fires_data['lat'] = ca_fires['LATITUDE']
ca_fires_data['lon'] = ca_fires['LONGITUDE']

for i, j in zip(ca_fires_data['lat'].tolist(), ca_fires_data['lon'].tolist()):
    to_mgrs_10km.append(m.toMGRS(i, j, MGRSPrecision=1)) 
    to_mgrs_100km.append(m.toMGRS(i, j, MGRSPrecision=0)) 

ca_fires_data['mgrs_10km'] = to_mgrs_10km
ca_fires_data['mgrs_100km'] = to_mgrs_100km

for k in ca_fires_data['mgrs_10km'].tolist():
    to_mgrs_lat_10km.append(m.toLatLon(k)[0])
    to_mgrs_lon_10km.append(m.toLatLon(k)[1])

ca_fires_data['mgrs_lat_10km'] = to_mgrs_lat_10km
ca_fires_data['mgrs_lon_10km'] = to_mgrs_lon_10km

# filter out winter months where fires not likely
ca_fires_data = ca_fires_data[ca_fires_data.fire_month.isin(MONTHS_USED)]

# filter out class A fires, which are less than 1/4 acre
ca_fires_data = ca_fires_data[ca_fires_data.fire_class.isin(CLASSES_USED)]

# filter out fires before 2000 since weather data is sparse before year 2000
ca_fires_data = ca_fires_data[ca_fires_data.fire_year >= 2000]

# limit to only the MGRS locations in Southern CA
socal_fires_data = ca_fires_data[ca_fires_data.mgrs_100km.isin(MGRS_100KM_USED)]

# keep only necessary columns
socal_lite = pd.DataFrame()
socal_lite['mgrs_100km'] = socal_fires_data['mgrs_100km']
socal_lite['mgrs_10km'] = socal_fires_data['mgrs_10km']
socal_lite['lat'] = socal_fires_data['mgrs_lat_10km']
socal_lite['lon'] = socal_fires_data['mgrs_lon_10km']
socal_lite['date'] = socal_fires_data['fire_date_cal']
# socal_lite = socal_lite.groupby(level=0).first()
# socal_lite.mgrs_10km.unique().shape
socal_lite.reset_index(drop=True, inplace = True)
socal_lite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8057 entries, 0 to 8056
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   mgrs_100km  8057 non-null   object        
 1   mgrs_10km   8057 non-null   object        
 2   lat         8057 non-null   float64       
 3   lon         8057 non-null   float64       
 4   date        8057 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 314.9+ KB


In [5]:
socal_lite

Unnamed: 0,mgrs_100km,mgrs_10km,lat,lon,date
0,11SMT,11SMT27,34.067686,-117.866966,2005-07-03
1,11SMT,11SMT53,33.708802,-117.539594,2005-06-19
2,11SNS,11SNS16,33.078549,-116.892857,2005-07-31
3,11SNS,11SNS23,32.807799,-116.786366,2005-07-04
4,11SNS,11SNS08,33.258997,-117.000000,2005-07-05
...,...,...,...,...,...
8052,11SNS,11SNS12,32.717728,-116.893290,2015-04-09
8053,11SMT,11SMT95,33.890318,-117.108147,2009-05-03
8054,11SNS,11SNS04,32.898187,-117.000000,2010-05-29
8055,11SNS,11SNS04,32.898187,-117.000000,2014-05-29


Import Weather Data

In [6]:
weather = pd.read_csv('2000to2015_Weather_v2.csv')
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249940 entries, 0 to 2249939
Data columns (total 14 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   Unnamed: 0                     int64  
 1   lat                            float64
 2   lon                            float64
 3   t_mean_2m_24h:F                float64
 4   t_min_2m_24h:F                 float64
 5   t_max_2m_24h:F                 float64
 6   precip_24h:mm                  float64
 7   sunshine_duration_24h:min      float64
 8   drought_index:idx              float64
 9   soil_moisture_index_-15cm:idx  float64
 10  soil_type:idx                  float64
 11  wind_speed_2m:mph              float64
 12  elevation:m                    float64
 13  date                           object 
dtypes: float64(12), int64(1), object(1)
memory usage: 240.3+ MB


In [7]:
# m = mgrs.MGRS() #mgrs API
# to_mgrs_10km = []  #precision 1
# to_mgrs_100km = []  #precision 0

# for i, j in zip(weather['lat'].tolist(), weather['lon'].tolist()):
#     to_mgrs_10km.append(m.toMGRS(i, j, MGRSPrecision=1)) 
#     to_mgrs_100km.append(m.toMGRS(i, j, MGRSPrecision=0)) # use mgrs API to convert lat/lon to MGRS conventions

# weather['mgrs_10km'] = to_mgrs_10km
# weather['mgrs_100km'] = to_mgrs_100km

In [8]:
# print('Number of 100km stations ',len(set(weather['mgrs_100km'].unique())))
# print('Number of 10km stations ',len(set(weather['mgrs_10km'].unique())))

Weather Data Preprocessing

In [9]:
weather = weather[['date','lat','lon','t_mean_2m_24h:F','t_min_2m_24h:F','t_max_2m_24h:F','precip_24h:mm','sunshine_duration_24h:min','drought_index:idx','wind_speed_2m:mph','elevation:m']]
weather = weather.rename(columns={'t_mean_2m_24h:F':'tavg','t_min_2m_24h:F':'tmin','t_max_2m_24h:F':'tmax','precip_24h:mm':'prcp','sunshine_duration_24h:min':'sun','drought_index:idx':'drought_index','wind_speed_2m:mph':'wind_speed','elevation:m':'elevation'})
weather.date = pd.to_datetime(pd.to_datetime(weather['date']).dt.date)
# weather.date = weather.date.dt.date
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2249940 entries, 0 to 2249939
Data columns (total 11 columns):
 #   Column         Dtype         
---  ------         -----         
 0   date           datetime64[ns]
 1   lat            float64       
 2   lon            float64       
 3   tavg           float64       
 4   tmin           float64       
 5   tmax           float64       
 6   prcp           float64       
 7   sun            float64       
 8   drought_index  float64       
 9   wind_speed     float64       
 10  elevation      float64       
dtypes: datetime64[ns](1), float64(10)
memory usage: 188.8 MB


In [10]:
len(set(weather['date'].unique())) #should be 5844 - number of days from 1/1/2000 - 12/31/2015

5844

In [11]:
# rolling averages for prcp, tmax, tmin for 3, 7, and 10 days

#prcp
weather['prcp_rolling_3day'] = weather.groupby(['lat','lon'])['prcp'].transform(lambda x: x.rolling(3).mean())
weather['prcp_rolling_3day'] = round(weather['prcp_rolling_3day'],6)

weather['prcp_rolling_7day'] = weather.groupby(['lat','lon'])['prcp'].transform(lambda x: x.rolling(7).mean())
weather['prcp_rolling_7day'] = round(weather['prcp_rolling_7day'],6)

weather['prcp_rolling_14day'] = weather.groupby(['lat','lon'])['prcp'].transform(lambda x: x.rolling(14).mean())
weather['prcp_rolling_14day'] = round(weather['prcp_rolling_14day'],6)

#tavg
weather['tavg_rolling_3day'] = weather.groupby(['lat','lon'])['tavg'].transform(lambda x: x.rolling(3).mean())
weather['tavg_rolling_3day'] = round(weather['tavg_rolling_3day'],6)

weather['tavg_rolling_7day'] = weather.groupby(['lat','lon'])['tavg'].transform(lambda x: x.rolling(7).mean())
weather['tavg_rolling_7day'] = round(weather['tavg_rolling_7day'],6)

weather['tavg_rolling_14day'] = weather.groupby(['lat','lon'])['tavg'].transform(lambda x: x.rolling(14).mean())
weather['tavg_rolling_14day'] = round(weather['tavg_rolling_14day'],6)

#tmin
weather['tmin_rolling_3day'] = weather.groupby(['lat','lon'])['tmin'].transform(lambda x: x.rolling(3).mean())
weather['tmin_rolling_3day'] = round(weather['tmin_rolling_3day'],6)

weather['tmin_rolling_7day'] = weather.groupby(['lat','lon'])['tmin'].transform(lambda x: x.rolling(7).mean())
weather['tmin_rolling_7day'] = round(weather['tmin_rolling_7day'],6)

weather['tmin_rolling_14day'] = weather.groupby(['lat','lon'])['tmin'].transform(lambda x: x.rolling(14).mean())
weather['tmin_rolling_14day'] = round(weather['tmin_rolling_14day'],6)

#tmax
weather['tmax_rolling_3day'] = weather.groupby(['lat','lon'])['tmax'].transform(lambda x: x.rolling(3).mean())
weather['tmax_rolling_3day'] = round(weather['tmax_rolling_3day'],6)

weather['tmax_rolling_7day'] = weather.groupby(['lat','lon'])['tmax'].transform(lambda x: x.rolling(7).mean())
weather['tmax_rolling_7day'] = round(weather['tmax_rolling_7day'],6)

weather['tmax_rolling_14day'] = weather.groupby(['lat','lon'])['tmax'].transform(lambda x: x.rolling(14).mean())
weather['tmax_rolling_14day'] = round(weather['tmax_rolling_14day'],6)

#sun
weather['sun_rolling_3day'] = weather.groupby(['lat','lon'])['sun'].transform(lambda x: x.rolling(3).mean())
weather['sun_rolling_3day'] = round(weather['sun_rolling_3day'],6)

weather['sun_rolling_7day'] = weather.groupby(['lat','lon'])['sun'].transform(lambda x: x.rolling(7).mean())
weather['sun_rolling_7day'] = round(weather['sun_rolling_7day'],6)

weather['sun_rolling_14day'] = weather.groupby(['lat','lon'])['sun'].transform(lambda x: x.rolling(14).mean())
weather['sun_rolling_14day'] = round(weather['sun_rolling_14day'],6)

#windspeed
weather['wind_speed_3day'] = weather.groupby(['lat','lon'])['wind_speed'].transform(lambda x: x.rolling(3).mean())
weather['wind_speed_3day'] = round(weather['wind_speed_3day'],6)

weather['wind_speed_7day'] = weather.groupby(['lat','lon'])['wind_speed'].transform(lambda x: x.rolling(7).mean())
weather['wind_speed_7day'] = round(weather['wind_speed_7day'],6)

weather['wind_speed_14day'] = weather.groupby(['lat','lon'])['wind_speed'].transform(lambda x: x.rolling(14).mean())
weather['wind_speed_14day'] = round(weather['wind_speed_14day'],6)

In [12]:
# c = {'lat':'lat_6', 'lon':'lon_6'}
# weather = weather.rename(columns = c)

In [13]:
weather = weather[weather.date.dt.month_name().isin(MONTHS_USED)]
weather.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1503040 entries, 91 to 2249908
Data columns (total 29 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   date                1503040 non-null  datetime64[ns]
 1   lat                 1503040 non-null  float64       
 2   lon                 1503040 non-null  float64       
 3   tavg                1503040 non-null  float64       
 4   tmin                1503040 non-null  float64       
 5   tmax                1503040 non-null  float64       
 6   prcp                1503040 non-null  float64       
 7   sun                 1503040 non-null  float64       
 8   drought_index       1503040 non-null  float64       
 9   wind_speed          1503040 non-null  float64       
 10  elevation           1503040 non-null  float64       
 11  prcp_rolling_3day   1503040 non-null  float64       
 12  prcp_rolling_7day   1503040 non-null  float64       
 13  prcp_rollin

In [14]:
len(set(weather['date'].unique())) #should be 3904 - number of days from 1/1/2000 - 12/31/2015 excluding Dec - Mar.

3904

In [15]:
missingByCol(weather)

In [16]:
mgrs_final = pd.read_csv('mgrs_final.csv')
mgrs_final = mgrs_final[['mgrs_10km','lat_10km','lon_10km']]
mgrs_final.head()

Unnamed: 0,mgrs_10km,lat_10km,lon_10km
0,11SMT03,33.705267,-118.079125
1,11SMT04,33.795444,-118.080255
2,11SMT05,33.88562,-118.081391
3,11SMT06,33.975794,-118.082531
4,11SMT07,34.065966,-118.083677


In [17]:
# import math

mgrs_lat_6, mgrs_lon_6 = [], []

for i in mgrs_final['lat_10km']:
    # print(math.trunc(1000000 * i) / 1000000)
    mgrs_lat_6.append(math.trunc(1000000 * i) / 1000000)

for j in mgrs_final['lon_10km']:
    # print(float('%.6f'%(i)))
    mgrs_lon_6.append(math.trunc(1000000 * j) / 1000000)
# mgrs_final.lat_10km_6 = mgrs_final['lat_10km']

mgrs_final['lat_6'] = mgrs_lat_6
mgrs_final['lon_6'] = mgrs_lon_6
# mgrs_final.info()

In [18]:
mgrs_final.lat_6.unique()[:10]

array([33.705267, 33.795444, 33.885619, 33.975793, 34.065966, 34.156137,
       34.246307, 33.615981, 33.706162, 33.796342])

In [19]:
mgrs_final.lon_6.unique()[:10]

array([-118.079125, -118.080255, -118.08139 , -118.082531, -118.083676,
       -118.084827, -118.085983, -117.970213, -117.971226, -117.972244])

In [21]:
weather_lat_6, weather_lon_6 = [], []

for i in weather['lat']:
    # print(math.trunc(1000000 * i) / 1000000)
    weather_lat_6.append(math.trunc(1000000 * i) / 1000000)

for j in weather['lon']:
    # print(float('%.6f'%(i)))
    weather_lon_6.append(math.trunc(1000000 * j) / 1000000)
# mgrs_final.lat_10km_6 = mgrs_final['lat_10km']

weather['lat_6'] = weather_lat_6
weather['lon_6'] = weather_lon_6

In [22]:
weather.lat_6.unique()[:10]

array([33.705267, 33.795444, 33.885619, 33.975793, 34.065966, 34.156137,
       34.246307, 33.615981, 33.706162, 33.796342])

In [23]:
weather.lon_6.unique()[:10]

array([-118.079125, -118.080255, -118.08139 , -118.082531, -118.083676,
       -118.084827, -118.085983, -117.970213, -117.971226, -117.972244])

In [24]:
weather2 = weather.merge(mgrs_final, on = ('lat_6','lon_6'), how = 'inner')
weather2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499136 entries, 0 to 1499135
Data columns (total 34 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   date                1499136 non-null  datetime64[ns]
 1   lat                 1499136 non-null  float64       
 2   lon                 1499136 non-null  float64       
 3   tavg                1499136 non-null  float64       
 4   tmin                1499136 non-null  float64       
 5   tmax                1499136 non-null  float64       
 6   prcp                1499136 non-null  float64       
 7   sun                 1499136 non-null  float64       
 8   drought_index       1499136 non-null  float64       
 9   wind_speed          1499136 non-null  float64       
 10  elevation           1499136 non-null  float64       
 11  prcp_rolling_3day   1499136 non-null  float64       
 12  prcp_rolling_7day   1499136 non-null  float64       
 13  prcp_rolling

In [25]:
len(weather2.mgrs_10km.unique())

384

In [149]:
# weather = weather[weather.mgrs_100km.isin(MGRS_100KM_USED)]
# weather.info()
# print(len(weather.mgrs_10km.unique()))
# weather.mgrs_100km.unique()

In [70]:
# weather.head(15)

Merge Weather and Fire Data

In [26]:
data = weather2.merge(socal_lite, on = ('mgrs_10km','date'), how = 'left')
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499549 entries, 0 to 1499548
Data columns (total 37 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   date                1499549 non-null  datetime64[ns]
 1   lat_x               1499549 non-null  float64       
 2   lon_x               1499549 non-null  float64       
 3   tavg                1499549 non-null  float64       
 4   tmin                1499549 non-null  float64       
 5   tmax                1499549 non-null  float64       
 6   prcp                1499549 non-null  float64       
 7   sun                 1499549 non-null  float64       
 8   drought_index       1499549 non-null  float64       
 9   wind_speed          1499549 non-null  float64       
 10  elevation           1499549 non-null  float64       
 11  prcp_rolling_3day   1499549 non-null  float64       
 12  prcp_rolling_7day   1499549 non-null  float64       
 13  prcp_rolling

In [27]:
data = data.drop(columns = ['mgrs_10km','lat_y','lon_y','lat_x','lon_x','lat_6','lon_6','lat_10km','lon_10km'])
# data = data[[mgrs_10km,]]
data['day_of_year'] = data.date.dt.dayofyear
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499549 entries, 0 to 1499548
Data columns (total 29 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   date                1499549 non-null  datetime64[ns]
 1   tavg                1499549 non-null  float64       
 2   tmin                1499549 non-null  float64       
 3   tmax                1499549 non-null  float64       
 4   prcp                1499549 non-null  float64       
 5   sun                 1499549 non-null  float64       
 6   drought_index       1499549 non-null  float64       
 7   wind_speed          1499549 non-null  float64       
 8   elevation           1499549 non-null  float64       
 9   prcp_rolling_3day   1499549 non-null  float64       
 10  prcp_rolling_7day   1499549 non-null  float64       
 11  prcp_rolling_14day  1499549 non-null  float64       
 12  tavg_rolling_3day   1499549 non-null  float64       
 13  tavg_rolling

In [28]:
is_fire = []
for i in data['mgrs_100km']:
    # print(pd.isnull(i))
    if pd.isnull(i) == True:
        is_fire.append(int(0))
    else:
        is_fire.append(int(1))
# np.sum(is_fire)
data['is_fire'] = is_fire
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499549 entries, 0 to 1499548
Data columns (total 30 columns):
 #   Column              Non-Null Count    Dtype         
---  ------              --------------    -----         
 0   date                1499549 non-null  datetime64[ns]
 1   tavg                1499549 non-null  float64       
 2   tmin                1499549 non-null  float64       
 3   tmax                1499549 non-null  float64       
 4   prcp                1499549 non-null  float64       
 5   sun                 1499549 non-null  float64       
 6   drought_index       1499549 non-null  float64       
 7   wind_speed          1499549 non-null  float64       
 8   elevation           1499549 non-null  float64       
 9   prcp_rolling_3day   1499549 non-null  float64       
 10  prcp_rolling_7day   1499549 non-null  float64       
 11  prcp_rolling_14day  1499549 non-null  float64       
 12  tavg_rolling_3day   1499549 non-null  float64       
 13  tavg_rolling

In [29]:
data.drop(['date','mgrs_100km'], axis = 1,inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1499549 entries, 0 to 1499548
Data columns (total 28 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   tavg                1499549 non-null  float64
 1   tmin                1499549 non-null  float64
 2   tmax                1499549 non-null  float64
 3   prcp                1499549 non-null  float64
 4   sun                 1499549 non-null  float64
 5   drought_index       1499549 non-null  float64
 6   wind_speed          1499549 non-null  float64
 7   elevation           1499549 non-null  float64
 8   prcp_rolling_3day   1499549 non-null  float64
 9   prcp_rolling_7day   1499549 non-null  float64
 10  prcp_rolling_14day  1499549 non-null  float64
 11  tavg_rolling_3day   1499549 non-null  float64
 12  tavg_rolling_7day   1499549 non-null  float64
 13  tavg_rolling_14day  1499549 non-null  float64
 14  tmin_rolling_3day   1499549 non-null  float64
 15  tmin_rolling_7d

In [30]:
data.head()

Unnamed: 0,tavg,tmin,tmax,prcp,sun,drought_index,wind_speed,elevation,prcp_rolling_3day,prcp_rolling_7day,...,tmax_rolling_7day,tmax_rolling_14day,sun_rolling_3day,sun_rolling_7day,sun_rolling_14day,wind_speed_3day,wind_speed_7day,wind_speed_14day,day_of_year,is_fire
0,65.1,51.7,79.0,0.0,751.9,0.0,11.1,0.0,0.0,0.0,...,66.914286,66.557143,663.1,527.585714,594.45,7.266667,6.414286,6.685714,92,0
1,64.7,54.1,76.2,0.0,753.6,0.0,6.1,0.0,0.0,0.0,...,68.485714,67.142857,749.4,540.042857,602.242857,7.633333,6.228571,6.65,93,0
2,61.8,53.0,71.2,0.0,738.8,0.0,5.1,0.0,0.0,0.0,...,69.785714,67.357143,748.1,574.242857,608.907143,7.433333,6.1,6.6,94,0
3,62.0,52.3,75.8,0.0,629.5,0.0,9.4,0.0,0.0,0.0,...,71.942857,67.85,707.3,621.528571,602.2,6.866667,6.742857,6.721429,95,0
4,59.9,53.9,69.1,0.0,529.7,0.0,5.6,0.0,0.0,0.0,...,73.185714,67.921429,632.666667,662.985714,587.75,6.7,6.857143,6.585714,96,0


In [31]:
X = data.copy()
y = X['is_fire']
X.drop(['is_fire'], axis = 1, inplace = True)
print(X.shape, y.shape)

(1499549, 27) (1499549,)


In [32]:
(X_train, X_test, y_train, y_test) = train_test_split(X,y,test_size = 0.33, random_state = 8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1004697, 27) (494852, 27) (1004697,) (494852,)


In [33]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_train_scaled.describe()

Unnamed: 0,tavg,tmin,tmax,prcp,sun,drought_index,wind_speed,elevation,prcp_rolling_3day,prcp_rolling_7day,...,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,sun_rolling_3day,sun_rolling_7day,sun_rolling_14day,wind_speed_3day,wind_speed_7day,wind_speed_14day,day_of_year
count,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,...,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0,1004697.0
mean,-1.795451e-15,-5.458639e-16,6.171873e-16,5.026321e-15,-5.503174e-16,2.960698e-15,6.68989e-16,2.79328e-17,2.428461e-16,-2.274796e-16,...,1.807853e-16,5.004247e-16,2.623294e-15,-9.297078e-16,-1.02249e-15,-4.045249e-16,1.367383e-16,-1.532039e-15,-1.872709e-15,1.351461e-16
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-4.925105,-5.069672,-4.401697,-0.1762145,-4.109702,-1.36402,-2.331338,-1.169357,-0.2570019,-0.363554,...,-4.477181,-4.216882,-4.098447,-4.654366,-4.959122,-4.417495,-2.722938,-2.888732,-2.845245,-1.728288
25%,-0.6971439,-0.6559366,-0.741149,-0.1762145,-0.362551,-0.977544,-0.6261727,-0.8369245,-0.2570019,-0.363554,...,-0.7483656,-0.7498138,-0.7512886,-0.5049689,-0.623471,-0.689681,-0.6384215,-0.6389264,-0.6361836,-0.8625356
50%,-0.04056645,0.0415425,-0.02182735,-0.1762145,0.2129878,0.1818839,-0.006112418,-0.2345054,-0.2570019,-0.3534683,...,-0.03065119,-0.04280376,-0.05682541,0.169844,0.1225099,0.0885939,-0.03391147,-0.03556965,-0.04673277,0.003216603
75%,0.6458555,0.6518367,0.7134793,-0.1762145,0.7331864,0.9548358,0.5106044,0.5552514,-0.2052965,-0.1431091,...,0.7090622,0.6963431,0.6954026,0.7620025,0.7641744,0.7686792,0.5497529,0.5575613,0.5537352,0.8689688
max,3.859106,4.629647,3.470879,36.69386,1.221411,1.727788,23.96955,4.465466,23.96477,19.04861,...,3.491924,3.3946,3.443731,1.415139,1.633913,1.815813,21.0197,18.27989,14.10008,1.734721


In [34]:
# clfsList = [LogisticRegression, RandomForestClassifier, GradientBoostingClassifier]
clfsList = [LogisticRegression, RandomForestClassifier, GradientBoostingClassifier]

clf_hyper = { 
         'LogisticRegression':{
            'tol': [0.01, 0.1, 1.0]
           ,'C': [0.01, 0.1, 1.0]
           , 'solver': ['sag','saga']
           , 'class_weight': [None,'balanced']
        }
        ,
         'RandomForestClassifier':{
            'n_estimators': [50,100,150]
           ,'min_samples_leaf': [1,5,15]
           ,'class_weight': [None, 'balanced']
           ,'bootstrap': [True]
        }        
        # ,
        # 'GradientBoostingClassifier':{
        #     'learning_rate': [0.001,0.01,0.1]
        #    ,'n_estimators': [10,50,100]
        #    ,'min_samples_leaf': [5,3,1]
        # }
}

In [307]:
# clfGridSearch(np.array(X_train_scaled), np.array(y_train), 5)

In [302]:
clf_0 = RandomForestClassifier(class_weight = 'balanced')
clf_0.fit(X_train_scaled, y_train)

RandomForestClassifier(class_weight='balanced')

In [303]:
clfResults(clf_0, X_train_scaled, y_train, X_test, y_test)

-----------------------Confusion Matrix------------------------ 
  Predicted|--------------------Actual------------------------ 
           |        Yes            |         No                |   Total    
-------------------------------------------------------------- 
        Yes|        183            |        0               |    183 
-------------------------------------------------------------- 
         No|        2507             |        492162              |    494669 
-------------------------------------------------------------- 
      Total|        2690            |        492162              |    494852 

Accuracy:  0.994934 
Precision:  1.0 
Recall:  0.06803


In [304]:
importance0 = [clf_0.feature_importances_]

clf_0_importance = pd.DataFrame(importance0, columns = X_train.columns).T

clf_0_importance.columns = ['coeff']

clf_0_importance.sort_values(by = 'coeff', ascending = False)

Unnamed: 0,coeff
elevation,0.104115
wind_speed_14day,0.050244
tmax,0.050058
tmax_rolling_3day,0.045804
wind_speed_7day,0.043873
tmax_rolling_7day,0.043785
sun,0.042679
day_of_year,0.04247
tmax_rolling_14day,0.041231
tavg_rolling_14day,0.040654


In [250]:
# data.unique()

In [66]:
# weather['soil_type:idx'].unique()

In [67]:
# weather['soil_moisture_index_-15cm:idx'].unique()

In [68]:
# import missingno as msno

In [69]:
# Visualize missing values as a matrix
# msno.matrix(weather)