In [1]:
!pip install mgrs

Collecting mgrs
[?25l  Downloading https://files.pythonhosted.org/packages/e4/16/d7f14c8e92da53798396428d715ba161bfbff1f7c2db0ec9d796e1354308/mgrs-1.4.0-cp37-cp37m-manylinux1_x86_64.whl (47kB)
[K     |███████                         | 10kB 18.7MB/s eta 0:00:01[K     |█████████████▉                  | 20kB 27.7MB/s eta 0:00:01[K     |████████████████████▊           | 30kB 34.4MB/s eta 0:00:01[K     |███████████████████████████▋    | 40kB 20.3MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 2.7MB/s 
Installing collected packages: mgrs
Successfully installed mgrs-1.4.0


In [2]:
import numpy as np
import pandas as pd
import sqlite3
import math
import mgrs
from itertools import product
import statistics
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def missingByCol(df): #count missing values by column
    for col in df.columns:
        if df[col].isna().sum() != 0:
            print(col, df[col].isna().sum())

def getCalTbl(start='2000-01-01', end='2015-12-31'):
    df = pd.DataFrame({'date': pd.date_range(start, end)})
    df['year'] = df.date.dt.year
    df['day_of_year'] = df.date.dt.dayofyear
    df['month'] = df.date.dt.month
    df['month_name'] = df.date.dt.month_name()
    df['day'] = df.date.dt.day
    df['day_of_week'] = df.date.dt.dayofweek
    df['day_of_week_name'] = df.date.dt.day_name()
    return df
    
'''
Run classification on data set with chosen clfs for each combination of hypers
Store results in a sorted list
Print all results
'''
def clfRun(a_clf, data, clf_hyper={}):
  M, L, n_folds = data # unpack data container
  kf = KFold(n_splits=n_folds) # Establish the cross validation
  ret = {} # classic explication of results
 
  for ids, (train_index, test_index) in enumerate(kf.split(M, L)):
       
    clf = a_clf(**clf_hyper) # unpack parameters into clf if they exist
 
    clf.fit(M[train_index], L[train_index])
 
    pred = clf.predict(M[test_index])
 
    ret[ids]= {'clf': clf,
              'train_index': train_index,
              'test_index': test_index,
              'accuracy': accuracy_score(L[test_index], pred)}
 
  return ret
 
def clfHypers(clfsList):
    ret_hyper = dict()
    for clf in clfsList:
        clfString = str(clf)  # Check if values in clfsList are in clf_hyperict
        for k1, v1 in clf_hyper.items():  # go through first level of clf_hyperict
            if k1 in clfString:            # if clfString1 matches first level
                ret_hyper[clf] = [dict(zip(v1, s))
                                  for s in product(*v1.values())]
    return ret_hyper
 
def clfGridSearch(X, y, n_folds):
      # suppress future warnings
      with warnings.catch_warnings():
            # ignore all caught warnings
            warnings.filterwarnings("ignore")
 
      # n_folds = 5
 
      data = (X, y, n_folds)


      hyper_param_dict = clfHypers(clfsList) 
 
      # run grid search to find best hyper params
      clfsAccuracyDict = {}
      results = {}
      for clfs in clfsList:
            for i in hyper_param_dict[clfs]:
                 
                  clf_hyper = i
 
                  results = clfRun(clfs, data, clf_hyper)
 
                  for key in results:
                        k = results[key]['clf']
                        v = results[key]['accuracy']
 
                        kTest = str(k)
 
                        kTest = kTest.replace('         ', ' ')
                        kTest = kTest.replace('        ', ' ')
 
                        if kTest in clfsAccuracyDict:
                             
                              clfsAccuracyDict[kTest].append(v)
                        else:
                              clfsAccuracyDict[kTest] = [v]
           
      clfsAccuracyList_sorted = sorted(clfsAccuracyDict.items(), key = lambda item: statistics.mean(item[1]), reverse = True)
 
      for i in clfsAccuracyList_sorted:
            print('\nClassifier with Parameters:', i[0], '\nMeanAccuracy', statistics.mean(i[1]))

def clfResults(clf, X_train, y_train, X_test, y_test):
 
    clfTmp = clf.fit(X_train, y_train) #fit model on training data
    y_pred = clfTmp.predict(X_test) #produce predictions (0/1) on test data
    y_pred_prob = clfTmp.predict_proba(X_test) #produce prediction probabilites (0 <= p <= 1) on test data
    y_pred_prob = y_pred_prob[:, 1] # only take second column.  first column is inverse prob
    tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel() #get confusion matrix on test data
    total_obs = (tp + tn + fp + fn)
   # print(tn, fp, fn, tp)
 
 
    clfAcc = (tp + tn) / (tp + tn + fp + fn) # accuracy of entire model
    clfPrecision = tp / (tp + fp) # precision is the ability of the classifier not to label as positive a sample that is negative
    clfRecall = tp / (tp + fn) # recall is the ability of the classifier to find all the positive samples
 
    # return(clfAcc, clfPrecision, clfRecall)
    print('-----------------------Confusion Matrix------------------------'
        , '\n  Predicted|--------------------Actual------------------------'
        , '\n           |        Yes            |         No                |   Total   '
        , '\n--------------------------------------------------------------'
        , '\n        Yes|        '+ str(tp)+'            |        '+str(fp)+'               |    '+str(tp+fp)
        , '\n--------------------------------------------------------------'
        , '\n         No|        '+ str(fn)+'             |        '+str(tn)+'              |    '+str(fn+tn)
        , '\n--------------------------------------------------------------'
        , '\n      Total|        '+ str(tp+fn)+'            |        '+str(fp+tn)+'              |    '+str(total_obs)
        , '\n'
        )
    # print('Test Total Obs: ', total_obs)
    print('Accuracy: ', np.round(clfAcc,6), '\nPrecision: ', np.round(clfPrecision,6), '\nRecall: ', np.round(clfRecall,6))

Import Data: Need to copy and paste sqllite file from Kaggle into local drive path.
FPA_FOD_20170508.sqlite https://www.kaggle.com/rtatman/188-million-us-wildfires             
        
Metadata:   
Fires - Target data - Kaggle        
Time - Calendar data - Omits Dec through March (non fire season)      
Location - MGRS - Military Grid Ref. Sys. with lat/lon                      
Weather - USGS - Precip and Temp only (for now)

# Get Fire Data

In [None]:
#conn = sqlite3.connect('C:/Users/balso/Downloads/FPA_FOD_20170508.sqlite')

In [4]:
# Imports

# To mount Google Drive and read/write files
from google.colab import drive

# read file from drive
#from google.colab import drive
drive.mount('/content/gdrive')
conn = sqlite3.connect('/content/gdrive/MyDrive/SMU/msds_7346_cloud/project/USWildfireAnalysis/data/FPA_FOD_20170508.sqlite')

Mounted at /content/gdrive


## Add MGRS Data to Fire Data

In [None]:


ca_fires = pd.read_sql_query('select * from fires where State = "CA";', conn) # 

m = mgrs.MGRS() #mgrs API
to_mgrs_1m = [] #precision 5
to_mgrs_10m = [] #precision 4
to_mgrs_100m = [] #precision 3
to_mgrs_1km = []  #precision 2
to_mgrs_10km = []  #precision 1
to_mgrs_100km = []  #precision 0
to_mgrs_lat_10km = []
to_mgrs_lon_10km = []
ca_fires_data = pd.DataFrame()

ca_fires_data['object_id'] = ca_fires['OBJECTID']
ca_fires_data['reporting_unit'] = ca_fires['NWCG_REPORTING_UNIT_NAME']
ca_fires_data['fire_date_num'] = ca_fires['DISCOVERY_DATE']
ca_fires_data['fire_date_cal'] = pd.to_datetime(ca_fires['DISCOVERY_DATE'], unit='D', origin='julian') #convert numeric to calendar date
ca_fires_data['fire_year'] = ca_fires['FIRE_YEAR']
ca_fires_data['fire_month'] = ca_fires_data['fire_date_cal'].dt.month_name()
ca_fires_data['fire_day_of_week'] = ca_fires_data['fire_date_cal'].dt.day_name()
ca_fires_data['fire_class'] = ca_fires['FIRE_SIZE_CLASS']
ca_fires_data['lat'] = ca_fires['LATITUDE']
ca_fires_data['lon'] = ca_fires['LONGITUDE']

for i, j in zip(ca_fires_data['lat'].tolist(), ca_fires_data['lon'].tolist()):
    to_mgrs_1m.append(m.toMGRS(i, j, MGRSPrecision=5)) # use mgrs API to convert lat/lon to MGRS conventions
    to_mgrs_10m.append(m.toMGRS(i, j, MGRSPrecision=4)) 
    to_mgrs_100m.append(m.toMGRS(i, j, MGRSPrecision=3)) 
    to_mgrs_1km.append(m.toMGRS(i, j, MGRSPrecision=2)) 
    to_mgrs_10km.append(m.toMGRS(i, j, MGRSPrecision=1)) 
    to_mgrs_100km.append(m.toMGRS(i, j, MGRSPrecision=0)) 

ca_fires_data['mgrs_1m'] = to_mgrs_1m
ca_fires_data['mgrs_10m'] = to_mgrs_10m
ca_fires_data['mgrs_100m'] = to_mgrs_100m
ca_fires_data['mgrs_1km'] = to_mgrs_1km
ca_fires_data['mgrs_10km'] = to_mgrs_10km
ca_fires_data['mgrs_100km'] = to_mgrs_100km

for k in ca_fires_data['mgrs_10km'].tolist():
    to_mgrs_lat_10km.append(m.toLatLon(k)[0])
    to_mgrs_lon_10km.append(m.toLatLon(k)[1])

ca_fires_data['mgrs_lat_10km'] = to_mgrs_lat_10km
ca_fires_data['mgrs_lon_10km'] = to_mgrs_lon_10km

# filter out winter months where fires not likely
months_used = ['April', 'May', 'June', 'July', 'August', 'September', 'October','November'] 
ca_fires_data = ca_fires_data[ca_fires_data.fire_month.isin(months_used)]

# filter out class A fires, which are less than 1/4 acre
classes_used = ['B','C','D','E','F','G']
ca_fires_data = ca_fires_data[ca_fires_data.fire_class.isin(classes_used)]

# filter out fires before 2000 since weather data is sparse before year 2000
ca_fires_data = ca_fires_data[ca_fires_data.fire_year >= 2000]

# limit to only the MGRS locations in Southern CA
mgrs_100km = ['11SMT','11SNT','11SNS','11SMS','11SLT'] # redefine mgrs blocks
socal_fires_data = ca_fires_data[ca_fires_data.mgrs_100km.isin(mgrs_100km)]

# keep only necessary columns
socal_lite = pd.DataFrame()
socal_lite['mgrs_100km'] = socal_fires_data['mgrs_100km']
socal_lite['mgrs_10km'] = socal_fires_data['mgrs_10km']
socal_lite['lat'] = socal_fires_data['mgrs_lat_10km']
socal_lite['lon'] = socal_fires_data['mgrs_lon_10km']
socal_lite['date'] = socal_fires_data['fire_date_cal']
socal_lite = socal_lite.groupby(level=0).first()
# socal_lite.mgrs_10km.unique().shape
socal_lite.reset_index(drop=True, inplace = True)
socal_lite.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8057 entries, 0 to 8056
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   mgrs_100km  8057 non-null   object        
 1   mgrs_10km   8057 non-null   object        
 2   lat         8057 non-null   float64       
 3   lon         8057 non-null   float64       
 4   date        8057 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 314.9+ KB


## Persist socal_lite

In [None]:
# write socal_lite to file
socal_lite.to_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/socal_lite.csv')

# Weather History

## Read Weather History

In [None]:
# read in historic data and convert lat/long mgrs
weather_history = pd.read_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/data/2000to2015_Weather.csv')
weather_history

Unnamed: 0.1,Unnamed: 0,lat,lon,t_mean_2m_24h_F,t_min_2m_24h_F,t_max_2m_24h_F,precip_24h_mm,sunshine_duration_24h_min,drought_index_idx,soil_moisture_index_neg15cm_idx,soil_type_idx,date
0,0,33.705267,-118.079125,53.0,47.6,58.5,0.45,232.7,-4.0,0.0,2.0,2000-01-01 00:00:00+00:00
1,1,33.705267,-118.079125,54.2,49.6,60.6,0.08,572.1,-4.0,0.0,2.0,2000-01-02 00:00:00+00:00
2,2,33.705267,-118.079125,53.2,42.7,66.0,0.00,598.0,-4.0,0.0,2.0,2000-01-03 00:00:00+00:00
3,3,33.705267,-118.079125,55.1,44.2,67.5,0.00,598.9,-4.0,0.0,2.0,2000-01-04 00:00:00+00:00
4,4,33.705267,-118.079125,54.9,44.0,68.9,0.00,598.5,-4.0,0.0,2.0,2000-01-05 00:00:00+00:00
...,...,...,...,...,...,...,...,...,...,...,...,...
2249935,9125,33.167642,-117.536254,54.2,48.3,61.9,0.00,598.0,-1.0,,,2015-12-27 00:00:00+00:00
2249936,9126,33.167642,-117.536254,53.6,49.7,57.2,0.08,212.6,-1.0,,,2015-12-28 00:00:00+00:00
2249937,9127,33.167642,-117.536254,52.6,48.1,56.8,0.07,557.9,-1.0,,,2015-12-29 00:00:00+00:00
2249938,9128,33.167642,-117.536254,53.9,49.7,57.7,0.00,568.3,-2.0,,,2015-12-30 00:00:00+00:00


## Add MGRS for Weather History

In [None]:
m = mgrs.MGRS() #mgrs API
hist_mgrs_1m = [] #precision 5
hist_mgrs_10m = [] #precision 4
hist_mgrs_100m = [] #precision 3
hist_mgrs_1km = []  #precision 2
hist_mgrs_10km = []  #precision 1
hist_mgrs_100km = []  #precision 0
hist_mgrs_lat_10km = []
hist_mgrs_lon_10km = []

for i, j in zip(weather_history['lat'].tolist(), weather_history['lon'].tolist()):
    hist_mgrs_1m.append(m.toMGRS(i, j, MGRSPrecision=5)) # use mgrs API to convert lat/lon to MGRS conventions
    hist_mgrs_10m.append(m.toMGRS(i, j, MGRSPrecision=4)) 
    hist_mgrs_100m.append(m.toMGRS(i, j, MGRSPrecision=3)) 
    hist_mgrs_1km.append(m.toMGRS(i, j, MGRSPrecision=2)) 
    hist_mgrs_10km.append(m.toMGRS(i, j, MGRSPrecision=1)) 
    hist_mgrs_100km.append(m.toMGRS(i, j, MGRSPrecision=0)) 

weather_history['mgrs_1m'] = hist_mgrs_1m
weather_history['mgrs_10m'] = hist_mgrs_10m
weather_history['mgrs_100m'] = hist_mgrs_100m
weather_history['mgrs_1km'] = hist_mgrs_1km
weather_history['mgrs_10km'] = hist_mgrs_10km
weather_history['mgrs_100km'] = hist_mgrs_100km

for k in weather_history['mgrs_10km'].tolist():
    hist_mgrs_lat_10km.append(m.toLatLon(k)[0])
    hist_mgrs_lon_10km.append(m.toLatLon(k)[1])

weather_history['mgrs_lat_10km'] = hist_mgrs_lat_10km
weather_history['mgrs_lon_10km'] = hist_mgrs_lon_10km

In [None]:
weather_history

Unnamed: 0.1,Unnamed: 0,lat,lon,t_mean_2m_24h_F,t_min_2m_24h_F,t_max_2m_24h_F,precip_24h_mm,sunshine_duration_24h_min,drought_index_idx,soil_moisture_index_neg15cm_idx,soil_type_idx,date,mgrs_1m,mgrs_10m,mgrs_100m,mgrs_1km,mgrs_10km,mgrs_100km,mgrs_lat_10km,mgrs_lon_10km
0,0,33.705267,-118.079125,53.0,47.6,58.5,0.45,232.7,-4.0,0.0,2.0,2000-01-01 00:00:00+00:00,11SLT9999929999,11SLT99992999,11SLT999299,11SLT9929,11SLT92,11SLT,33.614103,-118.185780
1,1,33.705267,-118.079125,54.2,49.6,60.6,0.08,572.1,-4.0,0.0,2.0,2000-01-02 00:00:00+00:00,11SLT9999929999,11SLT99992999,11SLT999299,11SLT9929,11SLT92,11SLT,33.614103,-118.185780
2,2,33.705267,-118.079125,53.2,42.7,66.0,0.00,598.0,-4.0,0.0,2.0,2000-01-03 00:00:00+00:00,11SLT9999929999,11SLT99992999,11SLT999299,11SLT9929,11SLT92,11SLT,33.614103,-118.185780
3,3,33.705267,-118.079125,55.1,44.2,67.5,0.00,598.9,-4.0,0.0,2.0,2000-01-04 00:00:00+00:00,11SLT9999929999,11SLT99992999,11SLT999299,11SLT9929,11SLT92,11SLT,33.614103,-118.185780
4,4,33.705267,-118.079125,54.9,44.0,68.9,0.00,598.5,-4.0,0.0,2.0,2000-01-05 00:00:00+00:00,11SLT9999929999,11SLT99992999,11SLT999299,11SLT9929,11SLT92,11SLT,33.614103,-118.185780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2249935,9125,33.167642,-117.536254,54.2,48.3,61.9,0.00,598.0,-1.0,,,2015-12-27 00:00:00+00:00,11SMS5000069999,11SMS50006999,11SMS500699,11SMS5069,11SMS56,11SMS,33.077444,-117.535706
2249936,9126,33.167642,-117.536254,53.6,49.7,57.2,0.08,212.6,-1.0,,,2015-12-28 00:00:00+00:00,11SMS5000069999,11SMS50006999,11SMS500699,11SMS5069,11SMS56,11SMS,33.077444,-117.535706
2249937,9127,33.167642,-117.536254,52.6,48.1,56.8,0.07,557.9,-1.0,,,2015-12-29 00:00:00+00:00,11SMS5000069999,11SMS50006999,11SMS500699,11SMS5069,11SMS56,11SMS,33.077444,-117.535706
2249938,9128,33.167642,-117.536254,53.9,49.7,57.7,0.00,568.3,-2.0,,,2015-12-30 00:00:00+00:00,11SMS5000069999,11SMS50006999,11SMS500699,11SMS5069,11SMS56,11SMS,33.077444,-117.535706


## Persist Weather History


In [None]:
weather_history.to_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/weather_history_mgrs.csv')

# Calendar Data

Get Calendar Data

In [None]:
cal_table = getCalTbl()
cal_table = cal_table[cal_table.month_name.isin(months_used)] # filter out winter months
cal_table['key'] = int(0) # dummy key for cross join later
cal_table.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3904 entries, 91 to 5812
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              3904 non-null   datetime64[ns]
 1   year              3904 non-null   int64         
 2   day_of_year       3904 non-null   int64         
 3   month             3904 non-null   int64         
 4   month_name        3904 non-null   object        
 5   day               3904 non-null   int64         
 6   day_of_week       3904 non-null   int64         
 7   day_of_week_name  3904 non-null   object        
 8   key               3904 non-null   int64         
dtypes: datetime64[ns](1), int64(6), object(2)
memory usage: 305.0+ KB


Get Weather Data and Impute Missing Values

In [None]:
#usgs_weather = pd.read_csv('usgs_weather.csv')
usgs_weather = pd.read_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/usgs_weather.csv')
usgs_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38204 entries, 0 to 38203
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   STATION    38204 non-null  object 
 1   NAME       38204 non-null  object 
 2   LATITUDE   38204 non-null  float64
 3   LONGITUDE  38204 non-null  float64
 4   ELEVATION  38204 non-null  float64
 5   DATE       38204 non-null  object 
 6   PRCP       37863 non-null  float64
 7   TMAX       36571 non-null  float64
 8   TMIN       36570 non-null  float64
 9   TOBS       24269 non-null  float64
dtypes: float64(7), object(3)
memory usage: 2.9+ MB


In [None]:
prcp_temp = pd.DataFrame()
prcp_temp['station_name'] = usgs_weather['NAME']
prcp_temp['lat'] = usgs_weather['LATITUDE']
prcp_temp['lon'] = usgs_weather['LONGITUDE']
prcp_temp['date'] = pd.to_datetime(usgs_weather['DATE'])
prcp_temp['prcp'] = usgs_weather['PRCP']
prcp_temp['tmax'] = usgs_weather['TMAX']
prcp_temp['tmin'] = usgs_weather['TMIN']
prcp_temp = prcp_temp[prcp_temp.date.dt.month_name().isin(months_used)]
prcp_temp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25294 entries, 91 to 38203
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   station_name  25294 non-null  object        
 1   lat           25294 non-null  float64       
 2   lon           25294 non-null  float64       
 3   date          25294 non-null  datetime64[ns]
 4   prcp          25082 non-null  float64       
 5   tmax          24180 non-null  float64       
 6   tmin          24195 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 1.5+ MB


In [None]:
df_hemet = cal_table.merge(prcp_temp[prcp_temp.station_name == 'HEMET, CA US'], on='date', how = 'left')
df_cuymaca = cal_table.merge(prcp_temp[prcp_temp.station_name == 'CUYAMACA, CA US'], on='date', how = 'left')
df_la = cal_table.merge(prcp_temp[prcp_temp.station_name == 'LOS ANGELES DOWNTOWN USC, CA US'], on='date', how = 'left')
df_elsinore = cal_table.merge(prcp_temp[prcp_temp.station_name == 'ELSINORE, CA US'], on='date', how = 'left')
df_vista = cal_table.merge(prcp_temp[prcp_temp.station_name == 'VISTA, CA US'], on='date', how = 'left')
print(df_hemet.shape, df_cuymaca.shape, df_la.shape, df_elsinore.shape, df_vista.shape)

df_hemet['station_name'] = df_hemet['station_name'].fillna('HEMET, CA US')
df_cuymaca['station_name'] = df_cuymaca['station_name'].fillna('CUYAMACA, CA US')
df_la['station_name'] = df_la['station_name'].fillna('LOS ANGELES DOWNTOWN USC, CA US')
df_elsinore['station_name'] = df_elsinore['station_name'].fillna('ELSINORE, CA US')
df_vista['station_name'] = df_vista['station_name'].fillna('VISTA, CA US')

df_hemet['lat'] = df_hemet['lat'].fillna(33.7381)
df_cuymaca['lat'] = df_cuymaca['lat'].fillna(32.9897)
df_la['lat'] = df_la['lat'].fillna(33.686)
df_elsinore['lat'] = df_elsinore['lat'].fillna(34.0236)
df_vista['lat'] = df_vista['lat'].fillna(33.2354)

df_hemet['lon'] = df_hemet['lon'].fillna(-116.8939)
df_cuymaca['lon'] = df_cuymaca['lon'].fillna(-116.5872)
df_la['lon'] = df_la['lon'].fillna(-117.3458)
df_elsinore['lon'] = df_elsinore['lon'].fillna(-118.2911)
df_vista['lon'] = df_vista['lon'].fillna(-117.2322)

data = pd.concat([df_hemet, df_cuymaca, df_elsinore, df_la, df_vista])
data.reset_index(drop=True, inplace=True)
print(data.shape)

(3904, 15) (3904, 15) (3904, 15) (3904, 15) (3904, 15)
(19520, 15)


In [None]:
missingByCol(data)

prcp 587
tmax 1354
tmin 1338


In [None]:
weather_nulls = data[data.isna().any(axis=1)]
weather_nulls

Unnamed: 0,date,year,day_of_year,month,month_name,day,day_of_week,day_of_week_name,key,station_name,lat,lon,prcp,tmax,tmin
494,2002-04-07,2002,97,4,April,7,6,Sunday,0,"HEMET, CA US",33.7381,-116.8939,0.0,73.0,
1433,2005-10-31,2005,304,10,October,31,0,Monday,0,"HEMET, CA US",33.7381,-116.8939,0.0,,48.0
1517,2006-05-24,2006,144,5,May,24,2,Wednesday,0,"HEMET, CA US",33.7381,-116.8939,0.0,94.0,
1524,2006-05-31,2006,151,5,May,31,2,Wednesday,0,"HEMET, CA US",33.7381,-116.8939,0.0,,
2046,2008-07-04,2008,186,7,July,4,4,Friday,0,"HEMET, CA US",33.7381,-116.8939,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18544,2012-04-01,2012,92,4,April,1,6,Sunday,0,"VISTA, CA US",33.2354,-117.2322,,,
18545,2012-04-02,2012,93,4,April,2,0,Monday,0,"VISTA, CA US",33.2354,-117.2322,,,
18546,2012-04-03,2012,94,4,April,3,1,Tuesday,0,"VISTA, CA US",33.2354,-117.2322,,,
18568,2012-04-25,2012,116,4,April,25,2,Wednesday,0,"VISTA, CA US",33.2354,-117.2322,,,


In [None]:
data_missing = data.copy()
data_missing.drop(['date','year','day_of_year','month','month_name','day','day_of_week','day_of_week_name','key','station_name'], axis = 1, inplace = True)
imputer = KNNImputer(n_neighbors=2)
data_imputed = pd.DataFrame(imputer.fit_transform(data_missing), columns = data_missing.columns) # impute missing data here before the merge
missingByCol(data_imputed) # check for missing values

In [None]:
data_imputed.loc[494]

lat      33.7381
lon    -116.8939
prcp      0.0000
tmax     73.0000
tmin     44.5000
Name: 494, dtype: float64

In [None]:
print(len(data_imputed))

data.drop(['lat','lon','prcp','tmax','tmin'], axis = 1, inplace = True)
data['lat'] = data_imputed['lat']
data['lon'] = data_imputed['lon']
data['prcp'] = data_imputed['prcp']
data['tmax'] = data_imputed['tmax']
data['tmin'] = data_imputed['tmin']
missingByCol(data) # check for missing values

19520


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19520 entries, 0 to 19519
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              19520 non-null  datetime64[ns]
 1   year              19520 non-null  int64         
 2   day_of_year       19520 non-null  int64         
 3   month             19520 non-null  int64         
 4   month_name        19520 non-null  object        
 5   day               19520 non-null  int64         
 6   day_of_week       19520 non-null  int64         
 7   day_of_week_name  19520 non-null  object        
 8   key               19520 non-null  int64         
 9   station_name      19520 non-null  object        
 10  lat               19520 non-null  float64       
 11  lon               19520 non-null  float64       
 12  prcp              19520 non-null  float64       
 13  tmax              19520 non-null  float64       
 14  tmin              1952

In [None]:
data

Unnamed: 0,date,year,day_of_year,month,month_name,day,day_of_week,day_of_week_name,key,station_name,lat,lon,prcp,tmax,tmin
0,2000-04-01,2000,92,4,April,1,5,Saturday,0,"HEMET, CA US",33.7381,-116.8939,0.00,80.0,44.0
1,2000-04-02,2000,93,4,April,2,6,Sunday,0,"HEMET, CA US",33.7381,-116.8939,0.00,82.0,44.0
2,2000-04-03,2000,94,4,April,3,0,Monday,0,"HEMET, CA US",33.7381,-116.8939,0.00,87.0,43.0
3,2000-04-04,2000,95,4,April,4,1,Tuesday,0,"HEMET, CA US",33.7381,-116.8939,0.00,91.0,41.0
4,2000-04-05,2000,96,4,April,5,2,Wednesday,0,"HEMET, CA US",33.7381,-116.8939,0.00,86.0,45.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19515,2015-11-26,2015,330,11,November,26,3,Thursday,0,"VISTA, CA US",33.2354,-117.2322,0.00,62.0,45.0
19516,2015-11-27,2015,331,11,November,27,4,Friday,0,"VISTA, CA US",33.2354,-117.2322,0.51,61.0,46.0
19517,2015-11-28,2015,332,11,November,28,5,Saturday,0,"VISTA, CA US",33.2354,-117.2322,0.00,65.0,40.0
19518,2015-11-29,2015,333,11,November,29,6,Sunday,0,"VISTA, CA US",33.2354,-117.2322,0.00,67.0,44.0


In [None]:
# rolling averages for prcp, tmax, tmin for 3, 7, and 10 days

#prcp
data['prcp_rolling_3day'] = data.groupby('station_name')['prcp'].transform(lambda x: x.rolling(3).mean())
data['prcp_rolling_3day'] = round(data['prcp_rolling_3day'],6)

data['prcp_rolling_7day'] = data.groupby('station_name')['prcp'].transform(lambda x: x.rolling(7).mean())
data['prcp_rolling_7day'] = round(data['prcp_rolling_7day'],6)

data['prcp_rolling_14day'] = data.groupby('station_name')['prcp'].transform(lambda x: x.rolling(14).mean())
data['prcp_rolling_14day'] = round(data['prcp_rolling_14day'],6)

#tmax
data['tmax_rolling_3day'] = data.groupby('station_name')['tmax'].transform(lambda x: x.rolling(3).mean())
data['tmax_rolling_3day'] = round(data['tmax_rolling_3day'],6)

data['tmax_rolling_7day'] = data.groupby('station_name')['tmax'].transform(lambda x: x.rolling(7).mean())
data['tmax_rolling_7day'] = round(data['tmax_rolling_7day'],6)

data['tmax_rolling_14day'] = data.groupby('station_name')['tmax'].transform(lambda x: x.rolling(14).mean())
data['tmax_rolling_14day'] = round(data['tmax_rolling_14day'],6)

#tmin
data['tmin_rolling_3day'] = data.groupby('station_name')['tmin'].transform(lambda x: x.rolling(3).mean())
data['tmin_rolling_3day'] = round(data['tmin_rolling_3day'],6)

data['tmin_rolling_7day'] = data.groupby('station_name')['tmin'].transform(lambda x: x.rolling(7).mean())
data['tmin_rolling_7day'] = round(data['tmin_rolling_7day'],6)

data['tmin_rolling_14day'] = data.groupby('station_name')['tmin'].transform(lambda x: x.rolling(14).mean())
data['tmin_rolling_14day'] = round(data['tmin_rolling_14day'],6)

In [None]:
missingByCol(data)

prcp_rolling_3day 10
prcp_rolling_7day 30
prcp_rolling_14day 65
tmax_rolling_3day 10
tmax_rolling_7day 30
tmax_rolling_14day 65
tmin_rolling_3day 10
tmin_rolling_7day 30
tmin_rolling_14day 65


In [None]:
data_missing2 = data.copy()
data_missing2.drop(['date','year','day_of_year','month','month_name','day','day_of_week','day_of_week_name','key','station_name'], axis = 1, inplace = True)
imputer2 = KNNImputer(n_neighbors=2)
data_imputed2 = pd.DataFrame(imputer2.fit_transform(data_missing2), columns = data_missing2.columns) # impute missing data here before the merge
missingByCol(data_imputed2) # check for missing values

In [None]:
data_imputed2

Unnamed: 0,lat,lon,prcp,tmax,tmin,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day
0,33.7381,-116.8939,0.00,80.0,44.0,0.006666,0.002857,0.017857,77.500000,75.500000,78.178572,43.333333,43.500000,45.142858
1,33.7381,-116.8939,0.00,82.0,44.0,0.000000,0.039286,0.022857,81.000000,76.285714,72.428571,43.500000,43.000000,44.607143
2,33.7381,-116.8939,0.00,87.0,43.0,0.000000,0.000000,0.011428,83.000000,78.214286,74.821428,43.666667,44.071428,43.464286
3,33.7381,-116.8939,0.00,91.0,41.0,0.000000,0.000000,0.001429,86.666667,79.785714,82.071429,42.666667,43.857143,44.928572
4,33.7381,-116.8939,0.00,86.0,45.0,0.000000,0.000000,0.000000,88.000000,85.214286,79.571428,43.000000,42.500000,42.107142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19515,33.2354,-117.2322,0.00,62.0,45.0,0.036667,0.015714,0.010714,64.333333,75.285714,74.357143,47.333333,50.714286,48.428571
19516,33.2354,-117.2322,0.51,61.0,46.0,0.206667,0.088571,0.047143,61.333333,72.571429,73.214286,47.000000,50.142857,48.285714
19517,33.2354,-117.2322,0.00,65.0,40.0,0.170000,0.088571,0.047143,62.666667,69.285714,72.428571,43.666667,48.000000,47.857143
19518,33.2354,-117.2322,0.00,67.0,44.0,0.170000,0.088571,0.047143,64.333333,66.571429,72.285714,43.333333,46.000000,47.500000


In [None]:
print(len(data_imputed2))

data.drop(['prcp_rolling_3day','prcp_rolling_7day','prcp_rolling_14day','tmax_rolling_3day','tmax_rolling_7day','tmax_rolling_14day','tmin_rolling_3day','tmin_rolling_7day','tmin_rolling_14day'], axis = 1, inplace = True)
data['prcp_rolling_3day'] = data_imputed2['prcp_rolling_3day']
data['prcp_rolling_7day'] = data_imputed2['prcp_rolling_7day']
data['prcp_rolling_14day'] = data_imputed2['prcp_rolling_14day']
data['tmax_rolling_3day'] = data_imputed2['tmax_rolling_3day']
data['tmax_rolling_7day'] = data_imputed2['tmax_rolling_7day']
data['tmax_rolling_14day'] = data_imputed2['tmax_rolling_14day']
data['tmin_rolling_3day'] = data_imputed2['tmin_rolling_3day']
data['tmin_rolling_7day'] = data_imputed2['tmin_rolling_7day']
data['tmin_rolling_14day'] = data_imputed2['tmin_rolling_14day']
missingByCol(data) # check for missing values

19520


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stabl

In [None]:
to_mgrs_10km, to_mgrs_100km = [],[] # get MGRS for each weather feed (5, 1 for each 100 km block)
w_lat = data['lat'].tolist()
w_lon = data['lon'].tolist()
for i, j in zip(w_lat, w_lon):
    to_mgrs_10km.append(m.toMGRS(i, j, MGRSPrecision=1)) 
    to_mgrs_100km.append(m.toMGRS(i, j, MGRSPrecision=0)) 
data['mgrs_100km'] = to_mgrs_100km # 
data['mgrs_10km'] = to_mgrs_10km

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19520 entries, 0 to 19519
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   date                19520 non-null  datetime64[ns]
 1   day_of_year         19520 non-null  int64         
 2   month_name          19520 non-null  object        
 3   day                 19520 non-null  int64         
 4   day_of_week         19520 non-null  int64         
 5   day_of_week_name    19520 non-null  object        
 6   station_name        19520 non-null  object        
 7   mgrs_100km          19520 non-null  object        
 8   lat                 19520 non-null  float64       
 9   lon                 19520 non-null  float64       
 10  prcp                19520 non-null  float64       
 11  prcp_rolling_3day   19520 non-null  float64       
 12  prcp_rolling_7day   19520 non-null  float64       
 13  prcp_rolling_14day  19520 non-null  float64   

In [None]:
data.head()

Unnamed: 0,date,day_of_year,month_name,day,day_of_week,day_of_week_name,station_name,mgrs_100km,lat,lon,prcp,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmin,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day,tmax,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,mgrs_10km
0,2000-04-01,92,April,1,5,Saturday,"HEMET, CA US",11SNT,33.7381,-116.8939,0.0,0.006666,0.002857,0.017857,44.0,43.333333,43.5,45.142858,80.0,77.5,75.5,78.178572,11SNT03
1,2000-04-02,93,April,2,6,Sunday,"HEMET, CA US",11SNT,33.7381,-116.8939,0.0,0.0,0.039286,0.022857,44.0,43.5,43.0,44.607143,82.0,81.0,76.285714,72.428571,11SNT03
2,2000-04-03,94,April,3,0,Monday,"HEMET, CA US",11SNT,33.7381,-116.8939,0.0,0.0,0.0,0.011428,43.0,43.666667,44.071428,43.464286,87.0,83.0,78.214286,74.821428,11SNT03
3,2000-04-04,95,April,4,1,Tuesday,"HEMET, CA US",11SNT,33.7381,-116.8939,0.0,0.0,0.0,0.001429,41.0,42.666667,43.857143,44.928572,91.0,86.666667,79.785714,82.071429,11SNT03
4,2000-04-05,96,April,5,2,Wednesday,"HEMET, CA US",11SNT,33.7381,-116.8939,0.0,0.0,0.0,0.0,45.0,43.0,42.5,42.107142,86.0,88.0,85.214286,79.571428,11SNT03


In [None]:
#data = data[['date', 'day_of_year' ,'month_name' ,'day' ,'day_of_week' ,'day_of_week_name' ,'station_name', 'mgrs_100km' ,'lat' ,'lon' ,'prcp' ,'prcp_rolling_3day' ,'prcp_rolling_7day' ,'prcp_rolling_14day' ,'tmin' ,'tmin_rolling_3day','tmin_rolling_7day','tmin_rolling_14day','tmax','tmax_rolling_3day','tmax_rolling_7day','tmax_rolling_14day']]

# jrw
data = data[['date', 'day_of_year' ,'month_name' ,'day' ,'day_of_week' ,'day_of_week_name' ,'station_name', 'mgrs_100km', 'mgrs_10km', 'lat' ,'lon' ,'prcp' ,'prcp_rolling_3day' ,'prcp_rolling_7day' ,'prcp_rolling_14day' ,'tmin' ,'tmin_rolling_3day','tmin_rolling_7day','tmin_rolling_14day','tmax','tmax_rolling_3day','tmax_rolling_7day','tmax_rolling_14day']]

# write out data to csv before dropping columns
data.to_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/weather_data_socal_mgrs.csv')

# Resume Here

In [5]:
# read data from csv to pick up where left off

# data df
data = pd.read_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/weather_data_socal_mgrs.csv')

# socal_lite
socal_lite = pd.read_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/socal_lite.csv')

# weather_history_mgrs
weather_history_mgrs = pd.read_csv('/content/gdrive/MyDrive/SMU/msds_6120_capstone_a/project/src/git_repos/weather_history_mgrs.csv')

In [None]:
data

Unnamed: 0.1,Unnamed: 0,date,day_of_year,month_name,day,day_of_week,day_of_week_name,station_name,mgrs_100km,mgrs_10km,lat,lon,prcp,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmin,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day,tmax,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day
0,0,2000-04-01,92,April,1,5,Saturday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.006667,0.002857,0.017857,44.0,43.333334,43.500000,45.142858,80.0,77.500000,75.500000,78.178571
1,1,2000-04-02,93,April,2,6,Sunday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.039286,0.022857,44.0,43.500000,43.000000,44.607143,82.0,81.000000,76.285714,72.428571
2,2,2000-04-03,94,April,3,0,Monday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.011428,43.0,43.666667,44.071429,43.464285,87.0,83.000000,78.214286,74.821429
3,3,2000-04-04,95,April,4,1,Tuesday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.001429,41.0,42.666667,43.857143,44.928571,91.0,86.666667,79.785714,82.071429
4,4,2000-04-05,96,April,5,2,Wednesday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.000000,45.0,43.000000,42.500000,42.107143,86.0,88.000000,85.214286,79.571429
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19515,19515,2015-11-26,330,November,26,3,Thursday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.036667,0.015714,0.010714,45.0,47.333333,50.714286,48.428571,62.0,64.333333,75.285714,74.357143
19516,19516,2015-11-27,331,November,27,4,Friday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.51,0.206667,0.088571,0.047143,46.0,47.000000,50.142857,48.285714,61.0,61.333333,72.571429,73.214286
19517,19517,2015-11-28,332,November,28,5,Saturday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.170000,0.088571,0.047143,40.0,43.666667,48.000000,47.857143,65.0,62.666667,69.285714,72.428571
19518,19518,2015-11-29,333,November,29,6,Sunday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.170000,0.088571,0.047143,44.0,43.333333,46.000000,47.500000,67.0,64.333333,66.571429,72.285714


In [None]:
# data.to_csv('weather_data.csv')
socal_lite

Unnamed: 0.1,Unnamed: 0,mgrs_100km,mgrs_10km,lat,lon,date
0,0,11SMT,11SMT27,34.067686,-117.866966,2005-07-03
1,1,11SMT,11SMT53,33.708802,-117.539594,2005-06-19
2,2,11SNS,11SNS16,33.078549,-116.892857,2005-07-31
3,3,11SNS,11SNS23,32.807799,-116.786366,2005-07-04
4,4,11SNS,11SNS08,33.258997,-117.000000,2005-07-05
...,...,...,...,...,...,...
8052,8052,11SNS,11SNS12,32.717728,-116.893290,2015-04-09
8053,8053,11SMT,11SMT95,33.890318,-117.108147,2009-05-03
8054,8054,11SNS,11SNS04,32.898187,-117.000000,2010-05-29
8055,8055,11SNS,11SNS04,32.898187,-117.000000,2014-05-29


In [None]:
data2 = data.merge(socal_lite, on = ('date', 'mgrs_100km'), how = 'left')

In [None]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22320 entries, 0 to 22319
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0_x        22320 non-null  int64  
 1   date                22320 non-null  object 
 2   day_of_year         22320 non-null  int64  
 3   month_name          22320 non-null  object 
 4   day                 22320 non-null  int64  
 5   day_of_week         22320 non-null  int64  
 6   day_of_week_name    22320 non-null  object 
 7   station_name        22320 non-null  object 
 8   mgrs_100km          22320 non-null  object 
 9   mgrs_10km_x         22320 non-null  object 
 10  lat_x               22320 non-null  float64
 11  lon_x               22320 non-null  float64
 12  prcp                22320 non-null  float64
 13  prcp_rolling_3day   22320 non-null  float64
 14  prcp_rolling_7day   22320 non-null  float64
 15  prcp_rolling_14day  22320 non-null  float64
 16  tmin

In [None]:
data2

Unnamed: 0,Unnamed: 0_x,date,day_of_year,month_name,day,day_of_week,day_of_week_name,station_name,mgrs_100km,mgrs_10km_x,lat_x,lon_x,prcp,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmin,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day,tmax,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,Unnamed: 0_y,mgrs_10km_y,lat_y,lon_y
0,0,2000-04-01,92,April,1,5,Saturday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.006667,0.002857,0.017857,44.0,43.333334,43.500000,45.142858,80.0,77.500000,75.500000,78.178571,2698.0,11SNT72,33.617485,-116.24537
1,1,2000-04-02,93,April,2,6,Sunday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.039286,0.022857,44.0,43.500000,43.000000,44.607143,82.0,81.000000,76.285714,72.428571,,,,
2,2,2000-04-03,94,April,3,0,Monday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.011428,43.0,43.666667,44.071429,43.464285,87.0,83.000000,78.214286,74.821429,,,,
3,3,2000-04-04,95,April,4,1,Tuesday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.001429,41.0,42.666667,43.857143,44.928571,91.0,86.666667,79.785714,82.071429,,,,
4,4,2000-04-05,96,April,5,2,Wednesday,"HEMET, CA US",11SNT,11SNT03,33.7381,-116.8939,0.00,0.000000,0.000000,0.000000,45.0,43.000000,42.500000,42.107143,86.0,88.000000,85.214286,79.571429,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22315,19515,2015-11-26,330,November,26,3,Thursday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.036667,0.015714,0.010714,45.0,47.333333,50.714286,48.428571,62.0,64.333333,75.285714,74.357143,,,,
22316,19516,2015-11-27,331,November,27,4,Friday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.51,0.206667,0.088571,0.047143,46.0,47.000000,50.142857,48.285714,61.0,61.333333,72.571429,73.214286,,,,
22317,19517,2015-11-28,332,November,28,5,Saturday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.170000,0.088571,0.047143,40.0,43.666667,48.000000,47.857143,65.0,62.666667,69.285714,72.428571,,,,
22318,19518,2015-11-29,333,November,29,6,Sunday,"VISTA, CA US",11SMS,11SMS77,33.2354,-117.2322,0.00,0.170000,0.088571,0.047143,44.0,43.333333,46.000000,47.500000,67.0,64.333333,66.571429,72.285714,,,,


In [None]:
#data2.drop(['date', 'station_name', 'mgrs_100km', 'lat_x', 'lon_x', 'lat_y', 'lon_y'], axis = 1, inplace = True)

# rename the columns to persist the fire locations
data2.rename(columns = {'mgrs_10km_y': 'mgrs_10km_fire', 'lat_y': 'lat_fire', 'lon_y':'lon_fire', 'Unnamed: 0_y':'index_fire'}, inplace = True)

data2.drop(['Unnamed: 0_x', 'station_name', 'mgrs_100km', 'lat_x', 'lon_x'], axis = 1, inplace = True)


In [None]:
data2

Unnamed: 0,date,day_of_year,month_name,day,day_of_week,day_of_week_name,mgrs_10km_x,prcp,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmin,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day,tmax,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,index_fire,mgrs_10km_fire,lat_fire,lon_fire
0,2000-04-01,92,April,1,5,Saturday,11SNT03,0.00,0.006667,0.002857,0.017857,44.0,43.333334,43.500000,45.142858,80.0,77.500000,75.500000,78.178571,2698.0,11SNT72,33.617485,-116.24537
1,2000-04-02,93,April,2,6,Sunday,11SNT03,0.00,0.000000,0.039286,0.022857,44.0,43.500000,43.000000,44.607143,82.0,81.000000,76.285714,72.428571,,,,
2,2000-04-03,94,April,3,0,Monday,11SNT03,0.00,0.000000,0.000000,0.011428,43.0,43.666667,44.071429,43.464285,87.0,83.000000,78.214286,74.821429,,,,
3,2000-04-04,95,April,4,1,Tuesday,11SNT03,0.00,0.000000,0.000000,0.001429,41.0,42.666667,43.857143,44.928571,91.0,86.666667,79.785714,82.071429,,,,
4,2000-04-05,96,April,5,2,Wednesday,11SNT03,0.00,0.000000,0.000000,0.000000,45.0,43.000000,42.500000,42.107143,86.0,88.000000,85.214286,79.571429,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22315,2015-11-26,330,November,26,3,Thursday,11SMS77,0.00,0.036667,0.015714,0.010714,45.0,47.333333,50.714286,48.428571,62.0,64.333333,75.285714,74.357143,,,,
22316,2015-11-27,331,November,27,4,Friday,11SMS77,0.51,0.206667,0.088571,0.047143,46.0,47.000000,50.142857,48.285714,61.0,61.333333,72.571429,73.214286,,,,
22317,2015-11-28,332,November,28,5,Saturday,11SMS77,0.00,0.170000,0.088571,0.047143,40.0,43.666667,48.000000,47.857143,65.0,62.666667,69.285714,72.428571,,,,
22318,2015-11-29,333,November,29,6,Sunday,11SMS77,0.00,0.170000,0.088571,0.047143,44.0,43.333333,46.000000,47.500000,67.0,64.333333,66.571429,72.285714,,,,


In [None]:
~data2['mgrs_10km_fire'].isna()

0         True
1        False
2        False
3        False
4        False
         ...  
22315    False
22316    False
22317    False
22318    False
22319    False
Name: mgrs_10km_fire, Length: 22320, dtype: bool

In [None]:
data2['is_fire_bool'] = ~data2['mgrs_10km_fire'].isna() # set NAs to 0
is_fire = []
for i in data2['is_fire_bool']:
    if i == False:
        is_fire.append(int(0))
    else:
        is_fire.append(int(1))

data2['is_fire'] = is_fire
#data2.drop(['mgrs_10km','is_fire_bool'], axis = 1, inplace = True) # drop more columns

In [None]:
data2.info

<bound method DataFrame.info of              date  day_of_year month_name  ...   lon_fire  is_fire_bool is_fire
0      2000-04-01           92      April  ... -116.24537          True       1
1      2000-04-02           93      April  ...        NaN         False       0
2      2000-04-03           94      April  ...        NaN         False       0
3      2000-04-04           95      April  ...        NaN         False       0
4      2000-04-05           96      April  ...        NaN         False       0
...           ...          ...        ...  ...        ...           ...     ...
22315  2015-11-26          330   November  ...        NaN         False       0
22316  2015-11-27          331   November  ...        NaN         False       0
22317  2015-11-28          332   November  ...        NaN         False       0
22318  2015-11-29          333   November  ...        NaN         False       0
22319  2015-11-30          334   November  ...        NaN         False       0

[22320 

In [None]:
data2[data2['is_fire'] == 1] #why missing fires?

Unnamed: 0,date,day_of_year,month_name,day,day_of_week,day_of_week_name,mgrs_10km_x,prcp,prcp_rolling_3day,prcp_rolling_7day,prcp_rolling_14day,tmin,tmin_rolling_3day,tmin_rolling_7day,tmin_rolling_14day,tmax,tmax_rolling_3day,tmax_rolling_7day,tmax_rolling_14day,index_fire,mgrs_10km_fire,lat_fire,lon_fire,is_fire_bool,is_fire
0,2000-04-01,92,April,1,5,Saturday,11SNT03,0.0,0.006667,0.002857,0.017857,44.0,43.333334,43.500000,45.142858,80.0,77.500000,75.500000,78.178571,2698.0,11SNT72,33.617485,-116.245370,True,1
24,2000-04-25,116,April,25,1,Tuesday,11SNT03,0.0,0.000000,0.007143,0.042857,41.0,43.333333,42.571429,42.928571,94.0,84.666667,80.571429,78.571429,2717.0,11SNT81,33.526595,-116.138473,True,1
27,2000-04-28,119,April,28,4,Friday,11SNT03,0.0,0.000000,0.000000,0.042857,47.0,49.666667,46.428571,43.857143,90.0,94.333333,86.428571,80.285714,2700.0,11SNT81,33.526595,-116.138473,True,1
28,2000-04-29,120,April,29,5,Saturday,11SNT03,0.0,0.000000,0.000000,0.042857,43.0,46.666667,46.000000,43.642857,84.0,90.000000,88.714286,81.285714,2701.0,11SNT81,33.526595,-116.138473,True,1
34,2000-05-05,126,May,5,4,Friday,11SNT03,0.0,0.000000,0.000000,0.000000,51.0,48.666667,46.571429,46.500000,89.0,90.666667,91.857143,89.142857,652.0,11SNT35,33.889938,-116.675560,True,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22209,2015-08-13,225,August,13,3,Thursday,11SMS77,0.0,0.000000,0.000000,0.000000,64.0,62.666667,63.857143,64.785714,86.0,81.666667,78.428571,80.428571,7542.0,11SMS59,33.348034,-117.537358,True,1
22219,2015-08-23,235,August,23,6,Sunday,11SMS77,0.0,0.000000,0.000000,0.000000,68.0,67.666667,67.142857,65.357143,79.0,78.333333,79.142857,82.071429,7552.0,11SMS87,33.168612,-117.214505,True,1
22223,2015-08-27,239,August,27,3,Thursday,11SMS77,0.0,0.000000,0.000000,0.000000,66.0,67.000000,67.142857,66.642857,92.0,87.333333,82.571429,83.642857,7973.0,11SMS77,33.168381,-117.321757,True,1
22260,2015-10-03,276,October,3,5,Saturday,11SMS77,0.0,0.000000,0.000000,0.000000,60.0,60.666667,60.428571,63.285714,77.0,79.333333,82.142857,84.857143,7606.0,11SMS87,33.168612,-117.214505,True,1


In [None]:
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22320 entries, 0 to 22319
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0_x        22320 non-null  int64  
 1   date                22320 non-null  object 
 2   day_of_year         22320 non-null  int64  
 3   month_name          22320 non-null  object 
 4   day                 22320 non-null  int64  
 5   day_of_week         22320 non-null  int64  
 6   day_of_week_name    22320 non-null  object 
 7   station_name        22320 non-null  object 
 8   mgrs_100km          22320 non-null  object 
 9   mgrs_10km_x         22320 non-null  object 
 10  lat_x               22320 non-null  float64
 11  lon_x               22320 non-null  float64
 12  prcp                22320 non-null  float64
 13  prcp_rolling_3day   22320 non-null  float64
 14  prcp_rolling_7day   22320 non-null  float64
 15  prcp_rolling_14day  22320 non-null  float64
 16  tmin

Build Model

In [None]:
X = data2.copy()
# X

month_dummy = pd.get_dummies(X['month_name'])
wkname_dummy = pd.get_dummies(X['day_of_week_name'])
# mgrs_dummy = pd.get_dummies(X['mgrs_10km'])

X = pd.concat((X, month_dummy), axis = 1)
X = pd.concat((X, wkname_dummy), axis = 1)
# X = pd.concat((X, mgrs_dummy), axis = 1)

y = X['is_fire']

# X.drop(['month_name','day_of_week_name','mgrs_10km','is_fire'], axis = 1, inplace = True) # drop more columns
X.drop(['month_name','day_of_week_name','is_fire'], axis = 1, inplace = True) # drop more columns
print(X.shape, y.shape)

(22320, 30) (22320,)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 8)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(17856, 30) (4464, 30) (17856,) (4464,)


In [None]:
clfsList = [LogisticRegression, RandomForestClassifier, GradientBoostingClassifier]

clf_hyper = { 
         'LogisticRegression':{
            'tol': [0.01, 0.1, 1.0]
           ,'C': [0.01, 0.1, 1.0]
           , 'solver': ['sag','saga']
           , 'class_weight': [None,'balanced']
        }
        ,
         'RandomForestClassifier':{
            'n_estimators': [50,100,150]
           ,'min_samples_leaf': [1,5,15]
           ,'class_weight': [None, 'balanced']
           ,'bootstrap': [True]
        }        
        ,
        'GradientBoostingClassifier':{
            'learning_rate': [0.001,0.01,0.1]
           ,'n_estimators': [10,50,100]
           ,'min_samples_leaf': [5,3,1]
        }
}

In [None]:
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

RandomForestClassifier()

In [None]:
clfGridSearch(np.array(X_train), np.array(y_train), 5)


Classifier with Parameters: RandomForestClassifier(n_estimators=150) 
MeanAccuracy 0.8138445885622736

Classifier with Parameters: RandomForestClassifier(class_weight='balanced') 
MeanAccuracy 0.813732245853825

Classifier with Parameters: RandomForestClassifier(class_weight='balanced', n_estimators=150) 
MeanAccuracy 0.8135086109549271

Classifier with Parameters: RandomForestClassifier() 
MeanAccuracy 0.8116602950920739

Classifier with Parameters: RandomForestClassifier(class_weight='balanced', n_estimators=50) 
MeanAccuracy 0.8106523779494077

Classifier with Parameters: RandomForestClassifier(n_estimators=50) 
MeanAccuracy 0.8090840329730945

Classifier with Parameters: RandomForestClassifier(min_samples_leaf=5) 
MeanAccuracy 0.7950829799463953

Classifier with Parameters: RandomForestClassifier(min_samples_leaf=5, n_estimators=150) 
MeanAccuracy 0.7949710919397673

Classifier with Parameters: RandomForestClassifier(min_samples_leaf=5, n_estimators=50) 
MeanAccuracy 0.79205897764

In [None]:
clf_1 = RandomForestClassifier(n_estimators=150)
clfResults(clf_1, X_train,y_train,X_test,y_test)

-----------------------Confusion Matrix------------------------ 
  Predicted|--------------------Actual------------------------ 
           |        Yes            |         No                |   Total    
-------------------------------------------------------------- 
        Yes|        1103            |        288               |    1391 
-------------------------------------------------------------- 
         No|        498             |        2575              |    3073 
-------------------------------------------------------------- 
      Total|        1601            |        2863              |    4464 

Accuracy:  0.823925 
Precision:  0.792955 
Recall:  0.688944


In [None]:
clfResults(clf, X_train, y_train, X_test, y_test)

-----------------------Confusion Matrix------------------------ 
  Predicted|--------------------Actual------------------------ 
           |        Yes            |         No                |   Total    
-------------------------------------------------------------- 
        Yes|        1095            |        297               |    1392 
-------------------------------------------------------------- 
         No|        506             |        2566              |    3072 
-------------------------------------------------------------- 
      Total|        1601            |        2863              |    4464 

Accuracy:  0.820116 
Precision:  0.786638 
Recall:  0.683948


In [None]:
importance1 = [clf_1.feature_importances_]

clf_1_importance = pd.DataFrame(importance1, columns = X_train.columns).T

clf_1_importance.columns = ['coeff']

clf_1_importance.sort_values(by = 'coeff', ascending = False)

Unnamed: 0,coeff
tmax_rolling_7day,0.11245
tmax_rolling_3day,0.107566
tmax_rolling_14day,0.103076
tmax,0.086948
tmin_rolling_14day,0.082719
day_of_year,0.080326
tmin_rolling_7day,0.074204
tmin_rolling_3day,0.067523
tmin,0.058982
day,0.054362
