#PRELIMINARIES

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#Preliminaries:
url_train = "https://github.com/lakigigar/Caltech-CS155-2021/blob/main/projects/project1/WILDFIRES_TRAIN.zip?raw=TRUE"
url_test = "https://github.com/lakigigar/Caltech-CS155-2021/blob/main/projects/project1/WILDFIRES_TEST.zip?raw=TRUE"
df_train = pd.read_csv(url_train, compression="zip", index_col="id")
df_test = pd.read_csv(url_test, compression = "zip", index_col="id")

In [2]:
#Peek at data
df_train

Unnamed: 0_level_0,LATITUDE,LONGITUDE,STATE,DISCOVERY_TIME,FIRE_SIZE,FIPS_NAME,FIPS_CODE,SOURCE_REPORTING_UNIT_NAME,DATE,LABEL
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,38.205000,-120.335000,CA,130.0,0.10,,,Stanislaus National Forest,1992-01-01,1
1,33.813100,-85.104300,GA,1115.0,1.17,Haralson,143.0,Georgia Forestry Commission,1992-01-01,4
2,32.201000,-82.498700,GA,1600.0,0.07,Montgomery,209.0,Georgia Forestry Commission,1992-01-01,2
3,32.509300,-81.708600,GA,1215.0,4.40,Bulloch,31.0,Georgia Forestry Commission,1992-01-01,4
4,33.663889,-116.171944,CA,,0.20,,,CDF - Riverside Unit,1992-01-01,2
...,...,...,...,...,...,...,...,...,...,...
285377,32.359105,-82.893909,GA,,0.25,Laurens,175.0,"GAS Ogeechee District, McRae Office",2009-12-30,4
285378,38.346342,-120.855472,CA,1031.0,0.10,,,Amador-El Dorado Unit,2009-12-30,2
285379,32.780596,-82.742433,GA,,0.01,Johnson,167.0,"GAS Oconee District, Milledgeville Office",2009-12-30,2
285380,33.985000,-116.915000,CA,,0.30,,,CDF - Riverside Unit,2009-12-30,2


#DATA: FORMATTING

In [3]:
''' HEADS UP: I have a starred-out part in each code block with defined measures
 from previous testing, as well as an empty selection below if you want to 
 experiment.'''

#Data has a lot of NaNs and three features of categorical data, which we must first transform
#Anything we do to the training data, we must also do to the testing data

#Let's build a dataframe of the input data we already have in usable form
input_data = df_train[['LATITUDE', "LONGITUDE","FIRE_SIZE", "LABEL"]].copy()
test_data = df_test[['LATITUDE', 'LONGITUDE', 'FIRE_SIZE']].copy()

#********************** KNOWN DATA EDITS ***************************************
#Date column:
#For dates, the most information will likely come from the day of the year, which
#should capture monthly and seasonal data in a continuous spread.
from datetime import datetime
day_of_year=df_train['DATE'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').timetuple().tm_yday)
input_data["DAY_OF_YR"] = day_of_year

day_test = df_test['DATE'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d').timetuple().tm_yday)
test_data['DAY_OF_YR'] = day_test

#State Column:
#The only states in the data set are GA/CA, so we can one-hot these
states = pd.get_dummies(df_train['STATE'])
input_data[["CA", "GA"]]=states

states_test = pd.get_dummies(df_test['STATE'])
test_data[["CA", "GA"]] = states_test


#For the name columns, there are about ~200 unique features for each, we should label-encode these
#This has an added advantage from turning NaNs into another category, rather than excising NaN entries
#NOTICE: FIPS_NAME and FIPS_CODE are perfectly correlated (the code gives the id number of name),
# So there is only need to use one of these two columns. Let's use FIP_CODE.
SOURCE_REPORTING_UNIT_NAME, FIPS_CODE Columns:
fipcdlbl = df_train["FIPS_CODE"].astype('category').cat.codes
srcnamelbl = df_train["SOURCE_REPORTING_UNIT_NAME"].astype('category').cat.codes
input_data["FIPCOD_CAT"] = fipcdlbl
input_data["SRCNAM_CAT"] = srcnamelbl

fipcdlbl_test = df_test["FIPS_CODE"].astype('category').cat.codes
srcnamelbl_test = df_test["SOURCE_REPORTING_UNIT_NAME"].astype('category').cat.codes
test_data["FIPCOD_CAT" ] = fipcdlbl_test
test_data["SRCNAM_CAT"] = srcnamelbl_test

#The only remaining NaNs in the dataset are in the discovery time column
#Let's normalize this column and give the NaNs a -1:
times = df_train["DISCOVERY_TIME"].to_numpy()
goodtimes = times[~np.isnan(times)]
goodtimes = (goodtimes-np.average(goodtimes))/np.std(goodtimes)
times[~np.isnan(times)]=goodtimes
times[np.isnan(times)]=-1
input_data["SCALED_TIME"]=times

time_test = df_test['DISCOVERY_TIME'].to_numpy()
time_test = (time_test - np.average(goodtimes))/np.std(goodtimes)
time_test[np.isnan(time_test)]=-1
test_data["SCALED_TIME"] = time_test

#********************** END DATA EDITS** ***************************************

'''Test any others you're interested in here/below.
#Remember, anything you do to training must be done to testing, and any scaling,
#subtraction, multiplication, etc must be done with the values found on the
#TRAINING set, even onto the test set'''

#Peek at data now
input_data[0:10]

Unnamed: 0_level_0,LATITUDE,LONGITUDE,FIRE_SIZE,LABEL,DAY_OF_YR,CA,GA,FIPNAME_CAT,FIPCOD_CAT,SRCNAM_CAT,SCALED_TIME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,38.205,-120.335,0.1,1,1,1,0,-1,-1,157,-3.095307
1,33.8131,-85.1043,1.17,4,1,0,1,82,72,71,-0.728639
2,32.201,-82.4987,0.07,2,1,0,1,130,104,71,0.436674
3,32.5093,-81.7086,4.4,4,1,0,1,19,16,71,-0.488369
4,33.663889,-116.171944,0.2,2,1,1,0,-1,-1,14,-1.0
5,33.1667,-116.6342,5.0,2,1,1,0,-1,-1,156,-0.212057
6,31.2758,-83.7555,3.2,3,1,0,1,40,36,71,0.196403
7,34.5329,-85.0082,9.6,3,1,0,1,75,65,71,0.436674
8,32.1325,-82.761,0.58,4,1,0,1,206,154,71,-3.383631
9,32.2009,-82.2967,1.02,4,1,0,1,187,139,71,-3.395645


# DATA: CLEANING/SCALING

In [4]:
#************* KNOWN CLEANING PROCEDURES ***************************************
#Scale most numerical predictor variables to be within the same range
# (Do not do this for categorical data)
latmean = input_data['LATITUDE'].mean()
latstd = input_data['LATITUDE'].std()
lonmean = input_data['LONGITUDE'].mean()
lonstd = input_data['LONGITUDE'].std()
daymax = input_data['DAY_OF_YR'].max()
input_data['LATITUDE']=(input_data['LATITUDE']-latmean)/latstd
input_data['LONGITUDE']=(input_data['LONGITUDE']-lonmean)/lonstd
input_data['DAY_OF_YR']=input_data['DAY_OF_YR']/daymax

test_data['LATITUDE'] = (test_data['LATITUDE']-latmean)/latstd
test_data['LONGITUDE'] = (test_data['LONGITUDE']-lonmean)/lonstd
test_data['DAY_OF_YR']=test_data['DAY_OF_YR']/daymax

#Remove highly-correlated predictors, unless explicitly desired
input_data.corr()
# I choose to drop none and keep them (see comment on the FIPS_NAME above)

#*******************************************************************************

'''Test any others you're interested in below. Same warning applies for doing the
same to the test set as you would to the training.'''

#OTHER OPTIONS TO CONSIDER:
# - NaN-replacement (replace NaNs with numerical mean/median or cateogorical mode)

input_data

Unnamed: 0_level_0,LATITUDE,LONGITUDE,FIRE_SIZE,LABEL,DAY_OF_YR,CA,GA,FIPNAME_CAT,FIPCOD_CAT,SRCNAM_CAT,SCALED_TIME
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,0.988290,-0.985736,0.10,1,0.002732,1,0,-1,-1,157,-3.095307
1,-0.401536,0.925103,1.17,4,0.002732,0,1,82,72,71,-0.728639
2,-0.911689,1.066425,0.07,2,0.002732,0,1,130,104,71,0.436674
3,-0.814127,1.109278,4.40,4,0.002732,0,1,19,16,71,-0.488369
4,-0.448754,-0.759941,0.20,2,0.002732,1,0,-1,-1,14,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...
285377,-0.861656,1.044990,0.25,4,0.994536,0,1,105,88,55,-1.000000
285378,1.033018,-1.013965,0.10,2,0.994536,1,0,-1,-1,0,-0.930467
285379,-0.728275,1.053206,0.01,2,0.994536,0,1,97,84,53,-1.000000
285380,-0.347138,-0.800242,0.30,2,0.994536,1,0,-1,-1,14,-1.000000


#SET-UP FOR MODEL TRAINING

In [5]:
#Now that we have usable data, we should split the resulting data into sets for cross-validation, and separate by input/output
from sklearn.model_selection import KFold

#Use standard 5-fold cross-validation
nsplits = 5
kf = KFold(n_splits = nsplits, shuffle = True)

output = input_data['LABEL']
input = input_data.drop(columns = 'LABEL')

#TRAIN/EVALUATE A CHOSEN MODEL TYPE

In [6]:
from sklearn.metrics import roc_auc_score as evalmetric
from sklearn.ensemble import RandomForestClassifier

results = np.zeros(nsplits)
idx = 0
clfs = []
for train_index, test_index in kf.split(input_data):
     print("ON SET: ", idx+1)
     X_train, X_test = input.iloc[train_index], input.iloc[test_index]
     y_train, y_test = output.iloc[train_index], output.iloc[test_index]
     
     #METHOD 1: This one tries a random-forest
     clf = RandomForestClassifier(n_estimators=400)
     clf.fit(X_train, y_train)
     ''' for a more complex model, we'll have to scrap storing the models
         in RAM, unless someone wants to try this locally. alternatively, one 
         could store one model and update it if the evaluation score improves.
         Leaving this commented out just means we use the last model.'''
     #clfs.append(clf)
     predictedlabels = clf.predict(X_test)

     #METHOD 2: Try a Pytorch Neural Net
     '''
          Implement here if you want to try.
     '''

     #After choosing model to train (comment out others), calculate eval metric     
     testlabels = y_test.to_numpy()
     metric = evalmetric(testlabels, clf.predict_proba(X_test), multi_class = "ovr")
     results[idx] = metric
     idx = idx + 1



ON SET:  1
ON SET:  2
ON SET:  3
ON SET:  4
ON SET:  5


#SUMMARY OF TRAINING

In [7]:
#Add whatever other summarries you want to add
#Benchmark to beat by TAs is 75.3%

print(results)
print("Average success across kfolds: ", np.average(results))

#********************  BEST MODELS SO FAR **************************************
#   - CREATED COLUMNS: DAY_OF_YR
#     DROPPED FEATURES: FIPS_NAME (see comment in "FORMATTING")
#     MODEL: 400-tree-forest with 5-fold cross verification
#     MODEL NOTES: standard model (gini index, etc, no arguments passed)
#     SCORE: 0.827
#     ANY OTHER NOTES: Did not append clfs, took up too much RAM and only used last one
#
#   - CREATED COLUMNS: MONTH
#     DROPPED FEATURES: FIPS_NAME (see comment in "FORMATTING")
#     MODEL: 100-tree-forest with 4-fold cross verification
#     MODEL NOTES: standard model (gini index, etc, no arguments passed)
#     SCORE: 0.813
#     ANY OTHER NOTES: Forgot to append clfs
#
#   - Add another here!
#*******************************************************************************

[0.82732412 0.82669257 0.82787539 0.82595063 0.82830263]
Average success across kfolds:  0.8272290674076851


#GENERATE TESTING FILE AND REPORT TEST METRICS



In [36]:
#Now that we know the model type we would like, use it to generate the output file:

'''only uncomment if you've stored all the clfs'''
#used_clf = clfs[np.argmax(results)]
#probs = used_clf.predict_proba(test_data)

'''use this if you haven't stored the clfs'''
probs = clf.predict_proba(test_data)
print(probs)

#WRITE TO CSV
output = pd.DataFrame(probs, columns = ["P1", "P2", "P3", "P4"])
output['id'] = test_data.index
output = output.set_index('id')
output.to_csv("test_predictions.csv")

[[0.065  0.5125 0.125  0.2975]
 [0.035  0.315  0.2275 0.4225]
 [0.095  0.49   0.1075 0.3075]
 ...
 [0.1725 0.3425 0.05   0.435 ]
 [0.02   0.5025 0.12   0.3575]
 [0.085  0.3525 0.0975 0.465 ]]
