In [1]:
import numpy as np
import pandas as pd
import math
import json
import datetime

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesClassifier
from sklearn import preprocessing
import pickle
import joblib

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../../../data/fire_archive_M6_157443.csv')
df["confidence"] = df["confidence"].fillna(0).dropna()
df

Unnamed: 0,latitude,longitude,brightness,acq_date,confidence,bright_t31,type
0,-15.8931,136.6094,324.4,1/01/2015,28,302.3,0
1,-14.9764,145.2801,320.6,1/01/2015,24,294.8,0
2,-18.5115,139.5995,331.8,1/01/2015,37,305.0,0
3,-17.1160,122.2857,315.7,1/01/2015,0,294.9,0
4,-18.0797,122.6967,313.7,1/01/2015,31,292.9,0
...,...,...,...,...,...,...,...
671992,-22.4415,134.5244,314.9,31/12/2017,88,296.6,0
671993,-22.4800,134.5289,308.8,31/12/2017,69,295.7,0
671994,-22.4323,134.5258,313.8,31/12/2017,86,296.4,0
671995,-22.4707,134.5303,313.1,31/12/2017,83,296.7,0


In [3]:
df['month'] = pd.DataFrame(pd.DatetimeIndex(df['acq_date']).month)

In [4]:
df.dtypes

latitude      float64
longitude     float64
brightness    float64
acq_date       object
confidence      int64
bright_t31    float64
type            int64
month           int64
dtype: object

In [5]:

cols_to_pick_reg = ['latitude', 'longitude', "month", "brightness", "bright_t31"]
#cols_to_pick_reg = ["MinTemp", "MaxTemp", "AvgTemp", "Rainfall", 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Windspeed',  
#'Humidity9am', 'Humidity3pm', 'Humidity', 'Pressure9am', 'Pressure3pm', 'Pressure', 'Temp9am', 'Temp3pm', 'lat', 'long', 'Month']

all_X_reg = df.loc[:300000, cols_to_pick_reg] # not accounting for wind directions right now
all_Y_reg = df.loc[:300000, "confidence"] # confidence data

In [6]:

# x = np.array(all_X_reg["lat"])
# y = np.array(all_Y_reg)
# print(x.dtype, y.dtype)

# plt.plot(x, y)
# plt.show()

In [7]:
x_reg = all_X_reg.values #returns a numpy array
min_max_scaler_reg = preprocessing.StandardScaler()
x_scaled_reg = min_max_scaler_reg.fit_transform(x_reg)
all_X_reg = pd.DataFrame(x_scaled_reg)
all_X_reg

Unnamed: 0,0,1,2,3,4
0,0.569644,0.359934,-1.769630,-0.332255,-0.105983
1,0.692364,1.289738,-1.769630,-0.518747,-0.885764
2,0.219116,0.680578,-1.769630,0.030914,0.174738
3,0.405933,-1.176070,-1.769630,-0.759224,-0.875367
4,0.276921,-1.131996,-1.769630,-0.857378,-1.083309
...,...,...,...,...,...
299996,1.163054,-0.035731,-0.524249,-1.092947,-0.864970
299997,0.954644,1.024942,-0.524249,-1.137116,-1.301647
299998,1.162211,-0.038176,-0.524249,-0.675793,-0.844176
299999,1.163456,-0.036096,-0.524249,-1.029147,-0.864970


In [8]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(all_X_reg, all_Y_reg, random_state=11)
clf_reg = GradientBoostingRegressor()
clf_reg.fit(X_train_reg, y_train_reg)
clf_reg.score(X_test_reg, y_test_reg) 

0.535620739992944

In [9]:
clf_reg.predict(df.loc[0:80, cols_to_pick_reg])

array([62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86478555,
       62.86478555, 62.86478555, 62.86478555, 62.86478555, 62.86

In [10]:
# pickle.dump(clf_reg, open("../pickles/gradient_regressor.sav", "wb"))
joblib.dump(min_max_scaler_reg, "../pickles/gradient_regressor_scaler.sav")