In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn

## Data Processing

In [2]:
# Resolve the working directory once; the data paths in this cell are built from it.
cwd = os.getcwd()

# Column names selected from the combined frame as model inputs.
columns = [
    'temperature',
    'relative_humidity',
    'wind_speed',
    'wind_direction',
    'total_precipitation',
    'volumetric_soil_water_layer_1',
    'Counties_Idx',
]

# Load the cleaned incident table and label every row as a fire event (Fire = 1).
weather_wildfire_data_url = f"{cwd}/Cleaned Data/Weather_California_Fire_Incidents.csv"
fire_data = pd.read_csv(weather_wildfire_data_url).assign(Fire=1)


In [3]:
# Print the loaded incident frame as plain text for a quick visual check of its columns.
print(fire_data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
1595         10.0    Riverside, CA  2019-09-10T10:43:58Z   2019-09-10   
1596         10.0        Butte, CA  2019-07-23T14:41:00Z   2019-07-23   
1597         10.0     Siskiyou, CA  2019-06-16T20:33:00Z   2019-06-16   
1598         10.0        Butte, CA  2019-04-30T12:20:00Z   2019-04-30   
1599          9.0    Riverside, CA  2019-10-10T12:08:00Z   2019-10-10   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0               15        21.28               0.29       

In [4]:
# Rewrite every per-county CSV in place so each row carries its county name
# in a "Counties" column.
counties = fire_data["Counties"].unique()
for county in counties:
    fname = f"{cwd}/Cleaned Data/{county}/{county}_Fire_Incidents.csv"
    df = pd.read_csv(fname).assign(Counties=county)
    df.to_csv(fname, index=False)

In [5]:
# Collect the path of every per-county CSV, in the order counties first appear in fire_data.
counties = fire_data["Counties"].unique()
files = [
    f"{cwd}/Cleaned Data/{county}/{county}_Fire_Incidents.csv"
    for county in counties
]

# Stack all county frames into one table and label every row as a non-fire sample (Fire = 0).
county_frames = [pd.read_csv(path) for path in files]
no_fire_data = pd.concat(county_frames, ignore_index=True).assign(Fire=0)
print(no_fire_data)

     Started_Date  temperature  relative_humidity  wind_speed  wind_direction  \
0      2013-01-15         0.01               0.62        2.32           69.71   
1      2013-02-15         0.10               0.84        4.28           64.43   
2      2013-03-15        10.22               0.76        1.53           55.64   
3      2013-04-15         8.15               0.72        0.72          216.74   
4      2013-05-15        14.27               0.71        0.86           50.09   
...           ...          ...                ...         ...             ...   
3394   2019-07-15        23.00               0.66        4.09            9.59   
3395   2019-08-15        25.22               0.82        1.67            8.64   
3396   2019-09-15        23.63               0.56       11.14           21.09   
3397   2019-11-15        18.77               0.76        3.44          166.85   
3398   2019-12-15        16.69               0.69        3.25           41.32   

      total_precipitation  

In [6]:
# Destination for the merged fire / no-fire table.
url = f"{cwd}/Cleaned Data/Combined_Weather_California_Fire_Incidents.csv"

# Append the no-fire rows beneath the fire rows; columns present in only one
# frame are kept and filled with NaN for the other frame's rows.
data = pd.concat((fire_data, no_fire_data), ignore_index=True, sort=False)
data.to_csv(url, index=False)

In [7]:
# Print the combined fire / no-fire frame as plain text to eyeball the merge result.
print(data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
4994          NaN        Marin, CA                   NaN   2019-07-15   
4995          NaN        Marin, CA                   NaN   2019-08-15   
4996          NaN        Marin, CA                   NaN   2019-09-15   
4997          NaN        Marin, CA                   NaN   2019-11-15   
4998          NaN        Marin, CA                   NaN   2019-12-15   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0             15.0        21.28               0.29       

In [8]:
# Encode each county name as its integer position within `counties`.
# A single vectorized lookup replaces the row-wise apply/np.where, which scanned
# the whole `counties` array once per row.
county_positions = pd.Index(counties).get_indexer(data['Counties'])

# get_indexer marks names absent from `counties` with -1; fail loudly instead of
# letting a bogus index flow into the feature matrix.
unmatched = county_positions < 0
if unmatched.any():
    unknown_counties = data.loc[unmatched, 'Counties'].unique().tolist()
    raise ValueError(f"Counties not present in `counties`: {unknown_counties}")

data['Counties_Idx'] = county_positions
print(data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
4994          NaN        Marin, CA                   NaN   2019-07-15   
4995          NaN        Marin, CA                   NaN   2019-08-15   
4996          NaN        Marin, CA                   NaN   2019-09-15   
4997          NaN        Marin, CA                   NaN   2019-11-15   
4998          NaN        Marin, CA                   NaN   2019-12-15   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0             15.0        21.28               0.29       

In [9]:
# Feature matrix: one column per name in `columns`, one row per sample.
X = data.loc[:, columns].to_numpy()

# Label vector: the Fire column as a flat 1-D array.
y = data['Fire'].to_numpy()

In [10]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# All four split files live in one "Model Data" folder under the working directory.
folder_path = os.getcwd() + r"/Model Data"

xtrain_url = folder_path + r"/X Train.csv"
xtest_url = folder_path + r"/X Test.csv"
ytrain_url = folder_path + r"/Y Train.csv"
ytest_url = folder_path + r"/Y Test.csv"

# Create the output folder on first run.
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# Persist each split with a header row: feature names for X, "Fire" for y.
for split_url, split_values, split_columns in (
    (xtrain_url, X_train, columns),
    (xtest_url, X_test, columns),
    (ytrain_url, y_train, ["Fire"]),
    (ytest_url, y_test, ["Fire"]),
):
    pd.DataFrame(data=split_values, columns=split_columns).to_csv(split_url, index=False)

## Scikit-learn Implementation

In [28]:
# Baseline model: a logistic-regression classifier whose solver may run up to 500 iterations.
# NOTE(review): the name says "regressor" but this object is used as a classifier.
regressor = LogisticRegression(max_iter=500)
# Fit on the training split; as the cell's last expression, the fitted estimator's repr is displayed.
regressor.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [29]:
# Score the fitted classifier on the held-out split and report it to two decimals.
test_accuracy = regressor.score(X_test, y_test)
print(f'Accuracy of logistic regression classifier on test set: {test_accuracy:.2f}')

Accuracy of logistic regression classifier on test set: 0.78


## MLP Implementation

In [93]:
class MLP(nn.Module):
    """Small feed-forward binary classifier with two ReLU hidden layers.

    The network maps ``in_features`` inputs through two hidden layers of
    ``hidden_size`` units each to a single output unit, and ``forward``
    squashes that output with a sigmoid.

    Args:
        in_features: Number of input features per sample. Defaults to 7,
            the width the network was originally hard-coded for.
        hidden_size: Number of units in each hidden layer. Defaults to 20.
    """

    def __init__(self, in_features=7, hidden_size=20):
        super().__init__()
        # Kept as a single Sequential named `layers` so parameter names
        # (layers.0.weight, ...) stay stable for any saved state dicts.
        self.layers = nn.Sequential(
            nn.Linear(in_features, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1),
        )

    def forward(self, x):
        """Return sigmoid-activated outputs with a trailing dimension of size 1.

        Args:
            x: Tensor whose last dimension has ``in_features`` entries.

        Returns:
            Tensor of values between 0 and 1, with the last dimension of size 1.
        """
        return torch.sigmoid(self.layers(x))

In [94]:
# Convert every split to a torch tensor of the default floating-point type
# so it can be fed to the network and the loss.
X_train = torch.Tensor(X_train)
X_test = torch.Tensor(X_test)
y_train = torch.Tensor(y_train)
y_test = torch.Tensor(y_test)

In [102]:
# Train the MLP with full-batch gradient descent, reporting test and train
# metrics every `log_every` steps.

# Seed before the model is built so its random weight initialisation, and
# therefore the whole run, is repeatable.
torch.manual_seed(0)

model = MLP()
criterion = nn.BCELoss()  # the model's forward() already applies the sigmoid
optimizer = optim.SGD(model.parameters(), lr=0.008)

n_epochs = 1500
log_every = 100

for epoch in range(n_epochs):
    step = epoch + 1  # 1-based step counter used for logging (avoids shadowing builtin `iter`)

    optimizer.zero_grad()
    outputs = model(X_train)
    # Squeeze only the trailing unit dimension so a single-row batch keeps shape (1,)
    # and still matches the label tensor.
    loss = criterion(outputs.squeeze(-1), y_train)

    loss.backward()
    optimizer.step()

    if step % log_every == 0:
        with torch.no_grad():
            # Test metrics are computed with the freshly updated weights.
            outputs_test = model(X_test).squeeze(-1)
            loss_test = criterion(outputs_test, y_test)

            predicted_test = outputs_test.round().numpy()
            correct_test = np.sum(predicted_test == y_test.numpy())
            accuracy_test = 100 * correct_test / y_test.size(0)

            # Train metrics reuse this step's forward pass (weights before the update),
            # so they pair with the `loss` value printed below.
            predicted_train = outputs.squeeze(-1).round().detach().numpy()
            correct = np.sum(predicted_train == y_train.numpy())
            accuracy = 100 * correct / y_train.size(0)

            print(f"Iteration: {step}. \nTest - Loss: {loss_test.item()}. Accuracy: {accuracy_test}")
            print(f"Train -  Loss: {loss.item()}. Accuracy: {accuracy}\n")

Iteration: 100. 
Test - Loss: 0.5691699385643005. Accuracy: 69.9
Train -  Loss: 0.5771462917327881. Accuracy: 68.96724181045262

Iteration: 200. 
Test - Loss: 0.5504651069641113. Accuracy: 70.3
Train -  Loss: 0.5558959245681763. Accuracy: 70.76769192298075

Iteration: 300. 
Test - Loss: 0.5371274352073669. Accuracy: 70.9
Train -  Loss: 0.5412337183952332. Accuracy: 71.39284821205301

Iteration: 400. 
Test - Loss: 0.5279341340065002. Accuracy: 71.9
Train -  Loss: 0.5327427387237549. Accuracy: 72.26806701675419

Iteration: 500. 
Test - Loss: 0.5227728486061096. Accuracy: 72.1
Train -  Loss: 0.5272766351699829. Accuracy: 72.84321080270068

Iteration: 600. 
Test - Loss: 0.5169573426246643. Accuracy: 72.6
Train -  Loss: 0.5207401514053345. Accuracy: 73.36834208552138

Iteration: 700. 
Test - Loss: 0.5133823752403259. Accuracy: 73.0
Train -  Loss: 0.5157020688056946. Accuracy: 73.49337334333583

Iteration: 800. 
Test - Loss: 0.5083916187286377. Accuracy: 73.0
Train -  Loss: 0.509235978126525