In [85]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np

In [99]:
# Notebook-wide setup: working directory, model feature names and the
# positive ("fire") samples.
cwd = os.getcwd()

# Predictor columns handed to the classifier. 'Counties_Idx' is not read from
# disk; a later cell derives it on the combined frame.
columns = [
    'temperature',
    'relative_humidity',
    'wind_speed',
    'wind_direction',
    'total_precipitation',
    'volumetric_soil_water_layer_1',
    'Counties_Idx',
]

# Path is resolved relative to the directory the kernel was started in.
weather_wildfire_data_url = cwd + r"/Cleaned Data/Weather_California_Fire_Incidents.csv"

# Every record in this file is labelled as the positive class (Fire = 1).
fire_data = pd.read_csv(weather_wildfire_data_url).assign(Fire=1)


In [100]:
# Eyeball the loaded fire records; pandas abbreviates the printout to the
# first and last rows of the frame.
print(fire_data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
1595         10.0    Riverside, CA  2019-09-10T10:43:58Z   2019-09-10   
1596         10.0        Butte, CA  2019-07-23T14:41:00Z   2019-07-23   
1597         10.0     Siskiyou, CA  2019-06-16T20:33:00Z   2019-06-16   
1598         10.0        Butte, CA  2019-04-30T12:20:00Z   2019-04-30   
1599          9.0    Riverside, CA  2019-10-10T12:08:00Z   2019-10-10   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0               15        21.28               0.29       

In [101]:
# Stamp each per-county CSV with the county it belongs to.
# NOTE: the file is rewritten in place, so this cell modifies the contents of
# "Cleaned Data/<county>/" every time it runs (re-running yields the same file).
counties = fire_data["Counties"].unique()
for county in counties:
    fname = f"{cwd}/Cleaned Data/{county}/{county}_Fire_Incidents.csv"
    # Add (or overwrite) the 'Counties' column with this county's name.
    df = pd.read_csv(fname).assign(Counties=county)
    df.to_csv(fname, index=False)

In [102]:
# Build the negative ("no fire") samples from the per-county CSV files.
counties = fire_data["Counties"].unique()

# One CSV per county, in the same order as `counties`.
files = [
    f"{cwd}/Cleaned Data/{county}/{county}_Fire_Incidents.csv"
    for county in counties
]

# Read every county file and stack them under a fresh 0..n-1 index.
county_frames = [pd.read_csv(path) for path in files]
no_fire_data = pd.concat(county_frames, ignore_index=True)

# Every record coming from these files is labelled as the negative class.
no_fire_data['Fire'] = 0
print(no_fire_data)

     Started_Date  temperature  relative_humidity  wind_speed  wind_direction  \
0      2013-01-15         0.01               0.62        2.32           69.71   
1      2013-02-15         0.10               0.84        4.28           64.43   
2      2013-03-15        10.22               0.76        1.53           55.64   
3      2013-04-15         8.15               0.72        0.72          216.74   
4      2013-05-15        14.27               0.71        0.86           50.09   
...           ...          ...                ...         ...             ...   
3394   2019-07-15        23.00               0.66        4.09            9.59   
3395   2019-08-15        25.22               0.82        1.67            8.64   
3396   2019-09-15        23.63               0.56       11.14           21.09   
3397   2019-11-15        18.77               0.76        3.44          166.85   
3398   2019-12-15        16.69               0.69        3.25           41.32   

      total_precipitation  

In [103]:
# Stack the fire and no-fire samples into a single frame and persist it.
# Columns that exist in only one of the two inputs are filled with NaN for the
# rows coming from the other input.
url = cwd + r"/Cleaned Data/Combined_Weather_California_Fire_Incidents.csv"
labelled_frames = [fire_data, no_fire_data]
data = pd.concat(labelled_frames, sort=False, ignore_index=True)
data.to_csv(url, index=False)

In [104]:
# Eyeball the combined frame: fire rows first, followed by the no-fire rows.
print(data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
4994          NaN        Marin, CA                   NaN   2019-07-15   
4995          NaN        Marin, CA                   NaN   2019-08-15   
4996          NaN        Marin, CA                   NaN   2019-09-15   
4997          NaN        Marin, CA                   NaN   2019-11-15   
4998          NaN        Marin, CA                   NaN   2019-12-15   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0             15.0        21.28               0.29       

In [105]:
# Encode each county name as its integer position inside `counties`.
#
# This replaces a row-wise DataFrame.apply that scanned `counties` with
# np.where once per row; Index.get_indexer performs the same lookup for the
# whole column in one vectorised call and yields the same positions.
# `counties` comes from Series.unique(), so each name has exactly one position.
county_positions = pd.Index(counties).get_indexer(data['Counties'])

# get_indexer reports names that are absent from `counties` as -1. The
# original lookup raised IndexError in that case, so keep failing loudly, but
# name the offending counties instead of "index 0 is out of bounds".
unmatched = county_positions < 0
if unmatched.any():
    missing = sorted(set(data.loc[unmatched, 'Counties'].astype(str)))
    raise IndexError(f"Counties not present in `counties`: {missing}")

# NOTE(review): this index is later fed to the model as one numeric feature
# (it is listed in `columns`), which imposes an arbitrary ordering on the
# counties; consider one-hot encoding instead.
data['Counties_Idx'] = county_positions
print(data)

      AcresBurned         Counties               Started Started_Date  \
0        257314.0     Tuolumne, CA  2013-08-17T15:25:00Z   2013-08-17   
1         30274.0  Los Angeles, CA  2013-05-30T15:28:00Z   2013-05-30   
2         27531.0    Riverside, CA  2013-07-15T13:43:00Z   2013-07-15   
3         27440.0       Placer, CA  2013-08-10T16:30:00Z   2013-08-10   
4         24251.0      Ventura, CA  2013-05-02T07:01:00Z   2013-05-02   
...           ...              ...                   ...          ...   
4994          NaN        Marin, CA                   NaN   2019-07-15   
4995          NaN        Marin, CA                   NaN   2019-08-15   
4996          NaN        Marin, CA                   NaN   2019-09-15   
4997          NaN        Marin, CA                   NaN   2019-11-15   
4998          NaN        Marin, CA                   NaN   2019-12-15   

      Started_Hour  temperature  relative_humidity  wind_speed  \
0             15.0        21.28               0.29       

In [106]:
# Feature matrix: one row per sample, one column per name in `columns`.
X = data.loc[:, columns].values
# Target vector: the 0/1 'Fire' label as a flat 1-D array of length len(data).
y = data['Fire'].values

In [107]:
# Hold out 20% of the samples for evaluation (test_size=0.2); random_state is
# pinned so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [120]:
# Fit a logistic-regression classifier on the training split. Despite its
# name, `regressor` is a classifier predicting the binary 'Fire' label.
# NOTE(review): max_iter=500 is set explicitly; presumably the solver needed
# extra iterations because the features are passed in unscaled - confirm.
regressor = LogisticRegression(max_iter=500)
# Left as the cell's last expression so the fitted estimator's repr is shown.
regressor.fit(X_train, y_train)

LogisticRegression(max_iter=500)

In [121]:
# Mean accuracy on the held-out split, reported to two decimals.
# NOTE(review): the printed frames above show 1600 fire rows vs 3399 no-fire
# rows, so always predicting "no fire" would already score about 0.68; read
# the accuracy against that baseline.
test_accuracy = regressor.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.78
