In [73]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

%matplotlib inline

In [46]:
df.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,fire
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0


In [40]:
df = pd.read_csv('./data/forestfires.csv')

### data dictionary
1. X - x-axis spatial coordinate within the Montesinho park map: 1 to 9
2. Y - y-axis spatial coordinate within the Montesinho park map: 2 to 9
3. month - month of the year: "jan" to "dec" 
4. day - day of the week: "mon" to "sun"
5. FFMC - FFMC index from the FWI system: 18.7 to 96.20
6. DMC - DMC index from the FWI system: 1.1 to 291.3 
7. DC - DC index from the FWI system: 7.9 to 860.6 
8. ISI - ISI index from the FWI system: 0.0 to 56.10
9. temp - temperature in Celsius degrees: 2.2 to 33.30
10. RH - relative humidity in %: 15.0 to 100
11. wind - wind speed in km/h: 0.40 to 9.40 
12. rain - outside rain in mm/m2 : 0.0 to 6.4 
13. area - the burned area of the forest (in ha): 0.00 to 1090.84 
(this output variable is very skewed towards 0.0, thus it may make
sense to model with the logarithm transform). 

## Transform

In [41]:
df['fire'] = (df.area > 0 ) + 0

In [43]:
df.drop('area', 1, inplace= True)

In [49]:
a = pd.get_dummies(df.month)
b = pd.get_dummies(df.day)
df.drop(['month','day'],1, inplace= True)

In [55]:
df = pd.concat([df, a, b], axis = 1)
y = df.fire
X = df.drop('fire', 1)

## train test split

In [59]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = .15, random_state = 100)

In [58]:
X_train.shape, y_train.shape

((439, 48), (439,))

In [66]:
clf = LogisticRegression(class_weight='balanced')
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [68]:
print(classification_report(y_pred = clf.predict(X_test), y_true = y_test))

             precision    recall  f1-score   support

          0       0.42      0.47      0.44        38
          1       0.43      0.38      0.40        40

avg / total       0.42      0.42      0.42        78



In [74]:
confusion_matrix(y_pred = clf.predict(X_test), y_true = y_test)b

array([[18, 20],
       [25, 15]])

In [89]:
thres = .4
y_pred = [i[0] < thres for i in clf.predict_proba(X_test)]

confusion_matrix(y_pred = y_pred, y_true = y_test)

array([[32,  6],
       [37,  3]])