In [1]:
import pandas as pd

# Read the cleaned forest-fires table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/forestfires_cleaned.csv')
# Preview the first five rows (5 is also head()'s default).
df.head(n=5)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0


In [5]:
# 'area' is the prediction target; every remaining column of df is a feature.
y = df['area']
X = df.drop(columns=['area'])

# Show the first rows of the features and of the target together as a tuple.
(X.head(), y.head())

(   X  Y month  day  FFMC   DMC     DC  ISI  temp  RH  wind  rain
 0  7  5   mar  fri  86.2  26.2   94.3  5.1   8.2  51   6.7   0.0
 1  7  4   oct  tue  90.6  35.4  669.1  6.7  18.0  33   0.9   0.0
 2  7  4   oct  sat  90.6  43.7  686.9  6.7  14.6  33   1.3   0.0
 3  8  6   mar  fri  91.7  33.3   77.5  9.0   8.3  97   4.0   0.2
 4  8  6   mar  sun  89.3  51.3  102.2  9.6  11.4  99   1.8   0.0,
 0    0
 1    0
 2    0
 3    0
 4    0
 Name: area, dtype: int64)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# First split: hold out 30% of the rows as a temporary pool and keep the rest for training.
# Stratifying on y keeps the label proportions the same in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.30,
    random_state=42,
)

In [8]:
# Second split: cut the held-out pool in half, giving validation and test sets,
# again stratified on the labels of the pool being split.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.50,
    random_state=42,
)

In [9]:
# Row counts of the train / validation / test feature frames, in that order.
tuple(len(part) for part in (X_train, X_val, X_test))

(361, 77, 78)

In [10]:
# Treat month and day as categorical; every other column of X is treated as numeric.
categorical_cols = ['month', 'day']
# Boolean-mask the column index so the original column order is kept.
numeric_cols = X.columns[~X.columns.isin(categorical_cols)].tolist()

(categorical_cols, numeric_cols)

(['month', 'day'],
 ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain'])

In [11]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column-wise preprocessing:
#   'cat' -> one-hot encode the categorical columns (handle_unknown='ignore' is set so
#            categories not seen during fit do not raise at transform time)
#   'num' -> standardize the numeric columns
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numeric_cols),
])

In [12]:
# Fit the encoder and scaler on the training split only; validation and test rows are not used here.
preprocessor.fit(X=X_train)

In [13]:
# Apply the already-fitted preprocessor to each split, in train / val / test order.
X_train_transformed, X_val_transformed, X_test_transformed = (
    preprocessor.transform(split) for split in (X_train, X_val, X_test)
)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
# Logistic-regression classifier with the solver's iteration cap set to 500,
# trained on the preprocessed training features and their labels.
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X=X_train_transformed, y=y_train)

In [16]:
# Predicted labels for the preprocessed validation features.
y_val_pred = log_reg.predict(X=X_val_transformed)

In [17]:
# Fraction of validation rows whose predicted label equals the true label.
val_accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
val_accuracy

0.5974025974025974