In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sbs

In [2]:
# Load the forest-fire table from a CSV expected in the working directory.
data = pd.read_csv("clustered_forestfires.csv")
# Response transform log(1 + area). np.log1p is the library routine for this
# and stays accurate for very small values, where forming (area + 1) first
# loses precision. NOTE(review): presumably area is non-negative - confirm.
data["log_area"] = np.log1p(data["area"])

In [3]:
# Two-level month indicator: every row starts at 1, then rows whose month is
# "aug" or "sep" are switched to 2.
data["month_type"] = 1
data.loc[data["month"].isin(["aug", "sep"]), "month_type"] = 2

In [4]:
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error

In [5]:
# Plain NumPy design matrix built from the seven listed columns (in this
# order); sm.add_constant prepends an intercept column by default.
X = sm.add_constant(
    data[["month_type", "FFMC", "DMC", "ISI", "temp", "wind", "DC"]].to_numpy()
)
# Response vector: the log_area column, kept as a pandas Series.
y = data["log_area"]

# Experiment 1

In [7]:
# Experiment 1: additive Gaussian GLM of log_area on month_type plus the
# other predictors named below (main effects only, no interactions).
formula = "log_area ~ " + " + ".join(
    ["month_type", "FFMC", "DMC", "ISI", "temp", "wind", "DC"]
)

mod1 = smf.glm(
    formula=formula,
    data=data,
    family=sm.families.Gaussian(),
).fit()
print(mod1.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               log_area   No. Observations:                  517
Model:                            GLM   Df Residuals:                      509
Model Family:                Gaussian   Df Model:                            7
Link Function:               identity   Scale:                          1.9469
Method:                          IRLS   Log-Likelihood:                -901.78
Date:                Sun, 18 Jul 2021   Deviance:                       990.96
Time:                        14:04:22   Pearson chi2:                     991.
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.4163      1.188     -0.350      0.7

In [17]:
# mod1 was fitted through the formula API, so predict() rebuilds the design
# matrix from the formula and needs named columns. Pass the DataFrame rather
# than the bare NumPy array X, which carries no column names and makes
# statsmodels raise ("predict requires that you use a DataFrame ...").
y_pred_1 = mod1.predict(data)
# Report the mean squared error against y, matching Experiments 2 and 3.
mse = mean_squared_error(y, y_pred_1)
print(mse)

# Experiment 2

In [9]:
# Experiment 2: Gaussian GLM in which every listed predictor is crossed with
# month_type (the `*` operator expands to main effects plus interactions).
formula = (
    "log_area ~ ("
    + " + ".join(["FFMC", "DMC", "ISI", "temp", "wind", "DC"])
    + ")*month_type"
)

mod2 = smf.glm(
    formula=formula,
    data=data,
    family=sm.families.Gaussian(),
).fit()
print(mod2.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               log_area   No. Observations:                  517
Model:                            GLM   Df Residuals:                      503
Model Family:                Gaussian   Df Model:                           13
Link Function:               identity   Scale:                          1.9481
Method:                          IRLS   Log-Likelihood:                -898.87
Date:                Sun, 18 Jul 2021   Deviance:                       979.88
Time:                        14:04:22   Pearson chi2:                     980.
No. Iterations:                     3                                         
Covariance Type:            nonrobust                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept           0.0792      3.620     

In [None]:
# There is no free-standing predict() in scope (the R-style call raised
# NameError); prediction is a method of the fitted results object. mod2 was
# fitted from a formula, so it is given the DataFrame, from which the
# interaction design matrix is rebuilt, rather than the NumPy array X.
y_pred_2 = mod2.predict(data)
# Mean squared error of the predictions against y.
mse = mean_squared_error(y, y_pred_2)
print(mse)

# Experiment 3

In [None]:
# Experiment 3: Gaussian GLM with temp, month_type and their interaction
# (`temp*month_type` expands to both main effects plus temp:month_type).
formula = "log_area ~ temp*month_type"

mod3 = smf.glm(
    formula=formula,
    data=data,
    family=sm.families.Gaussian(),
).fit()
print(mod3.summary())

In [None]:
# There is no free-standing predict() in scope (the R-style call raised
# NameError); prediction is a method of the fitted results object. mod3 was
# fitted from a formula, so it is given the DataFrame with named columns
# rather than the NumPy array X, whose columns do not match this model.
y_pred_3 = mod3.predict(data)
# Mean squared error of the predictions against y.
mse = mean_squared_error(y, y_pred_3)
print(mse)