# Packages Required

In [11]:
import pandas as pd
import numpy as np
import time
from math import sqrt
import xgboost
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Data Pre-Processing

In [2]:
# Reading the forest fire data
df = pd.read_csv('forestfires.csv')

In [3]:
# Manually creating label encoding dictionary and mapping to the categorical variables
month_map = {'jan':1,'feb':2,'mar':3,'apr':4,'may':5,'jun':6,'jul':7,'aug':8,
            'sep':9,'oct':10,'nov':11,'dec':12}
day_map = {'sun':1,'mon':2,'tue':3,'wed':4,'thu':5,'fri':6,'sat':7} 

df['month'] = df['month'].replace(month_map)
df['day'] = df['day'].replace(day_map)

In [4]:
# Splitting the data into feature and labels
X = df.drop(['area'], axis = 1)
y = df['area']

In [5]:
# Applying log transformation on the area column
y_log = np.log(y + 1)

# 10 Fold Cross-Validation Function

In [6]:
##########################################################################
# This function inputs:
# reg - regression model
# x_trn - independent variables
# y_trn - target variable
# y_log_trn - log transformed target varaible
# k - k folds
# scale - Given as True if feature scaling is required to be performed on train and test folds
# pca - Given as True if pca has to be performed on train and test folds
############################################################################
# Function Output:
# Returns the average of k fold iterations for the following evaluation metrics:
# MSE, NLL, RMSE, MAD for test fold data
# RMSE for train fold data
###########################################################################
def k_fold_cv(reg, x_trn,y_trn,y_log_trn,k, scale = False, pca = False):
    train_error_rmse = [] # assigns lists to capture the evaluation metrics of k fold validation
    test_error_rmse = []
    train_error_mad = []
    test_error_mad = []
    test_error_mse = []
    nll_list = []
    cv = KFold(n_splits=k, random_state=42, shuffle=True) # Random state is assigned to keep the split
    for train_index, test_index in cv.split(x_trn):       # consistent for all model 
        X_train, X_test, y_train, y_test,y_log_train = x_trn.iloc[train_index,], x_trn.iloc[test_index,],  \
                          y_trn.iloc[train_index,], y_trn.iloc[test_index,],y_log_trn.iloc[train_index,]

        if pca: # feature scaling and pca is applied
            scale = StandardScaler()
            scale.fit(X_train)
            X_train = scale.transform(X_train)
            X_test = scale.transform(X_test)
            pca = PCA(0.8) # overall variance of pca to capture
            pca.fit(X_train)

            X_train = pca.transform(X_train)
            X_test = pca.transform(X_test)
        elif scale: # feature scaling is applied
            scale = StandardScaler()
            X_train = pd.DataFrame(scale.fit_transform(X_train))
            X_test = pd.DataFrame(scale.transform(X_test))
            
        reg.fit(X_train, y_log_train) # Fitting the model with k-1 fold train data
        
        pred_train = reg.predict(X_train) # predicitng the output of k-1 fold train data
        pred_test = reg.predict(X_test) # predicting the output of 1 fold test data

        # Applying inverse of log tranformation on predicted value 
        # to evaluate the error between actual and predicted value
        pred_train = 10**pred_train-1
        pred_test = 10**pred_test-1
        
        pred_train[pred_train < 0] = 0 # after inverse of log, some predicted values tend to be negative
        pred_test[pred_test < 0] = 0 # they are assigned to be 0 since burned area cannot be negative
        
        ######## Calculating negative log likelihood
        var = np.var(pred_test)
        mean = np.mean(pred_test)
        sum_error = 0
        for i in range(0,len(pred_test)):
            sum_error = sum_error + (y_test.iloc[i] - pred_test[i])**2 # sum of (actual - pred)^2
            
        nll = (len(pred_test)*0.5*np.log(2*3.14*var)) + (sum_error/(2*var))
        nll_list.append(nll)
        ######## calculating RMSE of train and test data
        train_error_rmse.append(sqrt(mean_squared_error(y_train, pred_train)))
        test_error_rmse.append(sqrt(mean_squared_error(y_test, pred_test)))
        ######## calculating MAD of test data
        test_error_mad.append(mean_absolute_error(y_test, pred_test))
        ######## calculating MSE of test data
        test_error_mse.append(mean_squared_error(y_test, pred_test))
    return np.average(train_error_rmse), np.average(test_error_rmse), np.average(test_error_mad),\
                np.average(test_error_mse), np.average(nll_list)
            

# XGBoost Linear Gradient Boosting Model

In [12]:
start_time = time.time()

regressor = xgboost.XGBRegressor(booster = 'gblinear')
train_error_rmse, test_error_rmse,test_error_mad,test_error_mse,nll = k_fold_cv(regressor, X,y,y_log,10)

print("MSE of XGBoost gblinear test data:",test_error_mse)
print("NLL of XGBoost gblinear test data:",nll)
print("RMSE of XGBoost gblinear Test data:",test_error_rmse)
print("MAD of XGBoost gblinear Test data:",test_error_mad)

elapsed_time = time.time()-start_time
print("Run time:"+time.strftime("%H%M%S",time.gmtime(elapsed_time)))



  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


MSE of XGBoost gblinear test data: 4006.7442689268246
NLL of XGBoost gblinear test data: 5085.154343751
RMSE of XGBoost gblinear Test data: 46.45419051716742
MAD of XGBoost gblinear Test data: 18.649363506903295
Run time:000001


  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
