In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
file = 'Federal Firefighting Costs (Suppression Only).csv'
data = pd.read_csv(file)
df = pd.DataFrame(data)

## Cleanup ##

In [3]:
df['ForestService'] = df['ForestService'].str.strip('$')
df['DOIAgencies'] = df['DOIAgencies'].str.strip('$')
df['Total'] = df['Total'].str.strip('$')
df['Currency'] = 'USD'
df['Country'] = 'United States'
df = df.rename(columns = {'Total':'Total Fire Suppression Cost', 'Acres':'Acres Burnt'})
df['Fires'] = df['Fires'].str.replace(",", "", regex=True)
df['Acres Burnt'] = df['Acres Burnt'].str.replace(",", "", regex=True)
df['Total Fire Suppression Cost'] = df['Total Fire Suppression Cost'].str.replace(",", "", regex=True)
df['Fires'] = df['Fires'].astype(int)
df['Acres Burnt'] = df['Acres Burnt'].astype(int)
df['Total Fire Suppression Cost'] = df['Total Fire Suppression Cost'].astype(int)

$ Convert $ $ Necessary $ $ Fields $ $ to $ $ Same $ $ Unit $ $ (Millions) $

In [4]:
# Convert Fires which is in thousands to millions
df['Fires'] = round((df['Fires'] / 1000000), 4)

# Convert Total Fire Suppression Cost which contains millions and billions to millions
df['Total Fire Suppression Cost'] = round((df['Total Fire Suppression Cost'] * 1000), 4)

## Create Additional Fields ##

In [5]:
df['Acres Burnt per Fire'] = round(df['Acres Burnt']/df['Fires'], 2)
df['Suppression Cost per Fire'] = df['Total Fire Suppression Cost']/df['Fires']
df['Year over Year Change in Fire Suppression Cost'] = df['Total Fire Suppression Cost'].diff(1)
df['Year over Year Change in Fire Suppression Cost'].fillna(value='0', inplace=True)
df['Year over Year Change in Fire Suppression Cost'] = df['Year over Year Change in Fire Suppression Cost'].astype(int)
df = df[['Country', 'Year', 'Fires', 'Acres Burnt', 'Total Fire Suppression Cost', 'Acres Burnt per Fire', 'Suppression Cost per Fire', 'Year over Year Change in Fire Suppression Cost', 'Currency']]
df.head()

Unnamed: 0,Country,Year,Fires,Acres Burnt,Total Fire Suppression Cost,Acres Burnt per Fire,Suppression Cost per Fire,Year over Year Change in Fire Suppression Cost,Currency
0,United States,1985,0.0826,2896147,239943000000,35062312.35,2904879000000.0,0,USD
1,United States,1986,0.0859,2719162,202778000000,31654970.9,2360629000000.0,-37165000000,USD
2,United States,1987,0.0713,2447296,335109000000,34323927.07,4699986000000.0,132331000000,USD
3,United States,1988,0.0728,5009290,578926000000,68808928.57,7952280000000.0,243817000000,USD
4,United States,1989,0.0489,1827310,499787000000,37368302.66,10220590000000.0,-79139000000,USD


## Create Dummy Variable ##

In [6]:
df.shape

(36, 9)

In [7]:
df.dtypes

Country                                            object
Year                                                int64
Fires                                             float64
Acres Burnt                                         int64
Total Fire Suppression Cost                         int64
Acres Burnt per Fire                              float64
Suppression Cost per Fire                         float64
Year over Year Change in Fire Suppression Cost      int64
Currency                                           object
dtype: object

Looking at the data types of the fields in the dataset, there is no categorical variable that can be leveraged to create a dummy variable. As a result, this step is skipped.

## Standard Scaler ##

In [8]:
# Drop 'Country' and 'Currency' features as they do not need to be standarized.
df = df.drop(columns='Country', axis=1)
df = df.drop(columns='Currency', axis=1)

In [9]:
df.dtypes

Year                                                int64
Fires                                             float64
Acres Burnt                                         int64
Total Fire Suppression Cost                         int64
Acres Burnt per Fire                              float64
Suppression Cost per Fire                         float64
Year over Year Change in Fire Suppression Cost      int64
dtype: object

In [10]:
# X, the dependent feature, consists of 'Fires' and 'Acres Burnt.' It is set with y in mind.
# Y, the independent variable, which is 'Total Fire Suppression Cost' 
# Split the data into features (X) and target (y)
X = df.drop('Acres Burnt per Fire', axis = 1)
y = df['Acres Burnt per Fire']

X.head()

Unnamed: 0,Year,Fires,Acres Burnt,Total Fire Suppression Cost,Suppression Cost per Fire,Year over Year Change in Fire Suppression Cost
0,1985,0.0826,2896147,239943000000,2904879000000.0,0
1,1986,0.0859,2719162,202778000000,2360629000000.0,-37165000000
2,1987,0.0713,2447296,335109000000,4699986000000.0,132331000000
3,1988,0.0728,5009290,578926000000,7952280000000.0,243817000000
4,1989,0.0489,1827310,499787000000,10220590000000.0,-79139000000


X, y is configured as is since I want to predict the annual fire suppression cost (y) based on the number of fires and the acres burnt each year.

In [11]:
# Split the data into training and test sets. 20% testing, 80% training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.head()

Unnamed: 0,Year,Fires,Acres Burnt,Total Fire Suppression Cost,Suppression Cost per Fire,Year over Year Change in Fire Suppression Cost
8,1993,0.0588,1797574,240436000000,4089048000000.0,-137030000000
17,2002,0.0735,7184712,1674040000000,22776050000000.0,721344000000
9,1994,0.0791,4073579,918335000000,11609800000000.0,677899000000
34,2019,0.0505,4664364,1590000000000,31485150000000.0,-1553256000000
0,1985,0.0826,2896147,239943000000,2904879000000.0,0


In [12]:
X_test.head()

Unnamed: 0,Year,Fires,Acres Burnt,Total Fire Suppression Cost,Suppression Cost per Fire,Year over Year Change in Fire Suppression Cost
35,2020,0.059,10122336,2274000000000,38542370000000.0,684000000000
13,1998,0.081,1329704,416704000000,5144494000000.0,132556000000
26,2011,0.0741,8711367,1374525000000,18549600000000.0,565026000000
30,2015,0.0682,10125149,2130543000000,31239630000000.0,608394000000
16,2001,0.0841,3570911,952696000000,11328130000000.0,-458106000000


In [13]:
# Scale the features using StandardScaler
scaler = StandardScaler()

# Transform features such that its distribution will have a mean value of 0 and standard deviation of 1.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
np.mean(X_train_scaled)

1.4869058365515489e-15

The mean of the standardized train feature is 0.

In [15]:
np.std(X_train_scaled)

1.0

The standard deviation of the standardized train feature is 1.

In [16]:
# X_test mean and standard deviation
print(X_test_scaled.mean(axis=0))
print(X_test_scaled.std(axis=0))

[0.67934223 0.14467811 0.54267301 0.43077874 0.29994554 0.45982328]
[0.78928407 0.87390137 1.34271461 0.9411157  0.92271006 1.01147974]


In [17]:
print('Training Features Shape:', X_train.shape)
print('Training Labels Shape:', y_train.shape)
print('Testing Features Shape:', X_test.shape)
print('Testing Labels Shape:', y_test.shape)

Training Features Shape: (28, 6)
Training Labels Shape: (28,)
Testing Features Shape: (8, 6)
Testing Labels Shape: (8,)
