In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Path to the raw suppression-cost CSV, relative to the notebook's working directory.
file = 'Federal Firefighting Costs (Suppression Only).csv'

# read_csv already returns a DataFrame, so no pd.DataFrame(...) re-wrap is needed.
# `data` is kept as the untouched raw load; `df` is an independent working copy
# that the cleanup cells below are free to modify.
data = pd.read_csv(file)
df = data.copy()

## Cleanup ##

In [3]:
# Remove the '$' sign from both ends of every dollar-denominated text column.
for col in ['ForestService', 'DOIAgencies', 'Total']:
    df[col] = df[col].str.strip('$')

# Constant metadata columns describing the dataset.
df['Currency'] = 'USD'
df['Country'] = 'United States'

df = df.rename(columns={'Total': 'Total Fire Suppression Cost', 'Acres': 'Acres Burnt'})

# Strip the thousands separators and convert the text to integers.
# - regex=False: the comma is a literal character, not a pattern.
# - 'int64' rather than int: plain int maps to a 32-bit integer on some platforms,
#   which cannot hold dollar totals above ~2.1 billion.
for col in ['Fires', 'Acres Burnt', 'Total Fire Suppression Cost']:
    df[col] = df[col].str.replace(',', '', regex=False).astype('int64')

## Create Additional Fields ##

In [4]:
# Per-fire averages, rounded to two decimals.
df['Acres Burnt per Fire'] = (df['Acres Burnt'] / df['Fires']).round(2)
df['Suppression Cost per Fire'] = (df['Total Fire Suppression Cost'] / df['Fires']).round(2)

# Change in total cost relative to the previous row (previous year).
# The first row has no predecessor, so diff() yields NaN there; it is filled with the
# number 0 (not the string '0') and the result is assigned back to the column.
# Calling fillna(..., inplace=True) on df[col] is a chained in-place operation that
# newer pandas versions warn about and that does not update df under Copy-on-Write.
df['Year over Year Change in Fire Suppression Cost'] = (
    df['Total Fire Suppression Cost']
    .diff(1)
    .fillna(0)
    .astype('int64')
)

# Keep only the columns used downstream, in presentation order.
df = df[['Country', 'Year', 'Fires', 'Acres Burnt', 'Total Fire Suppression Cost', 'Acres Burnt per Fire', 'Suppression Cost per Fire', 'Year over Year Change in Fire Suppression Cost', 'Currency']]
df.head()

Unnamed: 0,Country,Year,Fires,Acres Burnt,Total Fire Suppression Cost,Acres Burnt per Fire,Suppression Cost per Fire,Year over Year Change in Fire Suppression Cost,Currency
0,United States,1985,82591,2896147,239943000,35.07,2905.2,0,USD
1,United States,1986,85907,2719162,202778000,31.65,2360.44,-37165000,USD
2,United States,1987,71300,2447296,335109000,34.32,4699.99,132331000,USD
3,United States,1988,72750,5009290,578926000,68.86,7957.75,243817000,USD
4,United States,1989,48949,1827310,499787000,37.33,10210.36,-79139000,USD


## Create Dummy Variable ##

In [5]:
# Display the frame's dimensions as (rows, columns).
df.shape

(36, 9)

In [6]:
# List each column's dtype to see which fields are numeric and which are text (object).
df.dtypes

Country                                            object
Year                                                int64
Fires                                               int64
Acres Burnt                                         int64
Total Fire Suppression Cost                         int64
Acres Burnt per Fire                              float64
Suppression Cost per Fire                         float64
Year over Year Change in Fire Suppression Cost      int64
Currency                                           object
dtype: object

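Looking at the data types of the fields in the dataset, the only non-numeric fields are Country and Currency, and each holds a single constant value ('United States' and 'USD'). There is therefore no categorical variable that can be leveraged to create a dummy variable. As a result, this step is skipped.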

## Standard Scaler ##

In [7]:
# 'Country' and 'Currency' are text columns that do not need to be standardized,
# so both are removed in a single drop call.
df = df.drop(columns=['Country', 'Currency'])

In [8]:
# Re-check the dtypes after dropping columns.
df.dtypes

Year                                                int64
Fires                                               int64
Acres Burnt                                         int64
Total Fire Suppression Cost                         int64
Acres Burnt per Fire                              float64
Suppression Cost per Fire                         float64
Year over Year Change in Fire Suppression Cost      int64
dtype: object

In [9]:
# X holds the independent (predictor) features: 'Fires' and 'Acres Burnt'.
# y is the dependent (target) variable: 'Total Fire Suppression Cost'.
# Columns are selected by name rather than by position so the selection does not
# silently change if the column order of df changes.

X = df[['Fires', 'Acres Burnt']]
y = df['Total Fire Suppression Cost']

X.head()

Unnamed: 0,Fires,Acres Burnt
0,82591,2896147
1,85907,2719162
2,71300,2447296
3,72750,5009290
4,48949,1827310


X and y are configured this way because I want to predict the annual fire suppression cost (y) from the number of fires and the acres burnt each year (X).

In [10]:
# Split the data into training and test sets.
# test_size=0.2 holds out a fifth of the rows for testing; random_state is fixed
# so that re-running the cell reproduces the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Fires,Acres Burnt
8,58810,1797574
17,73457,7184712
9,79107,4073579
34,50477,4664364
0,82591,2896147


In [11]:
# Preview the first rows of the held-out test features (still unscaled at this point).
X_test.head()

Unnamed: 0,Fires,Acres Burnt
35,58950,10122336
13,81043,1329704
26,74126,8711367
30,68151,10125149
16,84079,3570911


In [12]:
# Scale the features using StandardScaler.
scaler = StandardScaler()

# Fit the scaler on the training features only, so each training column ends up with
# mean 0 and standard deviation 1. The test features are then transformed with those
# same training statistics (no re-fitting), which keeps test information out of the
# scaling step. Both names are rebound to the scaled results.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [13]:
# No axis is given, so this is a single mean taken over all values of the scaled
# training features together, not one mean per feature column.
np.mean(X_train)

7.137148015447435e-17

The mean of the standardized training features, taken over all values, is effectively 0 (the tiny non-zero result is floating-point round-off).

In [14]:
# No axis is given, so this is a single standard deviation taken over all values of the
# scaled training features together, not one value per feature column.
np.std(X_train)

1.0

The standard deviation of the standardized training features, taken over all values, is 1.

In [15]:
# Per-feature (axis=0) mean of the scaled test set, then its per-feature standard
# deviation, printed on separate lines.
print(X_test.mean(axis=0), X_test.std(axis=0), sep='\n')

[0.14464062 0.54267301]
[0.87462867 1.34271461]


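The means of the standardized test features are 0.1446 (for 'Fires') and 0.5427 (for 'Acres Burnt'). The standard deviations of the standardized test features are 0.8746 (for 'Fires') and 1.3427 (for 'Acres Burnt'). These are not exactly 0 and 1 because the scaler was fit on the training set only and then applied to the test set.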