In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error

In [3]:
df = pd.read_json('json_archive/stevens_data_master.json')

In [4]:
df = df.sort_values(by='Date/Time (PST)',ascending=True)
df.tail()

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),Relative Humidity (%),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
4,2019-04-30 19:00:00,13.58,37.24,0.088,9.15,20.58,283.8,2019-04-30,58.71,73.74,44.2,,,,,,4
3,2019-04-30 20:00:00,12.93,36.32,5.283,12.32,16.57,276.0,2019-04-30,57.7,73.84,47.01,,,,,,4
2,2019-04-30 21:00:00,12.85,35.28,8.18,12.7,17.65,274.7,2019-04-30,57.03,73.71,44.28,,,,,,4
1,2019-04-30 22:00:00,12.82,34.52,4.91,11.97,16.22,275.2,2019-04-30,56.09,73.82,44.66,,,,,,4
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,52.35,,,,,,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 21302 to 0
Data columns (total 17 columns):
Date/Time (PST)             21303 non-null object
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
Relative Humidity (%)       178 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime64[ns](1

### It appears that the relative humidity column is nearly empty (only 178 columns, bummer)

In [6]:
#convert datetime back into datetime format from string
df['Date/Time (PST)'] = pd.to_datetime(df['Date/Time (PST)'])

In [7]:
df.head(1)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),Relative Humidity (%),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
21302,2014-12-17 13:00:00,0.0,12.89,25.3,0.008,0.81,125.4,2014-12-17,,,,,,,,,12


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 21302 to 0
Data columns (total 17 columns):
Date/Time (PST)             21303 non-null datetime64[ns]
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
Relative Humidity (%)       178 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime

In [9]:
df = df.drop('Relative Humidity (%)',axis=1)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 21302 to 0
Data columns (total 16 columns):
Date/Time (PST)             21303 non-null datetime64[ns]
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime64[ns](2), float64(11), int64(1), object(2)
memor

In [11]:
df.describe()

Unnamed: 0,Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),24 Hour Snow (in),Total Snow Depth (in),danger_above_treeline,danger_below_treeline,danger_near_treeline,month
count,21296.0,21295.0,21295.0,21295.0,21295.0,19429.0,17685.0,17685.0,19346.0,19346.0,19346.0,21303.0
mean,12.543848,26.028648,2.420495,6.437304,12.053108,179.496993,0.018083,91.41886,2.477618,1.954202,2.301458,4.368399
std,2.549749,78.319369,5.125817,6.05223,8.402773,99.822767,91.868543,29.605817,0.780573,0.775944,0.771484,3.969149
min,0.0,-7999.0,-7.743,0.0,0.0,0.0,-7999.0,-162.7,1.0,1.0,1.0,1.0
25%,12.61,21.58,0.0,2.899,7.212,86.3,0.283,73.36,2.0,1.0,2.0,2.0
50%,12.66,26.78,0.285,4.654,10.0,241.6,0.512,91.6,2.0,2.0,2.0,3.0
75%,12.74,31.87,2.877,8.09,14.25,270.7,2.425,114.0,3.0,2.0,3.0,4.0
max,45.98,68.51,53.25,84.9,71.79,359.7,166.3,220.5,4.0,4.0,4.0,12.0


In [12]:
df.shape

(21303, 16)

In [13]:
#df.clip changes all values below the number provided, gets rid of negatives in my case
#wind_index = df.loc[df['Wind Speed Minimum (mph)'] <= 0].index
#change all of these to zeros

## whats the max amount of snow ever recored in a day?

There are two contenders for this title. The most commonly accepted figure is the 75.8” of snow that fell in 24 hours at Silver Lake, Colorado on April 14-15, 1921. The other option is a 78.0” total measured at Mile 47 Camp, Alaska on February 7, 1963.Dec 20, 2016

This being said, lets cut if off at 75". rather than delete and lose info, lets assume machine was working and correct it to this cap we establishing

likewise for the lowest 24 hour snow period. Melting events can occur, so lets set this limit to a foot

In [14]:
df['24 Hour Snow (in)'] = df['24 Hour Snow (in)'].clip(upper=75)
df['24 Hour Snow (in)'] = df['24 Hour Snow (in)'].clip(lower=-12)
df['Total Snow Depth (in)'] = df['Total Snow Depth (in)'].clip(lower=0)
df['Temperature (deg F)'] = df['Temperature (deg F)'].clip(lower=-10)
df['Wind Speed Minimum (mph)'] = df['Wind Speed Minimum (mph)'].clip(lower=0)

In [15]:
df.shape

(21303, 16)

In [16]:
df.head(1)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
21302,2014-12-17 13:00:00,0.0,12.89,25.3,0.008,0.81,125.4,2014-12-17,,,,,,,,12


In [17]:
df['yesterdays_snow'] = df['24 Hour Snow (in)'].shift(-24)

In [18]:
df.iloc[100,]

Date/Time (PST)             2014-12-21 17:00:00
Battery Voltage (v)                       13.61
Temperature (deg F)                       31.07
Wind Speed Minimum (mph)                  28.14
Wind Speed Average (mph)                  45.34
Wind Speed Maximum (mph)                     10
Wind Direction (deg.)                       NaN
date                        2014-12-21 00:00:00
24 Hour Snow (in)                           NaN
Total Snow Depth (in)                       NaN
area                               Stevens Pass
danger_above_treeline                         3
danger_below_treeline                         2
danger_near_treeline                          2
date_tomorrow                 December 22, 2014
month                                        12
yesterdays_snow                             NaN
Name: 21202, dtype: object

In [19]:
## I Need to create my boundaries of possible readings before making my rolling columns
#-24 hour min/max temp

#-48 hour min/max temp

#-24/48h/72h 24 hour snow max

#-24/48 H max wind

df['max_1_day_temp']= df['Temperature (deg F)'].rolling(24).max()
df['min_1_day_temp']= df['Temperature (deg F)'].rolling(24).min()
df['max_2_day_temp']= df['Temperature (deg F)'].rolling(48).max()
df['min_2_day_temp']= df['Temperature (deg F)'].rolling(48).min()

df['max_1_day_wind']= df['Wind Speed Maximum (mph)'].rolling(24).max()
df['max_2_day_wind']= df['Wind Speed Maximum (mph)'].rolling(48).max()

df['max_1_day_snow']= df['24 Hour Snow (in)'].rolling(24).max()
df['max_2_day_snow']= df['24 Hour Snow (in)'].rolling(48).max()
df['max_3_day_snow']= df['24 Hour Snow (in)'].rolling(72).max()

In [20]:
df.shape

(21303, 26)

In [21]:
df.head(3)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),...,yesterdays_snow,max_1_day_temp,min_1_day_temp,max_2_day_temp,min_2_day_temp,max_1_day_wind,max_2_day_wind,max_1_day_snow,max_2_day_snow,max_3_day_snow
21302,2014-12-17 13:00:00,0.0,12.89,25.3,0.008,0.81,125.4,2014-12-17,,,...,,,,,,,,,,
21301,2014-12-17 14:00:00,12.79,25.97,0.004,0.101,10.0,,2014-12-17,,,...,,,,,,,,,,
21300,2014-12-17 15:00:00,12.76,25.3,0.003,0.439,10.0,,2014-12-17,,,...,,,,,,,,,,


In [22]:
df.describe() #looks a lot better!

Unnamed: 0,Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),24 Hour Snow (in),Total Snow Depth (in),danger_above_treeline,danger_below_treeline,...,yesterdays_snow,max_1_day_temp,min_1_day_temp,max_2_day_temp,min_2_day_temp,max_1_day_wind,max_2_day_wind,max_1_day_snow,max_2_day_snow,max_3_day_snow
count,21296.0,21295.0,21295.0,21295.0,21295.0,19429.0,17685.0,17685.0,19346.0,19346.0,...,17685.0,21088.0,21088.0,20872.0,20872.0,21088.0,20872.0,17478.0,17262.0,17046.0
mean,12.543848,26.780736,2.420865,6.437304,12.053108,179.496993,4.609215,91.431192,2.477618,1.954202,...,4.609215,31.771706,22.944213,34.07113,21.324983,20.6461,24.834413,14.03609,19.478129,23.689635
std,2.549749,9.194085,5.125367,6.05223,8.402773,99.822767,13.874658,29.539448,0.780573,0.775944,...,13.874658,9.829899,8.194052,9.72274,7.982558,11.564135,12.607514,20.977083,22.588715,22.962622
min,0.0,-10.0,0.0,0.0,0.0,0.0,-12.0,0.0,1.0,1.0,...,-12.0,0.066,-10.0,1.008,-10.0,0.0,0.0,-12.0,-12.0,-12.0
25%,12.61,21.58,0.0,2.899,7.212,86.3,0.283,73.36,2.0,1.0,...,0.283,25.64,18.42,27.72,16.92,11.75,14.875,0.614,1.215,2.796
50%,12.66,26.78,0.285,4.654,10.0,241.6,0.512,91.6,2.0,2.0,...,0.512,31.29,23.63,33.18,22.54,17.34,22.93,2.787,5.388,10.73
75%,12.74,31.87,2.877,8.09,14.25,270.7,2.425,114.0,3.0,2.0,...,2.425,37.53,28.13,40.14,26.34,27.16,32.405,14.11,46.49,46.49
max,45.98,68.51,53.25,84.9,71.79,359.7,75.0,220.5,4.0,4.0,...,75.0,68.51,53.96,68.51,49.48,71.79,71.79,75.0,75.0,75.0


In [23]:
df.columns

Index(['Date/Time (PST)', 'Battery Voltage (v)', 'Temperature (deg F)',
       'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
       'Wind Speed Maximum (mph)', 'Wind Direction (deg.)', 'date',
       '24 Hour Snow (in)', 'Total Snow Depth (in)', 'area',
       'danger_above_treeline', 'danger_below_treeline',
       'danger_near_treeline', 'date_tomorrow', 'month', 'yesterdays_snow',
       'max_1_day_temp', 'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
       'max_1_day_wind', 'max_2_day_wind', 'max_1_day_snow', 'max_2_day_snow',
       'max_3_day_snow'],
      dtype='object')

In [24]:
df.isna().sum()

Date/Time (PST)                0
Battery Voltage (v)            7
Temperature (deg F)            8
Wind Speed Minimum (mph)       8
Wind Speed Average (mph)       8
Wind Speed Maximum (mph)       8
Wind Direction (deg.)       1874
date                           0
24 Hour Snow (in)           3618
Total Snow Depth (in)       3618
area                        1621
danger_above_treeline       1957
danger_below_treeline       1957
danger_near_treeline        1957
date_tomorrow               1621
month                          0
yesterdays_snow             3618
max_1_day_temp               215
min_1_day_temp               215
max_2_day_temp               431
min_2_day_temp               431
max_1_day_wind               215
max_2_day_wind               431
max_1_day_snow              3825
max_2_day_snow              4041
max_3_day_snow              4257
dtype: int64

In [25]:
no_avy_index = df.loc[df['danger_above_treeline'].isna()].index
df = df.drop(index=no_avy_index,axis=0)

In [26]:
df.isna().sum()

#wind direction and 23 hour snow depth must also be removed


Date/Time (PST)                0
Battery Voltage (v)            5
Temperature (deg F)            6
Wind Speed Minimum (mph)       6
Wind Speed Average (mph)       6
Wind Speed Maximum (mph)       6
Wind Direction (deg.)       1838
date                           0
24 Hour Snow (in)           3220
Total Snow Depth (in)       3220
area                           0
danger_above_treeline          0
danger_below_treeline          0
danger_near_treeline           0
date_tomorrow                  0
month                          0
yesterdays_snow             3220
max_1_day_temp               144
min_1_day_temp               144
max_2_day_temp               300
min_2_day_temp               300
max_1_day_wind               144
max_2_day_wind               300
max_1_day_snow              3335
max_2_day_snow              3455
max_3_day_snow              3575
dtype: int64

In [27]:
no_snow_depth = df.loc[df['Total Snow Depth (in)'].isna()].index

In [28]:
df.loc[df['Total Snow Depth (in)'].isna()].shape[0]

3220

In [29]:
df.shape[0] - df.drop(index=no_snow_depth).shape[0]

3220

In [30]:
df = df.drop(index=no_snow_depth)

In [31]:
df.isna().sum()


Date/Time (PST)               0
Battery Voltage (v)           0
Temperature (deg F)           1
Wind Speed Minimum (mph)      1
Wind Speed Average (mph)      1
Wind Speed Maximum (mph)      1
Wind Direction (deg.)         1
date                          0
24 Hour Snow (in)             0
Total Snow Depth (in)         0
area                          0
danger_above_treeline         0
danger_below_treeline         0
danger_near_treeline          0
date_tomorrow                 0
month                         0
yesterdays_snow               5
max_1_day_temp              139
min_1_day_temp              139
max_2_day_temp              283
min_2_day_temp              283
max_1_day_wind              139
max_2_day_wind              283
max_1_day_snow              115
max_2_day_snow              235
max_3_day_snow              355
dtype: int64

In [32]:
df_clean = df.dropna()

## Now, what do i want my X's to be?? 

X: temp, wind speed average, 24 hour snow (in), max/min 24 and 48 H temp 

Y: danger_near_treeline

In [33]:
df_clean = df_clean.reset_index(drop=True)

In [34]:
df_clean.columns

Index(['Date/Time (PST)', 'Battery Voltage (v)', 'Temperature (deg F)',
       'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
       'Wind Speed Maximum (mph)', 'Wind Direction (deg.)', 'date',
       '24 Hour Snow (in)', 'Total Snow Depth (in)', 'area',
       'danger_above_treeline', 'danger_below_treeline',
       'danger_near_treeline', 'date_tomorrow', 'month', 'yesterdays_snow',
       'max_1_day_temp', 'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
       'max_1_day_wind', 'max_2_day_wind', 'max_1_day_snow', 'max_2_day_snow',
       'max_3_day_snow'],
      dtype='object')

In [35]:
df_clean.shape[0]*.80


12574.400000000001

In [36]:
X = df_clean[[
            'Battery Voltage (v)', 'Temperature (deg F)',
            'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
            'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
            '24 Hour Snow (in)', 'Total Snow Depth (in)','max_1_day_temp',
            'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
            'max_1_day_snow', 'max_2_day_snow', 'max_3_day_snow']]

y = df_clean['danger_near_treeline']

In [37]:
X_train, X_test, y_train, y_test = X[:12578], X[12578:], y[:12578], y[12578:]

DO A CHRONOLIGCAL SPLIT, because its a time series problem


In [38]:
X_train.isna().sum()

Battery Voltage (v)         0
Temperature (deg F)         0
Wind Speed Minimum (mph)    0
Wind Speed Average (mph)    0
Wind Speed Maximum (mph)    0
Wind Direction (deg.)       0
24 Hour Snow (in)           0
Total Snow Depth (in)       0
max_1_day_temp              0
min_1_day_temp              0
max_2_day_temp              0
min_2_day_temp              0
max_1_day_snow              0
max_2_day_snow              0
max_3_day_snow              0
dtype: int64

In [39]:
ss = StandardScaler()

In [40]:
mapper = DataFrameMapper([
    ([   'Battery Voltage (v)', 'Temperature (deg F)',
            'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
            'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
            '24 Hour Snow (in)', 'Total Snow Depth (in)','max_1_day_temp',
            'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
            'max_1_day_snow', 'max_2_day_snow', 'max_3_day_snow'], ss)
    ])

In [41]:
lr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=100)
gbr = GradientBoostingRegressor(n_estimators=100)
knnr = KNeighborsRegressor()
xgbr = XGBRegressor(objective='reg:squarederror')
xgbrfr = XGBRFRegressor(objective='reg:squarederror')
regressors = [lr, rfr, gbr, knnr, xgbr,xgbrfr ]

In [42]:
reg_names = ['LinearRegression',
             'RandomForestRegressor',
             'GradientBoostingRegressor',
             'KNeighborsRegressor',
             'XGBRegressor',
             'XGBRFRegressor'
            ]

In [43]:
baseline_pred

NameError: name 'baseline_pred' is not defined

In [44]:
baseline_pred = np.mean(y_train)
residuals = (y_test - baseline_pred)
median_absolute_error = np.median(np.abs(residuals))
rmse=np.sqrt(np.mean(residuals**2))

print(median_absolute_error,rmse)

print(f'The MAE of just predicting the mean'
          f' is: {median_absolute_error}\n'
          f'The RMSE just predicting the mean'
          f' is: {rmse}\n'
      '==================================================================\n'
          '========================\n')

for regressor,name in zip(regressors,reg_names): #zip returns a tuple for each pair it is given
    pipe = Pipeline(steps=[
        ('scaler',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error((y_test),preds)}\n'
          f'The RMSE of the {name}'
          f' is: {mean_squared_error((y_test),preds)}\n'
          '==================================================================\n'
          '========================\n'
         )
    


0.5498489425981874 0.8716723262153395
The MAE of just predicting the mean is: 0.5498489425981874
The RMSE just predicting the mean is: 0.8716723262153395

The MAE of the LinearRegression is: 0.6996172319335131
The RMSE of the LinearRegression is: 0.661252359501343

The MAE of the RandomForestRegressor is: 0.6728152335456475
The RMSE of the RandomForestRegressor is: 0.6356156918942064

The MAE of the GradientBoostingRegressor is: 0.6519846811737089
The RMSE of the GradientBoostingRegressor is: 0.5942823061374262

The MAE of the KNeighborsRegressor is: 0.7730573248407645
The RMSE of the KNeighborsRegressor is: 0.90228025477707



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


The MAE of the XGBRegressor is: 0.6547812147504964
The RMSE of the XGBRegressor is: 0.6033406052152962



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


The MAE of the XGBRFRegressor is: 0.6771474563392105
The RMSE of the XGBRFRegressor is: 0.6101870582111678



In [45]:
pipe2 = Pipeline(steps=[
    ('scaler',mapper),
    ('regressor', gbr)
])
pipe2.fit(X_train, y_train)
preds = pipe2.predict(X_test)
print(f'The MAE of the {name}'
      f' is: {mean_absolute_error((y_test),preds)}\n'
      f'The RMSE of the {name}'
      f' is: {mean_squared_error((y_test),preds)}\n'
      '==================================================================\n'
      '========================\n'
     )



The MAE of the XGBRFRegressor is: 0.6519891545823308
The RMSE of the XGBRFRegressor is: 0.5942359504543339



In [46]:
y_test[0:20]

12578    2.0
12579    2.0
12580    2.0
12581    2.0
12582    2.0
12583    2.0
12584    2.0
12585    2.0
12586    2.0
12587    2.0
12588    2.0
12589    2.0
12590    2.0
12591    2.0
12592    2.0
12593    2.0
12594    2.0
12595    2.0
12596    2.0
12597    2.0
Name: danger_near_treeline, dtype: float64

In [47]:
pipe2.predict(X_test)[0:20]

array([2.8394024 , 2.59150781, 2.73229977, 2.65358108, 2.78926223,
       2.79920076, 2.68900234, 2.62609734, 2.6590719 , 2.6784285 ,
       2.8030285 , 2.8030285 , 2.77631418, 2.72157156, 2.72167116,
       2.73400301, 2.73400301, 2.71819671, 2.72169891, 2.70589261])

## Model 4 
Gradient Boosting RMSE is better than guessing the mean!

Also my model is predicting hourly avalanche forecasts, which is super granular. 

The forecasts are given daily, and thus my Y im predicting and basing my scoring on isn't as granular.

In [48]:
bar_p = pd.read_csv('csv_files/stevens_pass_barometric_pressure_2014_now.csv')

FileNotFoundError: File b'csv_files/stevens_pass_barometric_pressure_2014_now.csv' does not exist

In [49]:
bar_p.head()

NameError: name 'bar_p' is not defined

In [50]:
bar_p.describe()

NameError: name 'bar_p' is not defined

In [51]:
bar_p.info()

NameError: name 'bar_p' is not defined

In [52]:
bar_p['Date/Time (PST)'] = pd.to_datetime(bar_p["Date/Time (PST)"])

NameError: name 'bar_p' is not defined

In [53]:
bar_p.info()

NameError: name 'bar_p' is not defined

In [54]:
bar_p = bar_p[["Date/Time (PST)","mb - 3950\' - Stevens Pass - Schmidt Haus"]]

NameError: name 'bar_p' is not defined

In [55]:
df_merge = pd.merge(df_clean,bar_p,how='left',on='Date/Time (PST)')

NameError: name 'bar_p' is not defined

In [56]:
df_merge = df_merge.dropna()

NameError: name 'df_merge' is not defined

In [57]:
df_merge.info()

NameError: name 'df_merge' is not defined

In [58]:
precip = pd.read_csv('csv_files/stevens_pass_precipitation_2014_now.csv')
precip['Date/Time (PST)'] = pd.to_datetime(precip["Date/Time (PST)"])

FileNotFoundError: File b'csv_files/stevens_pass_precipitation_2014_now.csv' does not exist

In [59]:
precip.columns = ['Date/Time (PST)', '2700ft_Berne','3950ft_SchmidtHaus','4800ft_Brooks_precip']

NameError: name 'precip' is not defined

In [60]:
precip.head()

NameError: name 'precip' is not defined

In [61]:
precip.columns

NameError: name 'precip' is not defined

In [62]:
precip_brooks = precip[['Date/Time (PST)','4800ft_Brooks_precip']]

NameError: name 'precip' is not defined

In [63]:
shot_snow = pd.read_csv('csv_files/stevens-pass_intermittent_snow.csv')
shot_snow.head()

FileNotFoundError: File b'csv_files/stevens-pass_intermittent_snow.csv' does not exist

In [64]:
shot_snow["Date/Time (PST)"] = pd.to_datetime(shot_snow["Date/Time (PST)"])

NameError: name 'shot_snow' is not defined

In [65]:
df_merge2 = pd.merge(df_merge,shot_snow,on="Date/Time (PST)")

NameError: name 'df_merge' is not defined

In [66]:
merge_final = pd.merge(df_merge2,precip_brooks,on="Date/Time (PST)")

NameError: name 'df_merge2' is not defined

In [67]:
merge_final.isna().sum()

NameError: name 'merge_final' is not defined

In [68]:
merge_final.describe()

NameError: name 'merge_final' is not defined

In [69]:
merge_final.columns

NameError: name 'merge_final' is not defined

In [70]:
merge_final['24 Hour Snow (in)'].plot();

NameError: name 'merge_final' is not defined

In [71]:
merge_final.info()

NameError: name 'merge_final' is not defined

In [72]:
mapper = DataFrameMapper([
        ([   
            'Battery Voltage (v)', 'Temperature (deg F)',
            'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
            'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
            '24 Hour Snow (in)', 'Total Snow Depth (in)','max_1_day_temp',
            'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
            'max_1_day_snow', 'max_2_day_snow', 'max_3_day_snow',
            "mb - 3950' - Stevens Pass - Schmidt Haus",
            '4800ft_Brooks_precip'], ss)
    ])

In [73]:
X = merge_final[[
        'Battery Voltage (v)', 'Temperature (deg F)',
            'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
            'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
            '24 Hour Snow (in)', 'Total Snow Depth (in)','max_1_day_temp',
            'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
            'max_1_day_snow', 'max_2_day_snow', 'max_3_day_snow',
            "mb - 3950' - Stevens Pass - Schmidt Haus",
            '4800ft_Brooks_precip']]

y = merge_final['danger_near_treeline']

NameError: name 'merge_final' is not defined

In [74]:
X_train, X_test, y_train, y_test = X[:12553], X[12553:], y[:12553], y[12553:]

In [75]:
pipe3 = Pipeline(steps=[
    ('scaler',mapper),
    ('regressor', gbr)
])
pipe3.fit(X_train, y_train)
preds = pipe3.predict(X_test)
print(f'The MAE of the {name}'
      f' is: {mean_absolute_error((y_test),preds)}\n'
      f'The RMSE of the {name}'
      f' is: {mean_squared_error((y_test),preds)}\n'
      '==================================================================\n'
      '========================\n'
     )


KeyError: '["mb - 3950\' - Stevens Pass - Schmidt Haus" \'4800ft_Brooks_precip\'] not in index'

## Conclusion
-adding precipitation and the barometric pressure makes my model worse, ha