In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error

In [2]:
df = pd.read_json('stevens_data_master.json')
df.head()

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),Relative Humidity (%),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,52.35,,,,,,4
1,2019-04-30 22:00:00,12.82,34.52,4.91,11.97,16.22,275.2,2019-04-30,56.09,73.82,44.66,,,,,,4
10,2019-04-30 13:00:00,12.67,41.23,0.0,4.946,18.17,292.8,2019-04-30,62.73,73.3,24.97,,,,,,4
100,2019-04-26 19:00:00,12.69,34.47,1.052,15.43,41.23,296.0,2019-04-26,67.57,73.6,82.4,,,,,,4
1000,2019-03-24 07:00:00,12.74,29.79,5.612,8.28,11.81,274.0,2019-03-24,0.491,90.0,,Stevens Pass,1.0,1.0,1.0,"March 25, 2019",3


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 0 to 9999
Data columns (total 17 columns):
Date/Time (PST)             21303 non-null object
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
Relative Humidity (%)       178 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime64[ns](1)

### It appears that the relative humidity column is nearly empty (only 178 columns, bummer)

In [4]:
#convert datetime back into datetime format from string
df['Date/Time (PST)'] = pd.to_datetime(df['Date/Time (PST)'])

In [5]:
df.head(1)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),Relative Humidity (%),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,52.35,,,,,,4


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 0 to 9999
Data columns (total 17 columns):
Date/Time (PST)             21303 non-null datetime64[ns]
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
Relative Humidity (%)       178 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime6

In [7]:
df = df.drop('Relative Humidity (%)',axis=1)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21303 entries, 0 to 9999
Data columns (total 16 columns):
Date/Time (PST)             21303 non-null datetime64[ns]
Battery Voltage (v)         21296 non-null float64
Temperature (deg F)         21295 non-null float64
Wind Speed Minimum (mph)    21295 non-null float64
Wind Speed Average (mph)    21295 non-null float64
Wind Speed Maximum (mph)    21295 non-null float64
Wind Direction (deg.)       19429 non-null float64
date                        21303 non-null datetime64[ns]
24 Hour Snow (in)           17685 non-null float64
Total Snow Depth (in)       17685 non-null float64
area                        19682 non-null object
danger_above_treeline       19346 non-null float64
danger_below_treeline       19346 non-null float64
danger_near_treeline        19346 non-null float64
date_tomorrow               19682 non-null object
month                       21303 non-null int64
dtypes: datetime64[ns](2), float64(11), int64(1), object(2)
memory

In [9]:
df.describe()

Unnamed: 0,Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),24 Hour Snow (in),Total Snow Depth (in),danger_above_treeline,danger_below_treeline,danger_near_treeline,month
count,21296.0,21295.0,21295.0,21295.0,21295.0,19429.0,17685.0,17685.0,19346.0,19346.0,19346.0,21303.0
mean,12.543848,26.028648,2.420495,6.437304,12.053108,179.496993,0.018083,91.41886,2.477618,1.954202,2.301458,4.368399
std,2.549749,78.319369,5.125817,6.05223,8.402773,99.822767,91.868543,29.605817,0.780573,0.775944,0.771484,3.969149
min,0.0,-7999.0,-7.743,0.0,0.0,0.0,-7999.0,-162.7,1.0,1.0,1.0,1.0
25%,12.61,21.58,0.0,2.899,7.212,86.3,0.283,73.36,2.0,1.0,2.0,2.0
50%,12.66,26.78,0.285,4.654,10.0,241.6,0.512,91.6,2.0,2.0,2.0,3.0
75%,12.74,31.87,2.877,8.09,14.25,270.7,2.425,114.0,3.0,2.0,3.0,4.0
max,45.98,68.51,53.25,84.9,71.79,359.7,166.3,220.5,4.0,4.0,4.0,12.0


## WHAT DO TO WITH NEGATIVE VALUES?

Temp: set -10 deg F as a cutofff, for it covers all possible temps the cascades can experience with wind chill

Wind Speed min: change all negative values to 0

24 hour snow: this is a hard one, for snow can blow off after a windstorm or melting period/settling. Let's assume that anything over a foot loss in 12 hours is false. This is a naive assumption. 

Total Snow Depth: there are two values less than zero, so i wil just drop those rows

In [10]:
df.shape

(21303, 16)

In [11]:
#wind_index = df.loc[df['Wind Speed Minimum (mph)'] <= 0].index
#change all of these to zeros

df['Wind Speed Minimum (mph)'] = df['Wind Speed Minimum (mph)'].clip(lower=0)

In [12]:
dropped_24snow_index = df.loc[df['24 Hour Snow (in)'] <= -12 ].index
#only losing two rows

df.loc[df['24 Hour Snow (in)'] <= -12 ].shape

(482, 16)

In [13]:
df.shape[0]- df.drop(index=dropped_24snow_index,axis=0).shape[0]

482

In [14]:
df = df.drop(index=dropped_24snow_index,axis=0)

In [15]:
df.shape

(20821, 16)

In [16]:
dropped_totalsnow_index= df.loc[df['Total Snow Depth (in)'] <= 0].index
#only losing two rows


In [17]:
df = df.drop(index=dropped_totalsnow_index,axis=0)

In [18]:
dropped_temp_index = df.loc[df['Temperature (deg F)'] <= -10].index

#-10 Farenheit is cold, so let's use that as a cutoff
#only losing three rows

In [19]:
df = df.drop(index=dropped_temp_index,axis=0)


In [20]:
df.shape

(20816, 16)

In [21]:
df.head(3)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,,,,,,4
1,2019-04-30 22:00:00,12.82,34.52,4.91,11.97,16.22,275.2,2019-04-30,56.09,73.82,,,,,,4
10,2019-04-30 13:00:00,12.67,41.23,0.0,4.946,18.17,292.8,2019-04-30,62.73,73.3,,,,,,4


In [22]:
df.describe()

Unnamed: 0,Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),24 Hour Snow (in),Total Snow Depth (in),danger_above_treeline,danger_below_treeline,danger_near_treeline,month
count,20809.0,20808.0,20808.0,20808.0,20808.0,18945.0,17201.0,17201.0,19063.0,19063.0,19063.0,20816.0
mean,12.539524,26.746334,2.437918,6.423208,12.014106,178.791021,5.478905,91.369089,2.474322,1.95048,2.298956,4.38456
std,2.578988,9.202646,5.172718,6.07332,8.403344,99.929033,16.288344,29.626413,0.780668,0.775005,0.770728,3.997433
min,0.0,-2.007,0.0,0.0,0.0,0.0,-9.82,1.722,1.0,1.0,1.0,1.0
25%,12.61,21.53,0.0,2.88775,7.19,86.2,0.294,73.48,2.0,1.0,2.0,2.0
50%,12.66,26.73,0.279,4.6255,10.0,239.5,0.52,91.5,2.0,2.0,2.0,3.0
75%,12.74,31.86,2.8805,8.0425,14.07,270.6,2.527,113.9,3.0,2.0,3.0,4.0
max,45.98,68.51,53.25,84.9,71.79,359.7,166.3,220.5,4.0,4.0,4.0,12.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20816 entries, 0 to 9999
Data columns (total 16 columns):
Date/Time (PST)             20816 non-null datetime64[ns]
Battery Voltage (v)         20809 non-null float64
Temperature (deg F)         20808 non-null float64
Wind Speed Minimum (mph)    20808 non-null float64
Wind Speed Average (mph)    20808 non-null float64
Wind Speed Maximum (mph)    20808 non-null float64
Wind Direction (deg.)       18945 non-null float64
date                        20816 non-null datetime64[ns]
24 Hour Snow (in)           17201 non-null float64
Total Snow Depth (in)       17201 non-null float64
area                        19358 non-null object
danger_above_treeline       19063 non-null float64
danger_below_treeline       19063 non-null float64
danger_near_treeline        19063 non-null float64
date_tomorrow               19358 non-null object
month                       20816 non-null int64
dtypes: datetime64[ns](2), float64(11), int64(1), object(2)
memory

In [24]:
df.head(1)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,,,,,,4


In [25]:
df = df.sort_values(by='Date/Time (PST)',ascending=True)

In [26]:
df['max_24_hour_temp']= df['Temperature (deg F)'].rolling(24).max()
df['min_24_hour_temp']= df['Temperature (deg F)'].rolling(24).min()

In [27]:
df.tail(10)

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month,max_24_hour_temp,min_24_hour_temp
9,2019-04-30 14:00:00,12.66,39.14,0.0,3.424,23.65,292.0,2019-04-30,61.79,73.43,,,,,,4,46.16,30.03
8,2019-04-30 15:00:00,12.65,41.2,0.044,7.277,19.29,283.4,2019-04-30,63.0,72.87,,,,,,4,46.16,30.03
7,2019-04-30 16:00:00,12.64,38.55,0.482,8.72,19.79,284.7,2019-04-30,61.66,72.85,,,,,,4,46.16,30.03
6,2019-04-30 17:00:00,12.64,38.35,0.0,5.536,12.43,278.6,2019-04-30,59.24,73.22,,,,,,4,46.16,30.03
5,2019-04-30 18:00:00,13.52,38.46,0.0,8.7,21.96,287.2,2019-04-30,60.25,73.48,,,,,,4,46.16,30.03
4,2019-04-30 19:00:00,13.58,37.24,0.088,9.15,20.58,283.8,2019-04-30,58.71,73.74,,,,,,4,46.16,30.03
3,2019-04-30 20:00:00,12.93,36.32,5.283,12.32,16.57,276.0,2019-04-30,57.7,73.84,,,,,,4,46.16,30.12
2,2019-04-30 21:00:00,12.85,35.28,8.18,12.7,17.65,274.7,2019-04-30,57.03,73.71,,,,,,4,46.16,30.67
1,2019-04-30 22:00:00,12.82,34.52,4.91,11.97,16.22,275.2,2019-04-30,56.09,73.82,,,,,,4,46.16,30.67
0,2019-04-30 23:00:00,12.8,33.07,5.962,9.23,13.33,267.0,2019-04-30,54.89,74.06,,,,,,4,46.16,30.67


In [28]:
df['max_48_hour_temp']= df['Temperature (deg F)'].rolling(24).max()
df['min_48_hour_temp']= df['Temperature (deg F)'].rolling(24).min()

In [29]:
df.iloc[[50]]

Unnamed: 0,Date/Time (PST),Battery Voltage (v),Temperature (deg F),Wind Speed Minimum (mph),Wind Speed Average (mph),Wind Speed Maximum (mph),Wind Direction (deg.),date,24 Hour Snow (in),Total Snow Depth (in),area,danger_above_treeline,danger_below_treeline,danger_near_treeline,date_tomorrow,month,max_24_hour_temp,min_24_hour_temp,max_48_hour_temp,min_48_hour_temp
21252,2014-12-19 15:00:00,13.13,28.66,0.005,0.573,10.0,,2014-12-19,,,Stevens Pass,3.0,2.0,2.0,"December 20, 2014",12,29.5,23.61,29.5,23.61


In [30]:
df.columns

Index(['Date/Time (PST)', 'Battery Voltage (v)', 'Temperature (deg F)',
       'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
       'Wind Speed Maximum (mph)', 'Wind Direction (deg.)', 'date',
       '24 Hour Snow (in)', 'Total Snow Depth (in)', 'area',
       'danger_above_treeline', 'danger_below_treeline',
       'danger_near_treeline', 'date_tomorrow', 'month', 'max_24_hour_temp',
       'min_24_hour_temp', 'max_48_hour_temp', 'min_48_hour_temp'],
      dtype='object')

In [31]:
df.isna().sum()

Date/Time (PST)                0
Battery Voltage (v)            7
Temperature (deg F)            8
Wind Speed Minimum (mph)       8
Wind Speed Average (mph)       8
Wind Speed Maximum (mph)       8
Wind Direction (deg.)       1871
date                           0
24 Hour Snow (in)           3615
Total Snow Depth (in)       3615
area                        1458
danger_above_treeline       1753
danger_below_treeline       1753
danger_near_treeline        1753
date_tomorrow               1458
month                          0
max_24_hour_temp             215
min_24_hour_temp             215
max_48_hour_temp             215
min_48_hour_temp             215
dtype: int64

In [32]:
no_avy_index = df.loc[df['danger_above_treeline'].isna()].index
df = df.drop(index=no_avy_index,axis=0)

In [33]:
df.isna().sum()

#wind direction and 23 hour snow depth must also be removed


Date/Time (PST)                0
Battery Voltage (v)            5
Temperature (deg F)            6
Wind Speed Minimum (mph)       6
Wind Speed Average (mph)       6
Wind Speed Maximum (mph)       6
Wind Direction (deg.)       1835
date                           0
24 Hour Snow (in)           3217
Total Snow Depth (in)       3217
area                           0
danger_above_treeline          0
danger_below_treeline          0
danger_near_treeline           0
date_tomorrow                  0
month                          0
max_24_hour_temp             159
min_24_hour_temp             159
max_48_hour_temp             159
min_48_hour_temp             159
dtype: int64

In [34]:
no_snow_depth = df.loc[df['Total Snow Depth (in)'].isna()].index

In [35]:
df.loc[df['Total Snow Depth (in)'].isna()].shape[0]

3217

In [36]:
df.shape[0] - df.drop(index=no_snow_depth).shape[0]

3217

In [37]:
df = df.drop(index=no_snow_depth)

In [38]:
df.isna().sum()


Date/Time (PST)               0
Battery Voltage (v)           0
Temperature (deg F)           1
Wind Speed Minimum (mph)      1
Wind Speed Average (mph)      1
Wind Speed Maximum (mph)      1
Wind Direction (deg.)         1
date                          0
24 Hour Snow (in)             0
Total Snow Depth (in)         0
area                          0
danger_above_treeline         0
danger_below_treeline         0
danger_near_treeline          0
date_tomorrow                 0
month                         0
max_24_hour_temp            154
min_24_hour_temp            154
max_48_hour_temp            154
min_48_hour_temp            154
dtype: int64

In [39]:
df_clean = df.dropna()

## Now, what do i want my X's to be?? 

X: temp, wind speed average, 24 hour snow (in), max/min 24 and 48 H temp 

Y: danger_near_treeline

In [40]:
df_clean = df_clean.reset_index(drop=True)

In [41]:
df_clean.columns

Index(['Date/Time (PST)', 'Battery Voltage (v)', 'Temperature (deg F)',
       'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
       'Wind Speed Maximum (mph)', 'Wind Direction (deg.)', 'date',
       '24 Hour Snow (in)', 'Total Snow Depth (in)', 'area',
       'danger_above_treeline', 'danger_below_treeline',
       'danger_near_treeline', 'date_tomorrow', 'month', 'max_24_hour_temp',
       'min_24_hour_temp', 'max_48_hour_temp', 'min_48_hour_temp'],
      dtype='object')

In [42]:
df_clean.shape[0]*.80


12553.6

In [43]:
X = df_clean[[
        'Battery Voltage (v)',
        'Temperature (deg F)',
        'Wind Speed Minimum (mph)', 
        'Wind Speed Average (mph)',
        'Wind Speed Maximum (mph)', 
        '24 Hour Snow (in)',
        'Total Snow Depth (in)',
        'max_24_hour_temp', 'min_24_hour_temp',
        'max_48_hour_temp', 'min_48_hour_temp']]

y = df_clean['danger_near_treeline']

In [44]:
X_train, X_test, y_train, y_test = X[:12553], X[12553:], y[:12553], y[12553:]

DO A CHRONOLIGCAL SPLIT, because its a time series problem


In [45]:
X_train.isna().sum()

Battery Voltage (v)         0
Temperature (deg F)         0
Wind Speed Minimum (mph)    0
Wind Speed Average (mph)    0
Wind Speed Maximum (mph)    0
24 Hour Snow (in)           0
Total Snow Depth (in)       0
max_24_hour_temp            0
min_24_hour_temp            0
max_48_hour_temp            0
min_48_hour_temp            0
dtype: int64

In [46]:
ss = StandardScaler()

In [47]:
mapper = DataFrameMapper([
    (['Temperature (deg F)', 
        'Wind Speed Average (mph)',
        'Wind Speed Maximum (mph)', 
        '24 Hour Snow (in)',
        'Total Snow Depth (in)',
        'max_24_hour_temp', 'min_24_hour_temp',
        'max_48_hour_temp', 'min_48_hour_temp'], ss)
    ])

In [48]:
lr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=100)
gbr = GradientBoostingRegressor(n_estimators=100)
knnr = KNeighborsRegressor()
xgbr = XGBRegressor(objective='reg:squarederror')
xgbrfr = XGBRFRegressor(objective='reg:squarederror')
regressors = [lr, rfr, gbr, knnr, xgbr,xgbrfr ]

In [49]:
reg_names = ['LinearRegression',
             'RandomForestRegressor',
             'GradientBoostingRegressor',
             'KNeighborsRegressor',
             'XGBRegressor',
             'XGBRFRegressor'
            ]

In [50]:
baseline_pred = np.mean(y_train)
residuals = (y_test - baseline_pred)
median_absolute_error = np.median(np.abs(residuals))
rmse=np.sqrt(np.mean(residuals**2))

print(median_absolute_error,rmse)

print(f'The MAE of just predicting the mean'
          f' is: {median_absolute_error}\n'
          f'The RMSE just predicting the mean'
          f' is: {rmse}\n'
      '==================================================================\n'
          '========================\n')

for regressor,name in zip(regressors,reg_names): #zip returns a tuple for each pair it is given
    pipe = Pipeline(steps=[
        ('scaler',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error((y_test),preds)}\n'
          f'The RMSE of the {name}'
          f' is: {mean_squared_error((y_test),preds)}\n'
          '==================================================================\n'
          '========================\n'
         )
    


0.5528558910220664 0.8704630100921337
The MAE of just predicting the mean is: 0.5528558910220664
The RMSE just predicting the mean is: 0.8704630100921337

The MAE of the LinearRegression is: 0.721349558499491
The RMSE of the LinearRegression is: 0.6985482364779043

The MAE of the RandomForestRegressor is: 0.7453909925955959
The RMSE of the RandomForestRegressor is: 0.7918044790946913

The MAE of the GradientBoostingRegressor is: 0.6905870858669245
The RMSE of the GradientBoostingRegressor is: 0.6658530381827482

The MAE of the KNeighborsRegressor is: 0.751322077094616
The RMSE of the KNeighborsRegressor is: 0.8627460974832749



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


The MAE of the XGBRegressor is: 0.683367763480095
The RMSE of the XGBRegressor is: 0.6484898935847316



  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


The MAE of the XGBRFRegressor is: 0.6874787549981649
The RMSE of the XGBRFRegressor is: 0.6315169737263702



In [51]:
y_test[0:20]

12553    2.0
12554    2.0
12555    2.0
12556    2.0
12557    2.0
12558    2.0
12559    2.0
12560    2.0
12561    2.0
12562    2.0
12563    2.0
12564    2.0
12565    2.0
12566    2.0
12567    2.0
12568    2.0
12569    2.0
12570    2.0
12571    2.0
12572    2.0
Name: danger_near_treeline, dtype: float64

In [52]:
pipe.predict(X_test)[0:20]

array([2.2995653, 2.3104956, 2.2995653, 2.6982124, 2.7104504, 2.731259 ,
       2.7102377, 2.7227201, 2.736037 , 2.736037 , 2.736037 , 2.731259 ,
       2.7259722, 2.707441 , 2.7080333, 2.7080333, 2.7104504, 2.6982124,
       2.6982124, 2.699021 ], dtype=float32)

## Model 2 
No regression in my pipeline peforms better than assuming the mean :/

## What can I add to supplement my model?
- Add more lag features. 
    Some avalanche causes take weeks to resolve, so need to go back to help my model reach that 
- take in oncoming weather data
- use barametric pressure?

In [53]:
bar_p = pd.read_csv('csv_files/stevens_pass_barometric_pressure_2014_now.csv')

In [54]:
bar_p.head()

Unnamed: 0,Date/Time (PST),mb - 2700' - Berne,mb - 3950' - Stevens Pass - Schmidt Haus
0,2019-06-28 12:00,1017.0,1018.0
1,2019-06-28 11:00,1016.7,1018.0
2,2019-06-28 10:00,1016.6,1019.0
3,2019-06-28 09:00,1016.9,1018.0
4,2019-06-28 08:00,1017.1,1018.0


In [55]:
bar_p.describe()

Unnamed: 0,mb - 2700' - Berne,mb - 3950' - Stevens Pass - Schmidt Haus
count,39901.0,40818.0
mean,1014.58139,1015.308593
std,6.507781,6.392107
min,986.31,987.0
25%,1011.3,1012.0
50%,1015.1,1016.0
75%,1018.5,1019.0
max,1037.3,1036.0


In [56]:
bar_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41700 entries, 0 to 41699
Data columns (total 3 columns):
Date/Time (PST)                             41700 non-null object
mb - 2700' - Berne                          39901 non-null float64
mb - 3950' - Stevens Pass - Schmidt Haus    40818 non-null float64
dtypes: float64(2), object(1)
memory usage: 977.4+ KB


In [57]:
bar_p['Date/Time (PST)'] = pd.to_datetime(bar_p["Date/Time (PST)"])

In [58]:
bar_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41700 entries, 0 to 41699
Data columns (total 3 columns):
Date/Time (PST)                             41700 non-null datetime64[ns]
mb - 2700' - Berne                          39901 non-null float64
mb - 3950' - Stevens Pass - Schmidt Haus    40818 non-null float64
dtypes: datetime64[ns](1), float64(2)
memory usage: 977.4 KB


In [59]:
bar_p = bar_p[["Date/Time (PST)","mb - 3950\' - Stevens Pass - Schmidt Haus"]]

In [60]:
df_merge = pd.merge(df_clean,bar_p,how='left',on='Date/Time (PST)')

In [61]:
df_merge = df_merge.dropna()

In [62]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15686 entries, 0 to 15691
Data columns (total 21 columns):
Date/Time (PST)                             15686 non-null datetime64[ns]
Battery Voltage (v)                         15686 non-null float64
Temperature (deg F)                         15686 non-null float64
Wind Speed Minimum (mph)                    15686 non-null float64
Wind Speed Average (mph)                    15686 non-null float64
Wind Speed Maximum (mph)                    15686 non-null float64
Wind Direction (deg.)                       15686 non-null float64
date                                        15686 non-null datetime64[ns]
24 Hour Snow (in)                           15686 non-null float64
Total Snow Depth (in)                       15686 non-null float64
area                                        15686 non-null object
danger_above_treeline                       15686 non-null float64
danger_below_treeline                       15686 non-null float64
danger_

In [63]:
precip = pd.read_csv('csv_files/stevens_pass_precipitation_2014_now.csv')
precip['Date/Time (PST)'] = pd.to_datetime(precip["Date/Time (PST)"])

In [64]:
precip.columns = ['Date/Time (PST)', '2700ft_Berne','3950ft_SchmidtHaus','4800ft_Brooks_precip']

In [65]:
precip.head()

Unnamed: 0,Date/Time (PST),2700ft_Berne,3950ft_SchmidtHaus,4800ft_Brooks_precip
0,2019-06-28 12:00:00,0.0,0.0,
1,2019-06-28 11:00:00,0.0,0.0,
2,2019-06-28 10:00:00,0.0,0.0,
3,2019-06-28 09:00:00,0.0,0.0,
4,2019-06-28 08:00:00,0.0,0.0,


In [66]:
precip.columns

Index(['Date/Time (PST)', '2700ft_Berne', '3950ft_SchmidtHaus',
       '4800ft_Brooks_precip'],
      dtype='object')

In [67]:
precip_brooks = precip[['Date/Time (PST)','4800ft_Brooks_precip']]

In [68]:
shot_snow = pd.read_csv('csv_files/stevens-pass_intermittent_snow.csv')
shot_snow.head()

Unnamed: 0,Date/Time (PST),""" - 4790' - Stevens Pass - Grace Lakes"
0,2019-06-28 12:00,-7999.0
1,2019-06-28 11:00,-7999.0
2,2019-06-28 10:00,-7999.0
3,2019-06-28 09:00,-7999.0
4,2019-06-28 08:00,-7999.0


In [69]:
shot_snow["Date/Time (PST)"] = pd.to_datetime(shot_snow["Date/Time (PST)"])

In [70]:
pd.merge(merge_final,shot_snow,on="Date/Time (PST)").info()

NameError: name 'merge_final' is not defined

In [None]:
merge_final = pd.merge(df_merge,precip_brooks,on="Date/Time (PST)")

In [None]:
merge_final.isna().sum()

In [None]:
merge_final.describe()

In [None]:
merge_final['24 Hour Snow (in)'].plot();

In [None]:
merge_final.info()

In [None]:
mapper = DataFrameMapper([
    (['Temperature (deg F)', 
        'Wind Speed Average (mph)',
        'Wind Speed Maximum (mph)', 
        '24 Hour Snow (in)',
        'Total Snow Depth (in)',
        'max_24_hour_temp', 'min_24_hour_temp',
        'max_48_hour_temp', 'min_48_hour_temp',
        "mb - 3950' - Stevens Pass - Schmidt Haus",
        '4800ft_Brooks_precip'], ss)
    ])

In [None]:
X = merge_final[[
        'Battery Voltage (v)',
        'Temperature (deg F)',
        'Wind Speed Minimum (mph)', 
        'Wind Speed Average (mph)',
        'Wind Speed Maximum (mph)',
        'Wind Direction (deg.)',
        '24 Hour Snow (in)',
        'Total Snow Depth (in)',
        'max_24_hour_temp', 'min_24_hour_temp',
        'max_48_hour_temp', 'min_48_hour_temp',
        "mb - 3950' - Stevens Pass - Schmidt Haus",
        '4800ft_Brooks_precip'
]]

y = merge_final['danger_near_treeline']

In [None]:
X_train, X_test, y_train, y_test = X[:12553], X[12553:], y[:12553], y[12553:]

In [None]:
baseline_pred = np.mean(y_train)
residuals = (y_test - baseline_pred)
median_absolute_error = np.median(np.abs(residuals))
rmse=np.sqrt(np.mean(residuals**2))

print(median_absolute_error,rmse)

print(f'The MAE of just predicting the mean'
          f' is: {median_absolute_error}\n'
          f'The RMSE just predicting the mean'
          f' is: {rmse}\n'
      '==================================================================\n'
          '========================\n')

for regressor,name in zip(regressors,reg_names): #zip returns a tuple for each pair it is given
    pipe = Pipeline(steps=[
        ('scaler',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error((y_test),preds)}\n'
          f'The RMSE of the {name}'
          f' is: {mean_squared_error((y_test),preds)}\n'
          '==================================================================\n'
          '========================\n'
         )
    
