In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dt
from xgboost import XGBRFRegressor, XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error


In [2]:
# Load the modelling frame and parse the timestamp column in one chained step.
df = pd.read_json('json_archive/model_5_df.json').assign(
    **{'Date/Time (PST)': lambda frame: pd.to_datetime(frame['Date/Time (PST)'])}
)

In [3]:
# Put the observations in chronological order (ascending is the default).
df = df.sort_values('Date/Time (PST)')

In [4]:
# One row per timestamp: keep the first occurrence, remember the original row
# label in `id`, keep a plain `Date` copy of the timestamp, then move the
# timestamp column into the index.
ddf = (
    df.drop_duplicates(subset='Date/Time (PST)')
    .assign(
        id=lambda frame: frame.index,
        Date=lambda frame: frame['Date/Time (PST)'],
    )
    .set_index('Date/Time (PST)')
)



In [None]:
# Display-only check: show the snowfall column cast to float.
# The result is not assigned, so ddf itself is unchanged.
ddf['24 Hour Snow (in)'].astype(float)

In [None]:
# Show the `Date` values of the ddf rows stamped 2015-12-04 01:00:00.
ddf.loc[ddf["Date"] == '2015-12-04 01:00:00',"Date"]

In [None]:
# Same lookup against the pre-deduplication frame `df`, for comparison with ddf.
df.loc[df["Date/Time (PST)"] == '2015-12-04 01:00:00','Date/Time (PST)']

In [None]:
# Summarise ddf: index range, column dtypes and non-null counts.
ddf.info()

In [None]:
# def get_previous_day_snow_kv(df, dt):
#     target_hour = dt - pd.Timedelta('1 day')
#     if target_hour in df.Date:
#         return df.loc[target_hour,'24 Hour Snow (in)']
#     else:
#         return None

In [5]:
import pdb

In [6]:
def get_previous_day_snow(df, dt):
    """Return the '24 Hour Snow (in)' value recorded exactly 24 hours before `dt`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by timestamp with a '24 Hour Snow (in)' column.
    dt : timestamp-like
        Observation time; must support subtraction of a ``pd.Timedelta``.
        (Inside this function the name shadows the module-level ``dt`` alias.)

    Returns
    -------
    numpy.float64
        The snowfall value from 24 hours earlier, or ``np.nan`` when that
        timestamp is absent from the index or the stored value is not numeric.
    """
    target_hour = dt - pd.Timedelta('24 hours')
    if target_hour not in df.index:
        return np.nan

    snow = df.loc[target_hour, '24 Hour Snow (in)']
    if isinstance(snow, pd.Series):
        # Duplicate index labels make .loc return a Series; use the first match
        # so the caller always receives a scalar.
        if snow.empty:
            return np.nan
        snow = snow.iloc[0]

    # Coerce to a float instead of dropping into the debugger, so the derived
    # column gets a numeric dtype rather than `object`.
    try:
        return np.float64(snow)
    except (TypeError, ValueError):
        return np.nan

In [7]:
# Build the lag feature row by row: for every timestamp in the index, fetch the
# 24-hour snowfall recorded one day earlier (NaN when that hour is absent).
ddf['Prev Day 24 Hour Snow (in)'] = [
    get_previous_day_snow(ddf, timestamp) for timestamp in ddf.index
]


In [8]:
# Re-check dtypes and non-null counts now that the previous-day column exists.
ddf.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12619 entries, 2015-12-04 00:00:00 to 2019-04-13 23:00:00
Data columns (total 27 columns):
Battery Voltage (v)           12619 non-null float64
Temperature (deg F)           12619 non-null float64
Wind Speed Minimum (mph)      12619 non-null float64
Wind Speed Average (mph)      12619 non-null float64
Wind Speed Maximum (mph)      12619 non-null float64
Wind Direction (deg.)         12619 non-null float64
date                          12619 non-null datetime64[ns]
24 Hour Snow (in)             12619 non-null float64
Total Snow Depth (in)         12619 non-null float64
area                          12619 non-null object
danger_above_treeline         12619 non-null int64
danger_below_treeline         12619 non-null int64
danger_near_treeline          12619 non-null int64
date_tomorrow                 12619 non-null object
month                         12619 non-null int64
max_1_day_temp                12619 non-null float64
min_1_day_t

In [None]:
# Number of missing values in each column of ddf.
ddf.isna().sum()

In [None]:
# Show the derived previous-day snowfall column as a one-column DataFrame.
ddf[['Prev Day 24 Hour Snow (in)']]

In [None]:
# Last 50 rows of ddf.
ddf.tail(50)

In [None]:
# (rows, columns) of the de-duplicated frame vs. the original frame.
ddf.shape , df.shape

In [None]:
# weather.Temp.convert_objects(convert_numeric=True)

In [None]:
# Show the derived previous-day snowfall column as a one-column DataFrame.
# (The original line had a stray `]` after `[[`, which is a SyntaxError.)
ddf[['Prev Day 24 Hour Snow (in)']]
#.astype(float,errors='ignore')

In [None]:
# Number of missing values in each column of ddf.
ddf.isna().sum()

In [None]:
# Column dtypes and non-null counts for ddf.
ddf.info()

In [None]:
#from pandas.api.types import is_numeric_dtype

In [None]:
#ddf.loc[ddf['Prev Day 24 Hour Snow (in)'].apply(lambda x: x.is_numeric_dtype)]

In [None]:
# Column dtypes and non-null counts for ddf.
ddf.info()

In [None]:
# Keep only fully populated rows (drops any row containing at least one NaN).
ddf_clean = ddf.dropna(axis=0, how='any')

In [None]:
# Missing values per column of ddf_clean (all zero after dropna()).
ddf_clean.isna().sum()

In [None]:
# Previous-day snowfall column of the cleaned frame.
ddf_clean[['Prev Day 24 Hour Snow (in)']]

In [None]:
# Missing values per column of the uncleaned ddf, for comparison with ddf_clean.
ddf.isna().sum()

In [None]:
# Keep at most the first 15698 rows (positional slice).
# NOTE(review): this rebinds ddf, and ddf_clean was derived before this cell runs.
ddf = ddf.iloc[:15698]

In [None]:
# Row count of ddf minus 25.
# NOTE(review): the reason for the 25-row offset is not shown in this notebook — confirm.
ddf.shape[0]-25

The function defined above is returning objects rather than numbers.

In [None]:
# Column dtypes and non-null counts for ddf.
ddf.info()

In [None]:
# Summary statistics for the numeric columns of the cleaned frame.
ddf_clean.describe()

In [None]:
# Feature columns for the first round of models.
feature_cols = [
    'Battery Voltage (v)', 'Temperature (deg F)',
    'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
    'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
    '24 Hour Snow (in)', 'Total Snow Depth (in)', 'max_1_day_temp',
    'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
    'max_1_day_snow',
]

X = ddf_clean[feature_cols]
y = ddf_clean['danger_near_treeline']

# Chronological hold-out: the earliest 80% of rows train, the rest test.
# The split row is computed from the data instead of a hard-coded row number,
# which silently leaves a tiny (or empty) test set whenever the row count changes.
TRAIN_FRACTION = 0.8
split_idx = int(len(X) * TRAIN_FRACTION)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [None]:
# Fixed seed so the tree ensembles give the same scores on every run.
RANDOM_STATE = 42

ss = StandardScaler()

lr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE)
gbr = GradientBoostingRegressor(n_estimators=100, random_state=RANDOM_STATE)
knnr = KNeighborsRegressor()
regressors = [lr, rfr, gbr, knnr]

# Display names derived from the estimators themselves, so the two lists cannot
# drift out of step when a model is added or removed.
reg_names = [type(regressor).__name__ for regressor in regressors]

In [None]:
# Standard-scale every feature column of the first model set in one group.
mapper = DataFrameMapper(
    features=[
        (
            [
                'Battery Voltage (v)',
                'Temperature (deg F)',
                'Wind Speed Minimum (mph)',
                'Wind Speed Average (mph)',
                'Wind Speed Maximum (mph)',
                'Wind Direction (deg.)',
                '24 Hour Snow (in)',
                'Total Snow Depth (in)',
                'max_1_day_temp',
                'min_1_day_temp',
                'max_2_day_temp',
                'min_2_day_temp',
                'max_1_day_snow',
            ],
            ss,
        ),
    ]
)

In [None]:
# Baseline: always predict the training-set mean.
baseline_pred = np.mean(y_train)
residuals = (y_test - baseline_pred)
# Mean (not median) absolute error, so the baseline is on the same footing as
# sklearn's mean_absolute_error used for the models below.
baseline_mae = np.mean(np.abs(residuals))
rmse = np.sqrt(np.mean(residuals**2))

print(baseline_mae, rmse)

print(f'The MAE of just predicting the mean'
          f' is: {baseline_mae}\n'
          f'The RMSE just predicting the mean'
          f' is: {rmse}\n'
      '==================================================================\n'
          '========================\n')

# Fit each regressor on the unscaled features and report held-out error.
for regressor, name in zip(regressors, reg_names):
    pipe = Pipeline(steps=[
        #('scaler',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    # mean_squared_error returns MSE; take the square root to report RMSE.
    model_rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error((y_test),preds)}\n'
          f'The RMSE of the {name}'
          f' is: {model_rmse}\n'
          '==================================================================\n'
          '========================\n'
         )
    

In [None]:
# Read the precipitation CSV, then display its column names as loaded.
precip_df= pd.read_csv('csv_files/stevens_pass_precipitation_2014_now.csv')


precip_df.columns 

In [None]:
# Rename the columns positionally; this requires the CSV to have exactly four
# columns, in this order.
# NOTE(review): the last three look like station names prefixed by elevation — confirm.
precip_df.columns = ['Date/Time (PST)', '2700ft_berne',
       '3950ft_schmidt_haus',
       '4800_brooks']

In [None]:
# Summary statistics for the numeric precipitation columns.
precip_df.describe()

In [None]:
# Parse the timestamp column so it can be matched against the weather frame.
precip_df = precip_df.assign(
    **{'Date/Time (PST)': pd.to_datetime(precip_df['Date/Time (PST)'])}
)

In [None]:
# Timestamp-indexed copy of the precipitation data, with the original row
# label preserved in `id`.
precip_ddf = (
    precip_df
    .assign(id=precip_df.index)
    .set_index('Date/Time (PST)')
)

In [None]:
# Left-join precipitation onto the weather frame by timestamp; every ddf row is
# kept and unmatched hours get NaN in the precipitation columns.
df_merge = ddf.merge(precip_df, how='left', on="Date/Time (PST)")

In [None]:
# Summary statistics for the merged frame.
df_merge.describe()

In [None]:
# Column names available after the merge.
df_merge.columns

In [None]:
# Keep only merged rows with no missing value in any column.
df_cleaned = df_merge.dropna(axis=0, how='any')

In [None]:
# 80% of the cleaned row count.
# NOTE(review): presumably the source of the hard-coded split row used in the
# following cell — verify.
df_cleaned.shape[0]*.80

In [None]:
# Feature columns for the second round of models (weather plus precipitation).
# Declared once so the scaler mapping and the design matrix cannot disagree.
feature_cols = [
    'Battery Voltage (v)', 'Temperature (deg F)',
    'Wind Speed Minimum (mph)', 'Wind Speed Average (mph)',
    'Wind Speed Maximum (mph)', 'Wind Direction (deg.)',
    'Total Snow Depth (in)', 'max_1_day_temp',
    'min_1_day_temp', 'max_2_day_temp', 'min_2_day_temp',
    'max_1_day_snow', 'max_2_day_snow', 'max_3_day_snow', '4800_brooks',
]

# Standard-scale every feature column in one group.
mapper = DataFrameMapper([
    (feature_cols, ss)
    ])

X = df_cleaned[feature_cols]

y = df_cleaned['danger_near_treeline']

# Chronological hold-out: the earliest 80% of rows train, the rest test.
# The split row is computed from the data rather than hard-coded, so it stays
# correct when the number of cleaned rows changes.
TRAIN_FRACTION = 0.8
split_idx = int(len(X) * TRAIN_FRACTION)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

In [None]:
# Baseline: always predict the training-set mean.
baseline_pred = np.mean(y_train)
residuals = (y_test - baseline_pred)
# Mean (not median) absolute error, so the baseline is on the same footing as
# sklearn's mean_absolute_error used for the models below.
baseline_mae = np.mean(np.abs(residuals))
rmse = np.sqrt(np.mean(residuals**2))

print(baseline_mae, rmse)

print(f'The MAE of just predicting the mean'
          f' is: {baseline_mae}\n'
          f'The RMSE just predicting the mean'
          f' is: {rmse}\n'
      '==================================================================\n'
          '========================\n')

# Fit each regressor behind the scaling mapper and report held-out error.
for regressor, name in zip(regressors, reg_names):
    pipe = Pipeline(steps=[
        ('scaler',mapper),
        ('regressor', regressor)
    ])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    # mean_squared_error returns MSE; take the square root to report RMSE.
    model_rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f'The MAE of the {name}'
          f' is: {mean_absolute_error((y_test),preds)}\n'
          f'The RMSE of the {name}'
          f' is: {model_rmse}\n'
          '==================================================================\n'
          '========================\n'
         )
    

In [None]:
# Dedicated pipeline for the gradient-boosting model: scale, then regress.
pipe2 = Pipeline(steps=[
    ('scaler', mapper),
    ('regressor', gbr)
])
pipe2.fit(X_train, y_train)
preds = pipe2.predict(X_test)

# Label the report from the estimator actually inside pipe2, not from the
# leftover loop variable `name` (which holds the last regressor of the loop).
pipe2_name = type(gbr).__name__
# mean_squared_error returns MSE; take the square root to report RMSE.
pipe2_rmse = np.sqrt(mean_squared_error(y_test, preds))
print(f'The MAE of the {pipe2_name}'
      f' is: {mean_absolute_error((y_test),preds)}\n'
      f'The RMSE of the {pipe2_name}'
      f' is: {pipe2_rmse}\n'
      '==================================================================\n'
      '========================\n'
)

In [None]:
# Held-out predictions of the gradient-boosting pipeline, as a plain list.
y_predict = [prediction for prediction in pipe2.predict(X_test)]

In [None]:
# The true labels and the predictions should have the same length.
len(y_test),len(y_predict)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

In [None]:
# First rows of the held-out feature matrix.
X_test.head()

In [None]:
# Number of held-out predictions.
len(y_predict)

In [None]:
# Convert the prediction list back to a NumPy array (rebinds y_predict).
y_predict = np.array(y_predict)

In [None]:
# Container type and length of the held-out labels.
type(y_test),len(y_test)

In [None]:
# Side-by-side table of true labels and predictions for the held-out rows.
df_y = pd.DataFrame(dict(y_true=y_test, y_predict=y_predict))

In [None]:
import seaborn as sns
# Use seaborn's "whitegrid" style for the plots in this notebook.
sns.set(style="whitegrid")


def violin_plot_error(model, X, y_true, rescale=False):
    """Draw a violin plot of model predictions grouped by the true label.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``.
    X : features passed to ``model.predict``.
    y_true : true labels, one per row of ``X``; used as the x-axis grouping.
    rescale : bool, default False
        When True, linearly map the predictions so their 5th percentile lands
        on 1 and their 95th percentile lands on 4 (a span of 3).

    Returns
    -------
    The object returned by ``set_title`` on the violin-plot axes (the title
    text), exactly as before.
    """
    import warnings

    # Float copy: keeps the arithmetic below valid for integer predictions and
    # never modifies the array handed back by the model.
    y_pred = np.array(model.predict(X), dtype=float)

    # Pass the style explicitly; a bare sns.set(font_scale=...) would reset the
    # notebook's "whitegrid" style back to seaborn's default.
    sns.set(style="whitegrid", font_scale=1.5)

    if rescale:
        y_pred_5, y_pred_95 = np.percentile(y_pred, [5, 95])
        spread = y_pred_95 - y_pred_5
        if spread == 0:
            # Nearly constant predictions: the scale factor would be 3 / 0.
            warnings.warn(
                'violin_plot_error: 5th and 95th percentile of the predictions '
                'are equal; plotting unscaled predictions.'
            )
        else:
            y_pred = (y_pred - y_pred_5) * (3 / spread) + 1

    df_y = pd.DataFrame({'Human Forecast': y_true, 'Model Prediction': y_pred})

    fig, axes = plt.subplots(figsize=(7, 7))
    axes.set_ylim([-0.5, 6])

    violin = sns.violinplot(
        x='Human Forecast', y='Model Prediction', data=df_y, ax=axes
    ).set_title('Danger Level Comparison')
    return violin

In [None]:
# Predictions of the gradient-boosting pipeline, the same model that is plotted
# and pickled below. `pipe` is only the last pipeline left over from the
# comparison loop, so it would give predictions from a different regressor.
y_preds = pipe2.predict(X_test)

In [None]:
# 5th and 95th percentile of the predictions, used as rescaling anchors.
y_pred_5, y_pred_95 = np.percentile(y_preds, [5, 95])

In [None]:
# Factor that stretches the 5th-to-95th percentile span of the predictions to 3.
# It is infinite or NaN if the two percentiles are equal.
scaling_factor = 3 / (y_pred_95 - y_pred_5)

In [None]:
# Display the computed scaling factor.
scaling_factor


In [None]:
# Display the 5th-percentile prediction.
y_pred_5

In [None]:
# Shift the predictions so the 5th percentile sits at 0.
# In-place: re-running this cell shifts y_preds again.
y_preds -= y_pred_5

In [None]:
# Stretch the shifted predictions by the scaling factor.
# In-place: re-running this cell scales y_preds again.
y_preds *= scaling_factor

In [None]:
# Offset by 1 so the 5th percentile maps to 1.
# In-place: re-running this cell adds 1 again.
y_preds += 1

In [None]:
# Spot-check one rescaled prediction; this needs at least 201 held-out rows.
y_preds[200]

In [None]:
# df_violin = pd.melt(df_y,var_name='groups',value_name='danger level')

In [None]:
# Held-out set, raw predictions (the trailing `;` suppresses the repr of the return value).
violin_plot_error(pipe2, X_test, y_test);

In [None]:
# Held-out set, predictions rescaled by their 5th/95th percentiles.
violin_plot_error(pipe2, X_test, y_test, rescale=True);


In [None]:
# Training set, raw predictions, for comparison with the held-out plot.
violin_plot_error(pipe2, X_train, y_train);

In [None]:
# Training set, predictions rescaled by their 5th/95th percentiles.
violin_plot_error(pipe2, X_train, y_train,rescale=True);

In [None]:
# List the (name, transformer/estimator) steps inside pipe2.
pipe2.steps

In [None]:
import pickle

In [None]:
# save the model to disk
filename = 'avy_danger_prediction.pkl'
pickle.dump(pipe2, open(filename, 'wb'))
 

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
# One-row slice (the second held-out row), kept as a DataFrame.
X_test[1:2]

In [None]:
# Feature columns the saved model is given at prediction time.
X_test.columns

In [None]:
# Sanity check: predict with the reloaded model on the one-row slice X_test[1:2].
result = loaded_model.predict(X_test[1:2])
print(result)

In [None]:
# First rows of the held-out feature matrix.
X_test.head()