In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

In [4]:
# %load ../src/website/pipeline_classes.py
import pandas as pd 
import numpy as np 
from sklearn.base import BaseEstimator, TransformerMixin

class Featurizer(BaseEstimator, TransformerMixin):
    """Expand the ``date`` column of a weather frame into calendar features.

    ``transform`` replaces ``date`` with integer ``year``/``month``/``day``
    columns plus one indicator column per weekday (``dayofweek_<Name>``).
    All other input columns are passed through unchanged.
    """

    # Weekday names indexed by ``Series.dt.dayofweek`` (0 = Monday).
    _DAY_NAMES = ('Monday', 'Tuesday', 'Wednesday', 'Thursday',
                  'Friday', 'Saturday', 'Sunday')

    def __init__(self, cols=None):
        """INPUT: an optional cols list of column names.

        NOTE: ``cols`` is only stored on the instance; ``transform`` does not
        use it to select columns.
        """
        if cols is None:
            self.cols = ['date', 'temp', 'precipitation', 'overcast', 'poor_visibility', 'windy']
        else:
            self.cols = cols

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Transform incoming training or test data.

        INPUT: DataFrame with a ``date`` column (datetime-like, or anything
               ``pd.to_datetime`` can parse).
        OUTPUT: new DataFrame indexed by date, without the ``date`` column and
                with ``year``, ``month``, ``day`` and seven ``dayofweek_*``
                indicator columns appended. ``X`` itself is not modified.
        """
        df = X.copy()
        df['date'] = pd.to_datetime(df['date'])
        df.index = df['date']
        date_column = pd.Series(df.index)

        # Map weekday numbers to names in one vectorised step instead of
        # chained-indexing assignment, which does not write through under
        # pandas copy-on-write.
        day_names = date_column.dt.dayofweek.map(dict(enumerate(self._DAY_NAMES)))

        # A fixed category list makes get_dummies emit all seven weekday
        # columns even when the input holds only a few rows. Alphabetical
        # order matches the column order get_dummies produces for plain
        # strings, so full-week inputs yield the same layout as before.
        weekday = pd.Categorical(day_names, categories=sorted(self._DAY_NAMES))

        month_day_of_week = pd.DataFrame({"year": date_column.dt.year,
                                          "month": date_column.dt.month,
                                          "day": date_column.dt.day,
                                          "dayofweek": weekday})
        month_day_of_week = pd.get_dummies(month_day_of_week)
        month_day_of_week.index = df.index

        features = pd.concat([df, month_day_of_week], axis=1)
        return features.drop(['date'], axis=1)

In [6]:
# %load ../src/scrape_weather.py
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import datetime

def _fetch_forecast_page():
    """Download and parse the weather.com forecast page used by the scrapers."""
    page_link = 'https://weather.com/weather/tenday/l/USWA0413:1:US'
    page_response = requests.get(page_link, timeout=5)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    page_response.raise_for_status()
    return BeautifulSoup(page_response.content, "html.parser")


def get_raw_forecast(day, page_content=None):
    '''
    Get raw text from weather.com's forecast table for the day passed in.
    Day = 1 will be today.

    Input:
        day: int between 1 and 15, the day of the desired forecast.
        page_content: optional already-parsed page (any object with a
            ``find_all('td')`` method). When omitted the page is downloaded.

    Output: list of strings
    Raw text for the day chosen, in the following order:
        [daymonth date, weather forecast, hi/lo temp, % chance precipitation,
         wind speed and direction, humidity]

    Raises:
        IndexError: if the page holds fewer than 105 <td> cells.
    '''
    cells_per_day = 7
    cells_needed = 105  # 15 days * 7 cells

    if page_content is None:
        page_content = _fetch_forecast_page()

    # Query the document once rather than once per cell.
    cells = page_content.find_all('td')
    if len(cells) < cells_needed:
        raise IndexError(
            'expected at least %d <td> cells on the forecast page, found %d'
            % (cells_needed, len(cells)))

    web_predictions = [cell.text for cell in cells[:cells_needed]]
    # Each day spans 7 cells; the first cell of the group is skipped.
    return web_predictions[(day*cells_per_day - 6):(day*cells_per_day)]


def get_raw_forecast_dataframe(page_content=None):
    '''
    Get latest raw weather forecast dataframe for the next 15 days.

    Input:
        page_content: optional already-parsed page, forwarded to
            get_raw_forecast. When omitted the page is downloaded once and
            shared by all 15 days.
    Output: raw dataframe.
    '''
    if page_content is None:
        page_content = _fetch_forecast_page()

    #Get list of forecasts for the next 15 days from the single parsed page
    forecasts = [get_raw_forecast(day, page_content) for day in range(1, 16)]

    #Set todays date to be used as the start of the dataframe index
    todays_date = datetime.datetime.now().date()
    index = pd.date_range(todays_date, periods=15, freq='D')
    #Set the columns for the dataframe
    columns = ['date', 'weather', 'temp', 'precipitation', 'wind', 'humidity']
    #Create the raw forecast dataframe with forecasts list, index and column headings
    forecasts_df = pd.DataFrame(forecasts, index=index, columns=columns)
    return forecasts_df

def get_hi_temperature(temp_string):
    '''
    Read the leading run of digits from a weather.com hi/lo string as the high
    temperature.

    If the string does not start with a digit (the site shows -- for the high
    temperature of 'Tonight'), the low temperature is returned instead.
    Input: string
    Output: int
    '''
    digits = '0123456789'
    end = 0
    # Advance past the leading digits; stop at the first other character.
    while end < len(temp_string) and temp_string[end] in digits:
        end += 1

    leading = temp_string[:end]
    if not leading:
        return get_low_temperature(temp_string)
    return int(leading)

def get_low_temperature(temp_string):
    '''
    Read the low temperature from a weather.com hi/lo string.

    The final character of the string is discarded and the run of digits
    immediately before it is returned as an integer.
    Input: string
    Output: int
    '''
    digits = '0123456789'
    trimmed = temp_string[:-1]
    start = len(trimmed)
    # Walk backwards over the trailing digits.
    while start > 0 and trimmed[start - 1] in digits:
        start -= 1
    return int(trimmed[start:])

def get_wind_speed(temp_string):
    '''
    Convert the wind string from weather.com to an integer wind speed.

    Every digit found anywhere in the string is kept, in order, and the
    concatenation is parsed as a single integer.
    Input: string
    Output: int
    '''
    digits = '0123456789'
    return int(''.join(char for char in temp_string if char in digits))

def get_overcast(weather_string):
    """
    Take the weather string from weather.com and convert to a 1 if the string
    contains any word in overcast_list, otherwise return a 0.

    Every word is checked (not just the first), so "Mostly Cloudy" and
    "AM Showers" both count as overcast. An empty string yields 0.
    Input: string
    output: int(0 or 1)
    """
    overcast_list = ['Cloudy', 'Snow', 'Rain', 'Showers', 'Thunderstorms']
    # Treat '/' as a separator so combined conditions such as "Rain/Snow"
    # are matched word by word.
    words = weather_string.replace('/', ' ').split()
    return 1 if any(word in overcast_list for word in words) else 0

def get_poor_visibility(weather_string):
    """
    Take the weather string from weather.com and convert to a 1 if the string
    contains any word in poor_visibility_list, otherwise return a 0.

    Every word is checked (not just the first), so "Heavy Rain" and
    "AM Fog" both count as poor visibility. An empty string yields 0.
    Input: string
    output: int(0 or 1)
    """
    poor_visibility_list = ['Snow', 'Rain', 'Fog', 'Mist']
    # Treat '/' as a separator so combined conditions such as "Rain/Snow"
    # are matched word by word.
    words = weather_string.replace('/', ' ').split()
    return 1 if any(word in poor_visibility_list for word in words) else 0

def snoqualmie_pass_prediction_components():
    """
    Scrape the latest forecast and reduce it to the features used to obtain
    closure probabilities.

    Output: dataframe with one row per forecast day and the columns
    date, temp, precipitation, windy, overcast, poor_visibility.
    """
    raw = get_raw_forecast_dataframe()

    # High and low come from the same hi/lo string; temp is their mean.
    high = raw['temp'].map(get_hi_temperature)
    low = raw['temp'].map(get_low_temperature)

    # The leading-integer parser is re-used to read the % chance of precipitation.
    precip_chance = raw['precipitation'].map(get_hi_temperature)
    wind_speed = raw['wind'].map(get_wind_speed)

    cleaned = pd.DataFrame(
        {
            'temp': (high + low) / 2,
            # 1 when the chance of precipitation is >= 30%
            'precipitation': precip_chance.apply(lambda pct: 1 if (pct >= 30) else 0),
            # 1 when the wind speed is >= 10mph
            'windy': wind_speed.apply(lambda mph: 1 if (mph >= 10) else 0),
            'overcast': raw['weather'].map(get_overcast),
            'poor_visibility': raw['weather'].map(get_poor_visibility),
        },
        index=raw.index,
    )

    # Move the date index into a regular leading 'date' column.
    return cleaned.reset_index().rename(columns={'index': 'date'})



In [7]:
# Scrape the current forecast and build the cleaned feature frame.
# This performs live HTTP requests, so the result changes from run to run.
forecasts_df = snoqualmie_pass_prediction_components()

In [8]:
# Display the cleaned forecast features produced by the scrape.
forecasts_df

Unnamed: 0,date,temp,precipitation,windy,overcast,poor_visibility
0,2018-12-05,20.0,0,0,0,0
1,2018-12-06,20.5,0,1,0,0
2,2018-12-07,22.0,0,0,0,0
3,2018-12-08,25.0,0,0,0,0
4,2018-12-09,26.0,1,0,1,1
5,2018-12-10,28.0,1,0,1,1
6,2018-12-11,30.0,1,0,1,1
7,2018-12-12,29.0,1,0,1,1
8,2018-12-13,27.0,1,0,1,1
9,2018-12-14,25.0,1,0,1,1


In [10]:
# Load the pickled pipeline; the path is relative to the kernel's current
# working directory.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles that come from a trusted source.
with open('../data/pickled_pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

In [1]:
# Change the kernel's working directory (IPython `cd` automagic).
# NOTE(review): presumably needed so `pipeline_classes` is importable and the
# '../../data/...' paths used afterwards resolve; this makes later cells
# depend on where the kernel was started — confirm.
cd ../src/website

/Users/leanne/galvanize/Pass_Closure/src/website


In [4]:
# %load ../src/website/prediction.py
import pandas as pd
import numpy as np
from pipeline_classes import Featurizer
from sklearn.ensemble import RandomForestClassifier
import pickle

# Load the pickled pipeline once at import time; get_predictions reads this
# module-level `pipe`. The path is relative to the current working directory.
# NOTE(review): presumably the pickle contains a Featurizer step, which is why
# Featurizer is imported above — confirm.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles that come from a trusted source.
with open('../../data/pickled_pipe.pkl', 'rb') as f:
    pipe = pickle.load(f)

def get_predictions(df):
    """Run the module-level pickled pipeline on newly scraped forecast data.

    Input: pandas dataframe
    Output: whatever ``pipe.predict_proba`` returns for ``df`` (a numpy array
            of predictions)
    """
    return pipe.predict_proba(df)

def get_one_prediction(row: dict) -> float:
    """Predict for a single event, i.e. one row from the Mongo DB where the
    scraped weather data is stored.

    The dict is wrapped into a one-row dataframe, passed to get_predictions,
    and the value at row 0, column 1 of the result is returned.
    Input: dict
    Output: float
    """
    single_row_frame = pd.DataFrame([row])
    return get_predictions(single_row_frame)[0, 1]


In [14]:
# Score every scraped forecast day with the pickled pipeline; the output has
# one row per day and one column per predicted class.
get_predictions(forecasts_df)

array([[0.90666667, 0.09333333],
       [0.915     , 0.085     ],
       [0.94333333, 0.05666667],
       [0.93      , 0.07      ],
       [0.85      , 0.15      ],
       [0.83166667, 0.16833333],
       [0.86333333, 0.13666667],
       [0.88833333, 0.11166667],
       [0.88833333, 0.11166667],
       [0.86      , 0.14      ],
       [0.92666667, 0.07333333],
       [0.905     , 0.095     ],
       [0.86166667, 0.13833333],
       [0.78      , 0.22      ],
       [0.84333333, 0.15666667]])

In [6]:
# Load a second pickled pipeline from 'final_pickled_pipe.pkl' into
# `pipe_final`; the path is relative to the current working directory.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles that come from a trusted source.
with open('../../data/final_pickled_pipe.pkl', 'rb') as f:
    pipe_final = pickle.load(f)