In [77]:
# %load ../src/pipeline_classes.py
import pandas as pd 
import numpy as np 
from sklearn.base import BaseEstimator, TransformerMixin

class Featurizer(BaseEstimator, TransformerMixin):
    """Transform incoming df to fit into model"""
   
    def __init__(self, cols=None):
        """INPUT: an optional cols list of columns to select"""
        if cols==None:
            self.cols = ['date', 'temp', 'precipitation', 'overcast', 'poor_visibility', 'windy']
        else:
            self.cols = cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """tranform incoming training or test"""
        df = X.copy()
        df.index = df.date
        date_column = pd.Series(df.index)
        month_day_of_week = pd.DataFrame({"year": date_column.dt.year,
                                        "month": date_column.dt.month, 
                                        "day": date_column.dt.day,
                                        "dayofweek": date_column.dt.dayofweek})
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 0] = 'Monday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 1] = 'Tuesday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 2] = 'Wednesday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 3] = 'Thursday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 4] = 'Friday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 5] = 'Saturday'
        month_day_of_week.dayofweek[month_day_of_week.dayofweek == 6] = 'Sunday'
        month_day_of_week = pd.get_dummies(month_day_of_week)
        month_day_of_week.index = df.index
        features = pd.concat([df, month_day_of_week], axis=1)
        features.drop(['date'], axis=1, inplace=True)
        return features

In [10]:
# %load ../src/weather_data_clean.py
import pandas as pd

def clean_weather_data(filename):
    """Take ASOS weather data file for Stampede pass and clean it ready for input to model.
    Input: txt file
    Output: pandas dataframe
    """
    data = pd.read_csv(filename)

    # Rename two of the columns
    data.rename(columns={'valid':'date', 'tmpf':'temp'}, inplace=True)

    # Remove the few rows that have a null value for temp
    data = data[~data.temp.eq('M')]

    # Remove spaces from column names
    data.rename(columns=lambda x: x.replace(' ', ''), inplace=True)

    # Only use the standard hourly weather reading at 56 mins past each hour
    mask = data['date'].apply(lambda x: x[-2:] == '56')
    data = data[mask]

    # Create a date series to be used in the clean dataframe
    date = pd.to_datetime(data['date'])

    # Create a temp series to be used in the clean dataframe
    temp = data['temp'].apply(float)

    # Cast the null value M to zero to enable create of the raw precipitation series cast to floats
    data.p01i[data.p01i == 'M'] = 0
    raw_precipitation = data['p01i'].apply(float)

    # Create a precipitation series to be used in the clean dataframe
    precipitation = raw_precipitation.apply(lambda x: True if (x > 0) else False)

    # Convert sky coverage data to clear or cloudy and create an overcast series to be used in the clean dataframe
    sky_elements = ['skyc1', 'skyc2', 'skyc3']
    data.skyc1 = data.skyc1.astype(str)
    data.skyc2 = data.skyc2.astype(str)
    data.skyc3 = data.skyc3.astype(str)
    sky_agg = data[sky_elements].values.tolist()
    sky_reduce = [['overcast' if (('BKN' in element) or ('OVC' in element) or ('VV' in element)) else 'clear'
                    for element in row] for row in sky_agg]
    overcast = pd.Series([True if 'overcast' in row else False for row in sky_reduce])
    overcast.index = date.index

    # Cast the null value 'M' to 10.00 to enable the creation of a poor visibility series
    data.vsby[data.vsby == 'M'] = 10.00
    raw_visibility = data['vsby'].apply(float)
    poor_visibility = pd.Series([True if value < 0.50 else False for value in raw_visibility])
    poor_visibility.index = date.index

    # Cast the null value 'M' to 0 to enable the creation of a windy series
    data.sknt[data.sknt == 'M'] = 0.00
    data.gust[data.gust == 'M'] = 0.00
    wind_speed = data['sknt'].apply(float)
    gust_speed = data['gust'].apply(float)
    wind_df = pd.concat([wind_speed, gust_speed], axis=1)
    # Finally apply the function f to enable the creation of the windy column
    windy = wind_df.apply(f, axis=1)

    """Create the cleaned dataframe by concatenating the date, temp, precipitation, overcast, poor_visibility
    and windy series"""
    df = pd.concat([date, temp, precipitation, overcast, poor_visibility, windy], axis=1)
    df.columns = ['date', 'temp', 'precipitation', 'overcast', 'poor_visibility', 'windy']
    cleaned_df = df[(df['date'] > '2006-12-31') & (df['date'] < '2018-04-03')]
    return cleaned_df

def f(row):
    """Function to be able to create the windy series with windy being true if wind speed is above 10 knots
    or gust speed is above 20 knots"""
    if row['sknt'] >= 10.00:
        val = True
    elif row['gust'] >= 20.00:
        val = True
    else:
        val = False
    return val




In [12]:
# %load ../src/pass_data_clean.py
import pandas as pd

def clean_pass_data(filename):
    """Take Snoqualmie pass closure data file and clean it ready for input to model.
    Input: xlsx file
    Output: Pandas dataframe
    """
    data = pd.read_excel(filename, header=[1])

    #drop unnamed/unnecessary columns
    data.drop(data.columns[[11,12,13,14]], axis=1, inplace=True)

    #drop unnecessary secondary incident columns
    data.drop(data.columns[[1,8]], axis=1, inplace=True)

    #rename 'Incident...' columns to start_time and end_time 
    data.rename(columns={'INCIDENT START TIMES FOR EACH DIRECTION':'start_time'}, inplace=True)
    data.rename(columns={'INCIDENT END TIMES - DIRECTIONAL':'end_time'}, inplace=True)

    #use only dates from 2007-01-01 to match with available weather and traffic volume data
    df = data[(data['start_time'] > '2006-12-31')]

    #rename 'Delay Time Total' to delay
    df.rename(columns={'Delay Time Total':'delay'}, inplace=True)

    #drop row with nan value in delay
    df = df.dropna(subset=['delay'])

    #create a westbound pandas series with True if westbound and false if eastbound
    westbound = pd.Series([True if value == 'WB' else False for value in df.DIRECTION])

    #create a snow pandas series with True if weather description contains sn, false otherwise
    snow = df.WEATHER.str.contains('sn', case=False, na=False, regex=True)

    #create pandas series for start and end times
    start_time = pd.to_datetime(df['start_time'])
    end_time = pd.to_datetime(df['end_time'])

    #ensure that all the pandas series created have the same index
    westbound.index = start_time.index
    snow.index = start_time.index
    end_time.index = start_time.index

    #create cleaned df with the series created
    cleaned_df = pd.concat([start_time, end_time, westbound, snow], axis=1)

    #rename columns
    cleaned_df.rename(columns={0:'westbound', 'WEATHER':'snow'}, inplace=True)

    return cleaned_df



In [17]:
# %load ../src/combine_data.py
import pandas as pd
#from pass_data_clean import clean_pass_data

pass_closure_df = clean_pass_data('Cumulative_Snoqualmie_Pass_Delay_Closures_1992_2018.xlsx')

def get_pass_closure(date_time):
    """take a date_time and check if it is between the start and end times of a closure event
    input: datetime
    output: boolean
    """
    start_end_times = list(zip(pass_closure_df.start_time, pass_closure_df.end_time))
    for row in start_end_times:
        if row[0] <= date_time <= row[1]:
            return True
    return False

def add_pass_closed(df):
    """take the weather df and add a new column for whether or not the pass is closed at each date_time
    input: pandas dataframe
    output: pandas dataframe
    """
    df['pass_closed'] = df['date'].map(get_pass_closure)
    return df

def true_false_to_one_zero(df):
    """take the combined df and change all the true/false values to 1/0
    input: pandas dataframe
    output: pandas dataframe
    """
    df[['precipitation', 'overcast', 'poor_visibility', 'windy', 'pass_closed']] = (
        df[['precipitation', 'overcast', 'poor_visibility', 'windy', 'pass_closed']] == True).astype(int)
    return df

def aggregate_data_to_daily(df):
    """take the combined df and aggregate the data into daily rather than hourly data to be used to train the model
    input: pandas dataframe
    output: pandas dataframe
    """
    df.index = df.date
    daily_df = df.resample("D").agg({'temp':'mean','precipitation':'max', 'overcast':'max', 'poor_visibility':'max', 'windy':'max', 'pass_closed':'max'})
    daily_df.dropna(inplace=True)
    daily_df.reset_index(inplace=True)
    daily_df.rename(columns={'index':'date'}, inplace=True)
    return daily_df


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [31]:
# %load ../src/model.py
import pandas as pd
import numpy as np
#from pipeline_classes import Featurizer
#from weather_data_clean import clean_weather_data
#from pass_data_clean import clean_pass_data
#from combine_data import get_pass_closure, add_pass_closed, true_false_to_one_zero, aggregate_data_to_daily
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import pickle

def get_data():
    """function to get all the original data that is required to train the model
    Output: pandas dataframe to use to train model"""
    weather_df = clean_weather_data('../data_exploration/ASOS_stampede_pass/SMP-2.txt')
    combined_df = add_pass_closed(weather_df)
    combined_df = true_false_to_one_zero(combined_df)
    daily_df = aggregate_data_to_daily(combined_df)
    return daily_df

def get_training_data():
    """get the training data that used to train the model
    Output: X, y used to train the model"""
    df = get_data()
    y = df['pass_closed']
    X = df.drop('pass_closed', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)
    return X_train, X_test, y_train, y_test

def pass_pipeline():
    """instantiate a pipeline object"""
    pipeline = Pipeline([
        ('featurizer', Featurizer()),
        ('model', RandomForestClassifier(n_estimators=600, 
                                         max_depth=40))
        ])
    return pipeline

def pickle_pipeline(pipeline, output_name):
    """Save fitted pipeline to pickle file"""
    with open(output_name, 'wb') as f:
        pickle.dump(pipeline, f)


In [32]:
X_train, X_test, y_train, y_test = get_training_data()

  """Entry point for launching an IPython kernel.


In [70]:
X_train

Unnamed: 0,date,temp,precipitation,overcast,poor_visibility,windy
1962,2014-06-11,57.368750,0.0,1.0,0.0,0.0
1839,2013-02-18,22.460000,0.0,1.0,0.0,0.0
655,2008-11-11,40.718261,1.0,1.0,1.0,0.0
610,2008-09-27,49.527500,0.0,1.0,1.0,0.0
2228,2016-06-04,67.925000,0.0,0.0,0.0,0.0
1444,2011-07-09,47.588000,0.0,0.0,0.0,0.0
1271,2010-10-26,30.380000,1.0,1.0,1.0,0.0
2209,2016-05-16,42.350000,0.0,1.0,1.0,0.0
1357,2011-02-21,23.747000,1.0,1.0,1.0,1.0
2031,2014-08-22,51.080000,0.0,0.0,0.0,0.0


In [78]:
features = Featurizer()

In [79]:
features.transform(X_train)

Unnamed: 0_level_0,temp,precipitation,overcast,poor_visibility,windy,year,month,day,dayofweek_Friday,dayofweek_Monday,dayofweek_Saturday,dayofweek_Sunday,dayofweek_Thursday,dayofweek_Tuesday,dayofweek_Wednesday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2014-06-11,57.368750,0.0,1.0,0.0,0.0,2014,6,11,0,0,0,0,0,0,1
2013-02-18,22.460000,0.0,1.0,0.0,0.0,2013,2,18,0,1,0,0,0,0,0
2008-11-11,40.718261,1.0,1.0,1.0,0.0,2008,11,11,0,0,0,0,0,1,0
2008-09-27,49.527500,0.0,1.0,1.0,0.0,2008,9,27,0,0,1,0,0,0,0
2016-06-04,67.925000,0.0,0.0,0.0,0.0,2016,6,4,0,0,1,0,0,0,0
2011-07-09,47.588000,0.0,0.0,0.0,0.0,2011,7,9,0,0,1,0,0,0,0
2010-10-26,30.380000,1.0,1.0,1.0,0.0,2010,10,26,0,0,0,0,0,1,0
2016-05-16,42.350000,0.0,1.0,1.0,0.0,2016,5,16,0,1,0,0,0,0,0
2011-02-21,23.747000,1.0,1.0,1.0,1.0,2011,2,21,0,1,0,0,0,0,0
2014-08-22,51.080000,0.0,0.0,0.0,0.0,2014,8,22,1,0,0,0,0,0,0


In [80]:
pipe = pass_pipeline()

In [81]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('featurizer', Featurizer(cols=['date', 'temp', 'precipitation', 'overcast', 'poor_visibility', 'windy'])), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=40, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrea...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])