# Data Cleaning + Preparation

In [78]:
from datetime import date, datetime, timedelta
import pandas as pd
import holidays
import pytz

## Functions

In [47]:
def removeTzRows(data, dtList):
    """Drop the rows of `data` whose index label is listed in `dtList`.

    Meant for datetimes that do not exist due to tz changes. `data` is
    modified in place and also returned. Labels that are not present in the
    index are skipped, and a message is printed for every label dropped.
    """
    for stamp in dtList:
        if stamp not in data.index:
            continue
        data.drop(stamp, inplace=True)
        print(str(stamp) + " dropped.")

    return data


def newIndex(start, end, delta):
    """Yield evenly spaced values for a new index used for reindexing.

    Arguments:
    start -- first value yielded (only if it is smaller than `end`)
    end -- exclusive upper bound; `end` itself is never yielded
    delta -- step added after every yielded value

    Raises ValueError once iteration begins if the range is non-empty and
    `delta` does not move forward (zero or negative step), because the loop
    could then never reach `end`.
    """
    curr = start

    # a step that does not advance would make this generator run forever
    if curr < end and not curr + delta > curr:
        raise ValueError("delta must be positive, got " + repr(delta))

    while curr < end:
        yield curr
        curr += delta

## Constants

In [75]:
# time window [startDt, endDt) that the cleaned data is reindexed onto
startDt = datetime(2017, 1, 1)
endDt = datetime(2020, 1, 1)
# datetimes that should not exist due to tz changes:
# 02:00 and 02:30 on one day in March of each covered year
tzDates = [
    datetime(year, 3, day, 2, minute)
    for year, day in ((2017, 12), (2018, 11), (2019, 10))
    for minute in (0, 30)
]
# US holiday calendar used for the holiday dummy
usHolidays = holidays.UnitedStates()

## Load and preprocess data

In [118]:
def prepData(data, delta=30):
    """Clean a datetime-indexed frame and put it on a regular time grid.

    Steps, in order: replace missing values with 0, drop the rows listed in
    tzDates, drop rows with a repeated index label (the first one is kept),
    reindex onto [startDt, endDt) in steps of `delta` minutes and
    forward-fill the rows introduced by the reindexing.

    Arguments:
    data -- DataFrame indexed by datetime; the passed frame is not modified
    delta -- spacing of the new index in minutes (default 30)

    Returns the cleaned and reindexed DataFrame.
    """
    # some zones have missing values
    # that are supposed to be zero.
    # (not in place, so the caller's frame stays untouched)
    data = data.fillna(0, axis=1)

    # drop the hour between 2 and 3
    # when US time switches to summer time
    data = removeTzRows(data, tzDates)

    # drop duplicate index rows
    isDuplicate = data.index.duplicated(keep='first')
    duplicateNumber = int(isDuplicate.sum())
    data = data.loc[~isDuplicate]
    print(str(duplicateNumber) + " duplicates found and removed.")

    # create new index with a time interval of `delta` minutes
    reindx = list(newIndex(startDt, endDt, timedelta(minutes=delta)))

    # reindex so there are no missing datetimes
    data = data.reindex(reindx)

    # fill missing values with their previous one
    # (ffill() replaces the deprecated fillna(method="ffill"))
    data = data.ffill()

    return data

In [119]:
# load the zone pickup data; the "time_bin" column is parsed as datetimes
# and becomes the index
pu = pd.read_csv("zonePickups.csv", index_col="time_bin", parse_dates=["time_bin"])

In [120]:
# clean the pickup data and reindex it onto the regular time grid
pu = prepData(pu)

2017-03-12 02:00:00 dropped.
2017-03-12 02:30:00 dropped.
2018-03-11 02:30:00 dropped.
0 duplicates found and removed.


In [121]:
# create a list containing all zone IDs
# (taken before any feature columns are added, so only the columns
# that came from the pickup file are captured)
zoneIds = pu.columns.tolist()

## Feature Adding

In [122]:
def addCalendarFeatures(data):
    """Add calendar columns derived from the datetime index of `data`.

    Adds the integer columns "year", "month", "hour" and "weekday"
    (Monday = 0 ... Sunday = 6) plus "holiday", a 0/1 dummy that is 1 when
    the timestamp is contained in usHolidays.

    Arguments:
    data -- DataFrame whose index holds datetimes

    Returns `data`; the columns are added to the passed frame itself.
    """
    # read the calendar fields straight from the index instead of going
    # through a temporary "dt" column (which would overwrite and then drop
    # an existing column of that name) and one Python call per row
    idx = pd.DatetimeIndex(data.index)

    # add year, month, hour and weekday
    data["year"] = idx.year
    data["month"] = idx.month
    data["hour"] = idx.hour
    data["weekday"] = idx.weekday

    # add dummy for if a day is a US holiday
    data["holiday"] = [int(ts in usHolidays) for ts in idx]

    return data

In [123]:
# add the year/month/hour/weekday/holiday columns derived from the index
pu = addCalendarFeatures(pu)

# Weather Data

In [124]:
def loadWeatherData(path, tarTimezone="US/Eastern", dateFormat="%Y-%m-%d %H:%M"):
    """Read weather data, transform datetime to target timezone, set as index.

    Keyword arguments:
    path -- path of the weather CSV file; it must contain a "dt_iso" column
            and the weather columns selected at the end of this function
    tarTimezone -- name of the timezone the timestamps are transformed into
    dateFormat -- strftime format the converted timestamps are rendered
                  with before being parsed back; whatever the format leaves
                  out (seconds and UTC offset with the default) is dropped

    Returns a DataFrame indexed by "dt" with the columns temp, feels_like,
    temp_min, temp_max, humidity, wind_speed and weather_main.
    """
    w = pd.read_csv(path)
    targetTz = pytz.timezone(tarTimezone)

    # the last 10 characters of dt_iso are cut off before parsing
    # (presumably a " +0000 UTC" suffix -- TODO confirm against the file);
    # the remainder is read as UTC
    utcTimes = pd.to_datetime(w["dt_iso"].str[:-10], utc=True)

    # convert the whole column at once, render it with dateFormat and parse
    # it back with that same format, so no format guessing is involved
    localText = utcTimes.dt.tz_convert(targetTz).dt.strftime(dateFormat)
    w["dt"] = pd.to_datetime(localText, format=dateFormat)
    w = w.set_index("dt")

    return w[["temp", "feels_like", "temp_min", "temp_max", "humidity", "wind_speed", "weather_main"]]

In [125]:
# read the weather file; timestamps are converted to the default
# target timezone of loadWeatherData ("US/Eastern")
w = loadWeatherData("weather.csv")

In [126]:
# apply the same cleaning and reindexing as for the pickup data,
# so both frames share the same datetime index before joining
w = prepData(w)

322 duplicates found and removed.


In [127]:
# join data with weather data
# (rows are matched on the datetime index; every row of pu is kept)
pu = pu.join(w)

In [128]:
# check if all columns are joined correctly:
# list every column that is not one of the zone columns
pu.drop(columns=zoneIds).columns

Index(['year', 'month', 'hour', 'weekday', 'holiday', 'temp', 'feels_like',
       'temp_min', 'temp_max', 'humidity', 'wind_speed', 'weather_main'],
      dtype='object')

In [130]:
# check if every day has 48 entries
# NOTE: this only verifies that the total number of rows is a multiple
# of 48; it does not look at the individual days
len(pu) % 48 == 0

True

In [131]:
# write the final frame (zones, calendar and weather columns) to disk
pu.to_csv("zonePickupsFinal.csv")

In [134]:
def allInOne(path, weatherPath="weather.csv", saveFileName=None):
    """Run the whole preparation pipeline and return the final frame.

    Loads the pickup file, cleans it with prepData, adds the calendar
    features, joins the cleaned weather data and optionally saves the result.

    Keyword arguments:
    path -- path of the pickup CSV file; it must contain a "time_bin" column
    weatherPath -- path of the weather CSV file
    saveFileName -- if given, the result is written to this CSV file

    Returns the joined DataFrame.
    """
    # load data, set index as type datetime
    data = pd.read_csv(path, index_col="time_bin", parse_dates=["time_bin"])

    data = prepData(data)

    # create a list containing all zone IDs
    # (before any feature columns are added)
    zoneIds = data.columns.tolist()

    data = addCalendarFeatures(data)

    w = loadWeatherData(weatherPath)
    w = prepData(w)

    # join data with weather data
    data = data.join(w)

    # check if all columns are joined correctly
    cols = data.drop(columns=zoneIds).columns
    print("columns ex zones:")
    for col in cols:
        print(col)

    # check if the row count is a multiple of 48; this must look at the
    # frame built here, not at the notebook-level variable `pu`
    if len(data) % 48 == 0:
        print("No odd rows.")
    else:
        print("There might be odd/duplicate index rows. Check!")

    if saveFileName is not None:
        data.to_csv(saveFileName)
        print("Saved as " + saveFileName)

    return data

In [136]:
#test = allInOne("zonePickups.csv", saveFileName="test.csv")