holiday, workingdayに関連した特徴量をpickleファイルとしてdumpするnotebook  
メモリを抑える処理などは次のnotebookから拝借した https://www.kaggle.com/kyakovlev/m5-simple-fe

dumpされたpickleファイルのkeyは[id, d]

In [1]:
import numpy as np
import pandas as pd
import datetime
from datetime import date
import os, sys, gc, time, warnings, pickle, psutil, random

In [2]:
!pip install workalendar==8.4.0



In [2]:
from workalendar.usa import California
from workalendar.usa import Texas
from workalendar.usa import Wisconsin

In [3]:
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

In [4]:
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)

In [5]:
## Memory Reducer
# :df pandas dataframe to reduce size             # type: pd.DataFrame()
# :verbose                                        # type: bool
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [6]:
def make_holiday(df):
    
    def get_holiday(year, month, day, state_id):
        if state_id == "CA":
            return 1 if California().is_holiday(date(year, month, day)) else 0
        elif state_id == "TX":
            return 1 if Texas().is_holiday(date(year, month, day)) else 0
        elif state_id == "WI":
            return 1 if Wisconsin().is_holiday(date(year, month, day)) else 0
        else:
            return 0
        
    def get_working_day(year, month, day, state_id):
        if state_id == "CA":
            return 1 if California().is_working_day(date(year, month, day)) else 0
        elif state_id == "TX":
            return 1 if Texas().is_working_day(date(year, month, day)) else 0
        elif state_id == "WI":
            return 1 if Wisconsin().is_working_day(date(year, month, day)) else 0
        else:
            return 0
        
    tmp = df.drop_duplicates(subset=["date", "state_id"])
    
    years  = tmp["date"].dt.year.values
    months = tmp["date"].dt.month.values
    days   = tmp["date"].dt.day.values
    states = tmp["state_id"].values
    
    holidays = []
    workingdays = []
    
    for i in range(len(years)):
        holidays.append(get_holiday(years[i], months[i], days[i], states[i]))
        workingdays.append(get_working_day(years[i], months[i], days[i], states[i]))
        
    # df["is_holiday"] = df.apply(lambda x: get_holiday(x["year"], x["month"], x["mday"], x["state_id"]), axis=1)
    tmp["is_holiday"] = holidays
    tmp["is_workingday"] = workingdays
    for_merge = tmp[["date", "state_id", "is_holiday", "is_workingday"]]
    df = pd.merge(df, for_merge, on=["date", "state_id"], how="left")
    
    return df

In [7]:
def days_to_next_holiday(df):
    cal_holidays = {}
    for state_id in ["CA", "TX", "WI"]:
        if state_id == "CA":
            cal = California()
        elif state_id == "TX":
            cal = Texas()
        elif state_id == "WI":
            cal = Wisconsin()
        else:
            raise RuntimeError
        
        holidays = []
        for year in range(2011, 2017):
            for holiday in cal.holidays(year):
                if holiday[0] in holidays:
                    continue
                holidays.append(holiday[0])
        
        cal_holidays[state_id] = holidays
    
    tmp = df.drop_duplicates(subset=["date", "state_id"])
    days_to_next_holiday = []
    for i, row in tmp.iterrows():
        d = row["date"].date()
        holidays = cal_holidays[row["state_id"]]
        for h in range(len(holidays) - 1):
            if holidays[h + 1] == d:
                days_to_next_holiday.append(int((holidays[h + 2] - d).days))
                break
            elif holidays[h] <= d < holidays[h + 1]:
                days_to_next_holiday.append(int((holidays[h + 1] - d).days))
                break
            else:
                pass

    assert len(tmp) == len(days_to_next_holiday), print(len(tmp), len(days_to_next_holiday))
    tmp["days_to_next_holiday"] = days_to_next_holiday
    for_merge = tmp[["date", "state_id", "days_to_next_holiday"]]
    df = pd.merge(df, for_merge, on=["date", "state_id"], how="left")
    
    return df

def days_from_prev_holiday(df):
    cal_holidays = {}
    for state_id in ["CA", "TX", "WI"]:
        if state_id == "CA":
            cal = California()
        elif state_id == "TX":
            cal = Texas()
        elif state_id == "WI":
            cal = Wisconsin()
        else:
            raise RuntimeError
        
        holidays = []
        for year in range(2011, 2017):
            for holiday in cal.holidays(year):
                if holiday[0] in holidays:
                    continue
                holidays.append(holiday[0])
        
        cal_holidays[state_id] = holidays
    
    tmp = df.drop_duplicates(subset=["date", "state_id"])
    days_from_prev_holiday = []
    for i, row in tmp.iterrows():
        d = row["date"].date()
        holidays = cal_holidays[row["state_id"]]
        for h in range(1, len(holidays)):
            if holidays[h - 1] == d:
                days_from_prev_holiday.append(int((d - holidays[h - 2]).days))
                break
            elif holidays[h - 1] < d <= holidays[h]:
                days_from_prev_holiday.append(int((d - holidays[h - 1]).days))
                break
            else:
                pass

    assert len(tmp) == len(days_from_prev_holiday), print(len(tmp), len(days_from_prev_holiday))
    tmp["days_from_prev_holiday"] = days_from_prev_holiday
    for_merge = tmp[["date", "state_id", "days_from_prev_holiday"]]
    df = pd.merge(df, for_merge, on=["date", "state_id"], how="left")
    
    return df

In [8]:
def near_day_is_holiday(df):
    def check_next_day_is_holiday(row):
        state_id = row['state_id']
        if state_id == "CA":
            cal = California()
        elif state_id == "TX":
            cal = Texas()
        elif state_id == "WI":
            cal = Wisconsin()
        else:
            raise RuntimeError
                
        d = row["date"]
        next_day = d + datetime.timedelta(days=1)
        return 1 if cal.is_holiday(next_day) else 0
    
    def check_prev_day_is_holiday(row):
        state_id = row['state_id']
        if state_id == "CA":
            cal = California()
        elif state_id == "TX":
            cal = Texas()
        elif state_id == "WI":
            cal = Wisconsin()
        else:
            raise RuntimeError
                
        d = row["date"]
        prev_day = d - datetime.timedelta(days=1)
        return 1 if cal.is_holiday(prev_day) else 0
    
    tmp = df.drop_duplicates(subset=["date", "state_id"])
    tmp['next_day_is_holiday'] = tmp.apply(check_next_day_is_holiday, axis=1)
    tmp['prev_day_is_holiday'] = tmp.apply(check_prev_day_is_holiday, axis=1)
    
    for_merge = tmp[["date", "state_id", "next_day_is_holiday", "prev_day_is_holiday"]]
    df = pd.merge(df, for_merge, on=["date", "state_id"], how="left")
    
    return df

In [9]:
def monday_or_friday(df):
    tmp = df.drop_duplicates(subset=["date", "wday"])
    tmp['monday_or_friday'] = tmp["wday"].apply(lambda x: 1 if x == 3 or x == 7 else 0)
    
    for_merge = tmp[["date", "monday_or_friday"]]
    df = pd.merge(df, for_merge, on=["date"], how="left")
    
    return df

In [30]:
train_df = pd.read_csv('./sales_train_evaluation.csv')
calendar_df = pd.read_csv("../data/calendar.csv")

In [31]:
train_df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,d_11,d_12,d_13,d_14,d_15,d_16,d_17,d_18,d_19,d_20,d_21,d_22,d_23,d_24,d_25,d_26,d_27,d_28,d_29,d_30,d_31,d_32,d_33,d_34,d_35,d_36,d_37,d_38,d_39,d_40,d_41,d_42,d_43,d_44,...,d_1892,d_1893,d_1894,d_1895,d_1896,d_1897,d_1898,d_1899,d_1900,d_1901,d_1902,d_1903,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,d_1929,d_1930,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,4,2,3,0,1,2,0,0,0,1,1,3,0,1,1,1,3,0,1,1,0,0,0,2,0,3,5,0,0,1,1,0,2,1,2,2,1,0,2,4,0,0,0,0,3,3,0,1
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,2,1,1,0,0,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0,1,2,2,1,2,1,1,1,0,1,1,1,0,0,1,1,0,2,1,0,0,0,0,2,1,3,0,0,1,0,1,0,2,0,0,0,2,3,0,1
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,2,0,1,0,...,2,1,3,1,0,2,5,4,2,0,3,0,1,0,5,4,1,0,1,3,7,2,0,0,1,2,4,1,6,4,0,0,0,2,2,4,2,1,1,1,1,1,0,4,0,1,3,0,2,6
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,4,0,1,0,1,0,1,1,2,0,1,1,2,1,1,0,1,1,2,2,2,4,1,0,2,3,1,0,3,2,3,1,1,3,2,3,2,2,2,2,0,0,0,2,1,0,0,2,1,0


In [32]:
calendar_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [33]:
TARGET = 'sales' 
index_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
grid_df = pd.melt(train_df, 
                  id_vars=index_columns, 
                  var_name='d', 
                  value_name=TARGET)
del train_df

# Let's check our memory usage
print("{:>20}: {:>8}".format('Original grid_df', sizeof_fmt(grid_df.memory_usage(index=True).sum())))

# We can free some memory 
# by converting "strings" to categorical
# it will not affect merging and 
# we will not lose any valuable data
for col in index_columns:
    grid_df[col] = grid_df[col].astype('category')

# Let's check again memory usage
print("{:>20}: {:>8}".format('Reduced grid_df',sizeof_fmt(grid_df.memory_usage(index=True).sum())))

    Original grid_df:   3.5GiB
     Reduced grid_df:   1.3GiB


In [36]:
aaa = pd.read_pickle("../20200602_追加データの確認/grid_part_1_update.pkl")

In [37]:
aaa

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release
0,HOBBIES_1_008_CA_1_evaluation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,1,12.0,0
1,HOBBIES_1_009_CA_1_evaluation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,1,2.0,0
2,HOBBIES_1_010_CA_1_evaluation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
3,HOBBIES_1_012_CA_1_evaluation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,1,0.0,0
4,HOBBIES_1_015_CA_1_evaluation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,1,4.0,0
...,...,...,...,...,...,...,...,...,...
47735392,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,1969,,0
47735393,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,1969,,0
47735394,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,1969,,0
47735395,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1969,,230


In [39]:
grid_df_2 = aaa[["id", "state_id", "d"]].copy()

In [40]:
grid_df_2

Unnamed: 0,id,state_id,d
0,HOBBIES_1_008_CA_1_evaluation,CA,1
1,HOBBIES_1_009_CA_1_evaluation,CA,1
2,HOBBIES_1_010_CA_1_evaluation,CA,1
3,HOBBIES_1_012_CA_1_evaluation,CA,1
4,HOBBIES_1_015_CA_1_evaluation,CA,1
...,...,...,...
47735392,FOODS_3_823_WI_3_evaluation,WI,1969
47735393,FOODS_3_824_WI_3_evaluation,WI,1969
47735394,FOODS_3_825_WI_3_evaluation,WI,1969
47735395,FOODS_3_826_WI_3_evaluation,WI,1969


In [44]:
bbb = grid_df_2["d"].values

In [45]:
result = []

for b in bbb:
    result.append("d_"+str(b))

In [46]:
grid_df_2["d"] = result

In [47]:
grid_df_2

Unnamed: 0,id,state_id,d
0,HOBBIES_1_008_CA_1_evaluation,CA,d_1
1,HOBBIES_1_009_CA_1_evaluation,CA,d_1
2,HOBBIES_1_010_CA_1_evaluation,CA,d_1
3,HOBBIES_1_012_CA_1_evaluation,CA,d_1
4,HOBBIES_1_015_CA_1_evaluation,CA,d_1
...,...,...,...
47735392,FOODS_3_823_WI_3_evaluation,WI,d_1969
47735393,FOODS_3_824_WI_3_evaluation,WI,d_1969
47735394,FOODS_3_825_WI_3_evaluation,WI,d_1969
47735395,FOODS_3_826_WI_3_evaluation,WI,d_1969


In [49]:
grid_df = grid_df[["id", "state_id", "d"]]

In [50]:
grid_df = pd.concat([grid_df, grid_df_2])

In [53]:
grid_df = grid_df.drop_duplicates(["id", "state_id", "d"])

In [54]:
grid_df.shape

(60034810, 3)

In [55]:
30490 * 1969

60034810

In [56]:
grid_df = pd.merge(grid_df[["id", "state_id", "d"]], calendar_df[["date", "wday", "month", "year", "d"]], how="left", on="d")
grid_df['date'] = pd.to_datetime(grid_df['date'], format='%Y-%m-%d')
grid_df = reduce_mem_usage(grid_df)
del calendar_df

Mem. usage decreased to 1776.35 Mb (39.2% reduction)


In [57]:
grid_df.tail()

Unnamed: 0,id,state_id,d,date,wday,month,year
60034805,FOODS_3_823_WI_3_evaluation,WI,d_1969,2016-06-19,2,6,2016
60034806,FOODS_3_824_WI_3_evaluation,WI,d_1969,2016-06-19,2,6,2016
60034807,FOODS_3_825_WI_3_evaluation,WI,d_1969,2016-06-19,2,6,2016
60034808,FOODS_3_826_WI_3_evaluation,WI,d_1969,2016-06-19,2,6,2016
60034809,FOODS_3_827_WI_3_evaluation,WI,d_1969,2016-06-19,2,6,2016


In [58]:
grid_df = make_holiday(grid_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [59]:
%%time

grid_df = days_to_next_holiday(grid_df)
grid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 26.5 s, sys: 13.8 s, total: 40.3 s
Wall time: 19.3 s


In [60]:
%%time

grid_df = days_from_prev_holiday(grid_df)
grid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 32.1 s, sys: 19.5 s, total: 51.6 s
Wall time: 21.4 s


In [61]:
California().holidays(2011)

[(datetime.date(2010, 12, 31), 'New year (Observed)'),
 (datetime.date(2011, 1, 1), 'New year'),
 (datetime.date(2011, 1, 17), 'Birthday of Martin Luther King, Jr.'),
 (datetime.date(2011, 2, 21), "Washington's Birthday"),
 (datetime.date(2011, 3, 31), 'Cesar Chavez Day'),
 (datetime.date(2011, 5, 30), 'Memorial Day'),
 (datetime.date(2011, 7, 4), 'Independence Day'),
 (datetime.date(2011, 9, 5), 'Labor Day'),
 (datetime.date(2011, 11, 11), 'Veterans Day'),
 (datetime.date(2011, 11, 24), 'Thanksgiving Day'),
 (datetime.date(2011, 11, 25), 'Thanksgiving Friday'),
 (datetime.date(2011, 12, 25), 'Christmas Day'),
 (datetime.date(2011, 12, 26), 'Christmas Day (Observed)')]

In [62]:
print(grid_df['days_to_next_holiday'].unique())
print(sorted(grid_df['days_to_next_holiday'].value_counts().items()))

[ 23 121  22 120  21 119  20 118  19 117  18 116  17 115  16 114  15 113
  14 112  13 111  12 110  11 109  10 108   9 107   8 106   7 105   6 104
   5 103   4 102   3 101   2 100   1  99  38  98  37  97  36  96  35  95
  34  94  33  93  32  92  31  91  30  90  29  50  89  28  49  88  27  48
  87  26  47  86  25  46  85  24  45  84  44  83  43  82  42  81  41  80
  40  79  39  78  77  76  75  74  73  72  71  70  69  68  67  66  65  64
  63  62  61  60  59  58  57  56  55  54  53  52  51 133 132 131 130 129
 128 127 126 125 124 123 122]
[(1, 2225770), (2, 1679999), (3, 1670852), (4, 1661705), (5, 1652558), (6, 1643411), (7, 1539745), (8, 1481814), (9, 1481814), (10, 1463520), (11, 1433030), (12, 1405589), (13, 1405589), (14, 1365952), (15, 1329364), (16, 1271433), (17, 1231796), (18, 1161669), (19, 1122032), (20, 1100689), (21, 1061052), (22, 1061052), (23, 1051905), (24, 1021415), (25, 1012268), (26, 993974), (27, 963484), (28, 923847), (29, 817132), (30, 807985), (31, 795789), (32, 786

In [63]:
grid_df[grid_df["days_to_next_holiday"] == 133]

Unnamed: 0,id,state_id,d,date,wday,month,year,is_holiday,is_workingday,days_to_next_holiday,days_from_prev_holiday
10753823,HOBBIES_1_001_WI_1_evaluation,WI,d_353,2012-01-16,3,1,2012,1,0,133,14
10753824,HOBBIES_1_002_WI_1_evaluation,WI,d_353,2012-01-16,3,1,2012,1,0,133,14
10753825,HOBBIES_1_003_WI_1_evaluation,WI,d_353,2012-01-16,3,1,2012,1,0,133,14
10753826,HOBBIES_1_004_WI_1_evaluation,WI,d_353,2012-01-16,3,1,2012,1,0,133,14
10753827,HOBBIES_1_005_WI_1_evaluation,WI,d_353,2012-01-16,3,1,2012,1,0,133,14
...,...,...,...,...,...,...,...,...,...,...,...
55369835,FOODS_3_823_WI_3_evaluation,WI,d_1816,2016-01-18,3,1,2016,1,0,133,17
55369836,FOODS_3_824_WI_3_evaluation,WI,d_1816,2016-01-18,3,1,2016,1,0,133,17
55369837,FOODS_3_825_WI_3_evaluation,WI,d_1816,2016-01-18,3,1,2016,1,0,133,17
55369838,FOODS_3_826_WI_3_evaluation,WI,d_1816,2016-01-18,3,1,2016,1,0,133,17


In [64]:
Wisconsin().holidays(2012)

[(datetime.date(2012, 1, 1), 'New year'),
 (datetime.date(2012, 1, 2), 'New year (Observed)'),
 (datetime.date(2012, 1, 16), 'Birthday of Martin Luther King, Jr.'),
 (datetime.date(2012, 5, 28), 'Memorial Day'),
 (datetime.date(2012, 7, 4), 'Independence Day'),
 (datetime.date(2012, 9, 3), 'Labor Day'),
 (datetime.date(2012, 11, 11), 'Veterans Day'),
 (datetime.date(2012, 11, 12), 'Veterans Day (Observed)'),
 (datetime.date(2012, 11, 22), 'Thanksgiving Day'),
 (datetime.date(2012, 12, 24), 'Christmas Eve'),
 (datetime.date(2012, 12, 25), 'Christmas Day'),
 (datetime.date(2012, 12, 31), 'New Years Eve')]

In [65]:
%%time

grid_df = near_day_is_holiday(grid_df)
grid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 26.6 s, sys: 14.7 s, total: 41.3 s
Wall time: 25.1 s


In [66]:
grid_df = monday_or_friday(grid_df)
grid_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,state_id,d,date,wday,month,year,is_holiday,is_workingday,days_to_next_holiday,days_from_prev_holiday,next_day_is_holiday,prev_day_is_holiday,monday_or_friday
0,HOBBIES_1_001_CA_1_evaluation,CA,d_1,2011-01-29,1,1,2011,0,0,23,12,0,0,0
1,HOBBIES_1_002_CA_1_evaluation,CA,d_1,2011-01-29,1,1,2011,0,0,23,12,0,0,0
2,HOBBIES_1_003_CA_1_evaluation,CA,d_1,2011-01-29,1,1,2011,0,0,23,12,0,0,0
3,HOBBIES_1_004_CA_1_evaluation,CA,d_1,2011-01-29,1,1,2011,0,0,23,12,0,0,0
4,HOBBIES_1_005_CA_1_evaluation,CA,d_1,2011-01-29,1,1,2011,0,0,23,12,0,0,0


In [67]:
%%time

feature_columns = ["id", "d",
                   "is_holiday",
                   "is_workingday",
                   "days_to_next_holiday",
                   "days_from_prev_holiday",
                   "next_day_is_holiday",
                   "prev_day_is_holiday",
                   "monday_or_friday"]
grid_df = grid_df[feature_columns]

grid_df['d'] = grid_df['d'].apply(lambda x: x[2:]).astype(np.int16)

grid_df["id"] = grid_df["id"].astype('category')
grid_df["is_holiday"] = grid_df["is_holiday"].astype('category')
grid_df["is_workingday"] = grid_df["is_workingday"].astype('category')
grid_df["next_day_is_holiday"] = grid_df["next_day_is_holiday"].astype('category')
grid_df["prev_day_is_holiday"] = grid_df["prev_day_is_holiday"].astype('category')
grid_df["monday_or_friday"] = grid_df["monday_or_friday"].astype('category')

grid_df["days_to_next_holiday"] = grid_df["days_to_next_holiday"].astype(np.int8) 
grid_df["days_from_prev_holiday"] = grid_df["days_from_prev_holiday"].astype(np.int8) 

grid_df.to_pickle('holiday_features.pkl')

CPU times: user 1min 6s, sys: 58.5 s, total: 2min 4s
Wall time: 56.8 s
