In [66]:
import os
import gc
import re
import json
import pickle
import datetime
from tqdm import tqdm
from typing import Union

import numpy as np
import pandas as pd
pd.options.display.max_columns = None

from workalendar.usa.texas import Texas
from workalendar.usa.california import California
from workalendar.usa.wisconsin import Wisconsin

from typing import Union

import seaborn
import matplotlib.pyplot as plt
plt.style.use("seaborn-darkgrid")

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# custom funcs
from script import WRMSSEEvaluator
from script import cache_result
from script import reduce_mem_usage
from script import load_pickle, dump_pickle
from script import get_groups

In [59]:
from tqdm import tqdm

In [None]:
# Load the pre-built training feature table from a local pickle file.
# NOTE(review): pickle executes arbitrary code on load; only read files this
# project produced itself.
df = pd.read_pickle('features/all_train_data.pkl')
# Keep a separate handle on the 'sales' column of the loaded frame.
sales = df['sales']

In [None]:
# Columns to remove from `df`, restricted to the ones actually present in the
# frame so the later drop does not fail on a missing label. The result keeps
# the order in which the columns appear in `df`.
drop_cols = [
    column
    for column in df.columns
    if column in {'id', 'd', 'sales', 'date', 'wm_yr_wk'}
]

In [None]:
%%time
df.drop(drop_cols, axis=1, inplace=True)

In [None]:
# Sanity check: print the (rows, columns) shape of `df`, then display its
# first rows as the cell output.
print(df.shape)
df.head()

### 分散の値を用いて特徴量を選択

In [None]:
from sklearn.feature_selection import VarianceThreshold

In [None]:
# Restrict the feature-selection steps to floating-point columns.
numeric_data = df.select_dtypes(include=['float16', 'float32', 'float64'])
# Column labels of the float columns as a NumPy array.
numeric_features = numeric_data.columns.to_numpy()

In [None]:
# Preview the first rows of the float-only view of the feature table.
numeric_data.head()

In [None]:
# Collect float features whose sample variance is at or below the threshold.
# With a threshold of 0.0 this selects constant columns. A NaN variance
# compares False and is therefore not selected.
variance_thresh = 0.0

drop_cols = [
    column
    for column in tqdm(numeric_features)
    if df[column].var() <= variance_thresh
]

In [None]:
# Load the reduced calendar table from a local pickle file.
calendar = pd.read_pickle('../data/reduced/calendar.pkl')

# One workalendar calendar per state abbreviation.
work_cals = {'CA': California(), 'TX': Texas(), 'WI': Wisconsin()}
# Original note: the holiday flags are to be rolling-summed toward the future,
# which is why a date-descending copy of the frame is produced at the end.
holiday_df = pd.DataFrame({'date': pd.date_range(start='2011-01-29', end='2016-07-30')})
for state, work_cal in work_cals.items():
    # NOTE(review): the column stores int(is_working_day), i.e. 1 on working
    # days. The `nwd_` prefix and the "holiday flag" note suggest the opposite
    # polarity may have been intended -- confirm against downstream usage.
    holiday_df[f'nwd_{state}'] = list(
        map(int, map(work_cal.is_working_day, holiday_df['date']))
    )
reversed_holiday_df = holiday_df.sort_values('date', ascending=False)

In [None]:
# Show the first five entries of `drop_cols`.
# NOTE(review): `drop_cols` is reassigned in several cells; in file order this
# displays the variance-filter result -- confirm that is the intended list.
drop_cols[:5]

### ラベルデータとの相関の小ささを元に特徴量を削減

In [None]:
# Flag float features whose Pearson correlation with the target exceeds
# `thresh`.
# NOTE(review): the section header talks about features with a *small*
# correlation to the label, while this condition selects features with a very
# *high* one -- confirm which direction is intended before using `drop_cols`.
drop_cols = []
thresh = 0.99
counter = 0      # number of flagged features; must exist before `counter += 1`
verbose = False  # set to True to print every flagged feature

# The 'sales' column is removed from `df` earlier in the notebook, so the
# target has to come from the `sales` Series that was saved before the drop.
target_values = sales.values

for feat in tqdm(numeric_features):
    # A NaN correlation (constant column or missing values) compares False
    # below, so such features are never flagged.
    corr = np.corrcoef(target_values, df[feat].values)[0, 1]
    if corr > thresh:
        counter += 1
        drop_cols.append(feat)
        if verbose:
            print(f'{counter}: {feat} - Correlation: {corr}')

### 特徴量同士の相関の強さによって特徴量を削減

In [80]:
def get_high_corr(df, features, thresh=0.99, verbose=False):
    """Return features that are near-duplicates of an earlier kept feature.

    Features are visited in the given order. Every feature that has not been
    dropped is compared with each later feature that has not been dropped;
    when their Pearson correlation is strictly greater than ``thresh`` the
    later feature is added to the drop list.

    Pearson correlation is symmetric, so a kept feature never needs to be
    compared with features that precede it: that pair was already evaluated
    when the earlier feature was the anchor. Skipping those comparisons
    roughly halves the number of ``np.corrcoef`` calls and yields the same
    drop list as comparing every ordered pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : iterable
        Column labels to compare, in priority order (earlier ones are kept).
    thresh : float, default 0.99
        Correlation above which the later feature of a pair is dropped.
        Only positive correlation is considered; NaN never exceeds it.
    verbose : bool, default False
        Print each dropped pair together with its correlation.

    Returns
    -------
    list
        Labels of the dropped features, in the order they were dropped.
    """
    features = list(features)
    drop_cols = []
    dropped = set()  # O(1) membership mirror of `drop_cols`
    counter = 0
    for idx, feat_a in enumerate(tqdm(features)):
        if feat_a in dropped:
            # A dropped feature is never used as an anchor.
            continue
        # Extract the anchor column once instead of once per comparison.
        col_a = df[feat_a].to_numpy()
        for feat_b in features[idx + 1:]:
            if feat_b == feat_a or feat_b in dropped:
                continue
            corr = np.corrcoef(col_a, df[feat_b].to_numpy())[0, 1]
            if corr > thresh:
                counter += 1
                drop_cols.append(feat_b)
                dropped.add(feat_b)
                if verbose:
                    print(f'{counter}: {feat_a} {feat_b} - Correlation: {corr}')
    return drop_cols


In [81]:
# Run the pairwise correlation filter over all float features, printing each
# dropped pair. The captured output below shows this run was interrupted
# before it finished.
high_corr_cols = get_high_corr(df, numeric_features, verbose=True)

  4%|▍         | 8/208 [58:28<24:21:49, 438.55s/it]


KeyboardInterrupt: 