In [3]:
import os
import gc
import re
import json
import pickle
import datetime
from tqdm import tqdm
from typing import Union

import numpy as np
import pandas as pd
# Let pandas display up to 50 columns when rendering a DataFrame.
pd.options.display.max_columns = 50

from typing import Union

import seaborn
import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and later
# releases dropped the old names, so prefer the new name whenever it is available
# and fall back to the legacy name on older installations.
plt.style.use(
    "seaborn-v0_8-darkgrid"
    if "seaborn-v0_8-darkgrid" in plt.style.available
    else "seaborn-darkgrid"
)

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

# custom funcs
from script import WRMSSEEvaluator
from script import cache_result
from script import reduce_mem_usage
from script import load_pickle, dump_pickle

In [4]:
from v02000.v02004_baseline import *

In [8]:
def ordered_d_cols(df_cols, is_reverse=False):
    """Return the labels in ``df_cols`` sorted by the first integer they contain.

    Args:
        df_cols: Iterable of string labels, each containing at least one run
            of digits (e.g. ``"d_12"``).
        is_reverse: When True, sort from the largest number to the smallest.

    Returns:
        A new list with the labels ordered numerically rather than lexically.

    Raises:
        AttributeError: If a label contains no digits at all.
    """
    def leading_number(label):
        # The first run of digits decides the position, e.g. "d_12" -> 12.
        digits = re.search(r"\d+", label)
        return int(digits.group(0))

    ordered = list(df_cols)
    ordered.sort(key=leading_number, reverse=is_reverse)
    return ordered

def _est_scale(window):
    """Mean squared day-to-day change of the sales in one rolling window.

    Args:
        window: 1-D float array of sales for one id; may contain NaN.

    Returns:
        1 when the window holds no observed values, NaN when fewer than two
        values remain after trimming (the caller fills NaN with 0), otherwise
        the mean of the squared successive differences.
    """
    observed = window[~np.isnan(window)]
    if observed.shape[0] == 0:
        return 1
    # Find the first non-zero sale on the NaN-free values. On the raw window
    # `NaN != 0` is True, so a leading NaN would be taken for the first sale
    # and the leading zeros that follow it would never be trimmed.
    active = observed[np.argmax(observed != 0):]
    if active.shape[0] < 2:
        # A single observation has no successive difference; return NaN
        # explicitly instead of letting np.mean warn on an empty array.
        return np.nan
    return np.mean((active[1:] - active[:-1]) ** 2)


def _min_max_scale(values):
    """Rescale a Series to [0, 1]; a constant Series maps to all zeros."""
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Avoid 0/0 when every value is identical.
        return values - lowest
    return (values - lowest) / span


@cache_result(filename='sample_weight', use_cache=True)
def calc_similar_weight():
    """Build a per-(id, d) sample weight from sales value and sales volatility.

    Steps:
        1. weight: for every id, the 28-day rolling sum of ``sales * sell_price``
           lagged by 28 days, divided by the same quantity summed over all ids
           for that day.
        2. scale: over a 90-day rolling window (at least 28 observations), the
           mean squared day-to-day change in sales.
        3. sample_weight = weight / (sqrt(scale) + 1), min-max scaled to [0, 1].

    Returns:
        DataFrame with columns ``id``, ``d`` and ``sample_weight``.

    NOTE(review): the ``cache_result`` decorator appears to return a previously
    stored pickle when one exists; that cache presumably has to be regenerated
    before changes to this function take effect -- confirm against
    ``cache_result``.
    """
    df = pd.read_pickle('features/melted_and_merged_train.pkl')
    # Prepare raw data. Copy the column subset so that adding a column does not
    # write into a slice of the loaded frame (SettingWithCopyWarning).
    df = df[['id', 'd', 'sales', 'sell_price']].copy()
    df['sales_value'] = df['sales'] * df['sell_price']
    df = df.drop(columns=['sell_price'])
    # Calculate the sales value ratio.
    weight_df = df.pivot(values='sales_value', index='id', columns='d')
    weight_df = weight_df[ordered_d_cols(weight_df.columns)]

    # Lag by 28 days before summing so each day only uses values from at least 28 days earlier.
    weight_df = weight_df.shift(28, axis=1).rolling(28, axis=1).sum()
    # Share of each id in the total sales value of that day.
    weight_df = weight_df / weight_df.sum(axis=0)

    weight_df = weight_df.reset_index()
    weight_df = pd.melt(weight_df, id_vars='id', var_name='d', value_name='weight').fillna(0)
    # Calculate the scale, i.e. the variability of past sales.
    scale_df = df.pivot(values='sales', index='id', columns='d')
    scale_df = scale_df[ordered_d_cols(scale_df.columns, is_reverse=False)]

    scale_df = scale_df.rolling(90, min_periods=28, axis=1).apply(_est_scale, raw=True)
    scale_df = scale_df.reset_index()
    scale_df = pd.melt(scale_df, id_vars='id', var_name='d', value_name='scale').fillna(0)
    # Merge weight_df and scale_df.
    weight_df = weight_df.merge(scale_df, how='left', on=['id', 'd'])
    # The +1 keeps the denominator positive when the scale is 0.
    weight_df['sample_weight'] = weight_df['weight'] / (weight_df['scale'].map(np.sqrt) + 1)
    # Min-max scale the sample weight. The denominator must be (max - min) as a
    # whole; without parentheses the expression was (x - min) / max - min.
    weight_df['sample_weight'] = _min_max_scale(weight_df['sample_weight'])

    return weight_df[['id', 'd', 'sample_weight']]

In [9]:
# Build the sample-weight table (columns: id, d, sample_weight).
weight_df = calc_similar_weight()

Load Cached data, features/sample_weight.pkl


In [10]:
# Preview the first rows of the weight table.
weight_df.head()

Unnamed: 0,id,d,sample_weight
0,FOODS_1_001_CA_1_validation,d_1,0.0
1,FOODS_1_001_CA_2_validation,d_1,0.0
2,FOODS_1_001_CA_3_validation,d_1,0.0
3,FOODS_1_001_CA_4_validation,d_1,0.0
4,FOODS_1_001_TX_1_validation,d_1,0.0
