source: https://www.kaggle.com/szmnkrisz97/point-to-uncertainty-different-ranges-per-level

In [1]:
import sys
import os
import pathlib
import gc
import pandas as pd
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 500)
import numpy as np
import math
import random
import pickle
import time
import psutil
import seaborn as sns
import matplotlib.pyplot as plt

# custom import
import scipy.stats  as stats

# constant variables for helper functions

In [2]:
N_CORES = psutil.cpu_count()     # Available CPU cores
print(f"N_CORES: {N_CORES}")

N_CORES: 36


# function nicely diplaying a head of Pandas DataFrame

In [3]:
import IPython

def display(*dfs, head=True):
    for df in dfs:
        IPython.display.display(df.head() if head else df)

# function fixing random seeds

In [4]:
def seed_everything(seed=0):
    """Sets seed to make all processes deterministic     # type: int
    
    """
    random.seed(seed)
    np.random.seed(seed)

SEED = 42
seed_everything(SEED)    

# function processing df in multiprocess

In [5]:
def run_df_in_multiprocess(func, t_split):
    """Process ds in Multiprocess
    
    """
    num_cores = np.min([N_CORES,len(t_split)])
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, t_split), axis=1)
    pool.close()
    pool.join()
    return df

# other helper functions

In [6]:
def get_memory_usage():
    """メモリ使用量を確認するためのシンプルな「メモリプロファイラ」
    
    """
    return np.round(psutil.Process(os.getpid()).memory_info()[0]/2.**30, 2) 
        
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def merge_by_concat(df1, df2, merge_on):
    """
    dtypesを失わないための連結による結合
    
    """
    
    merged_gf = df1[merge_on]
    merged_gf = merged_gf.merge(df2, on=merge_on, how='left')
    new_columns = [col for col in list(merged_gf) if col not in merge_on]
    df1 = pd.concat([df1, merged_gf[new_columns]], axis=1)
    return df1


#  constant variables for data import

In [7]:
# take the data from M5_accuracy
_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic"])
_OUTPUT_UNCERTAINTY_DIR = os.path.sep.join(["data", "Point_to_uncertainty_different_ranges_per_level"])
_OUTPUT_DIR = _OUTPUT_UNCERTAINTY_DIR

_CALENDAR_CSV_FILE = "calendar.csv"
_SAMPLE_SUBMISSION_CSV_FILE = "sample_submission.csv"
_SALES_TRAIN_EVALUATION_CSV_FILE = "sales_train_evaluation.csv"
_SELL_PRICES_CSV_FILE = "sell_prices.csv"

# _ACCURACY_DATA_DIR = os.path.sep.join(["data", "M5_Three_shades_of_Dark_Darker_magic", "bk", "0.47353"])
_ACCURACY_RESULT_FILE = "submission_v5_evaluation.csv"

# import data

In [8]:
def reduce_mem_usage(df, verbose=True):
    """
    reduce the memory usage of the given dataframe.
    https://qiita.com/hiroyuki_kageyama/items/02865616811022f79754
    
    Args:
        df: Dataframe
        verbose: 
        
    Returns:
        df, whose memory usage is reduced.

    Raises:
        None
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns: #columns毎に処理
        col_type = df[col].dtypes
        if col_type in numerics: #numericsのデータ型の範囲内のときに処理を実行. データの最大最小値を元にデータ型を効率的なものに変更
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def read_csv_data(directory, file_name):
    print('Reading files...')
    df = pd.read_csv(os.path.sep.join([str(directory), _DATA_DIR, file_name]))
    df = reduce_mem_usage(df)
    print('{} has {} rows and {} columns'.format(file_name, df.shape[0], df.shape[1]))
    
    return df

# read csv data

In [9]:
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent
print(f"parent_dir: {parent_dir}")

df_sales_train_evaluation = read_csv_data(parent_dir, _SALES_TRAIN_EVALUATION_CSV_FILE)

parent_dir: /home/ec2-user/SageMaker
Reading files...
Mem. usage decreased to 96.13 Mb (78.8% reduction)
sales_train_evaluation.csv has 30490 rows and 1947 columns


In [10]:
# import accuracy result
df_accuracy_result = read_csv_data(parent_dir, _ACCURACY_RESULT_FILE)
df_accuracy_result = reduce_mem_usage(df_accuracy_result)


Reading files...
Mem. usage decreased to  3.72 Mb (72.4% reduction)
submission_v5_evaluation.csv has 60980 rows and 29 columns
Mem. usage decreased to  3.72 Mb (0.0% reduction)


In [11]:
# df_sample_submission = read_csv_data(parent_dir, _SAMPLE_SUBMISSION_CSV_FILE)

In [12]:
# print(f"df_sales_train_evaluation: {df_sales_train_evaluation.head()}")
# print(f"df_sample_submission: {df_sample_submission.head()}")
print(f"df_sales_train_evaluation: {df_sales_train_evaluation}")

print(f"df_accuracy_result: {df_accuracy_result}")

df_sales_train_evaluation:                                   id        item_id    dept_id   cat_id  \
0      HOBBIES_1_001_CA_1_evaluation  HOBBIES_1_001  HOBBIES_1  HOBBIES   
1      HOBBIES_1_002_CA_1_evaluation  HOBBIES_1_002  HOBBIES_1  HOBBIES   
2      HOBBIES_1_003_CA_1_evaluation  HOBBIES_1_003  HOBBIES_1  HOBBIES   
3      HOBBIES_1_004_CA_1_evaluation  HOBBIES_1_004  HOBBIES_1  HOBBIES   
4      HOBBIES_1_005_CA_1_evaluation  HOBBIES_1_005  HOBBIES_1  HOBBIES   
...                              ...            ...        ...      ...   
30485    FOODS_3_823_WI_3_evaluation    FOODS_3_823    FOODS_3    FOODS   
30486    FOODS_3_824_WI_3_evaluation    FOODS_3_824    FOODS_3    FOODS   
30487    FOODS_3_825_WI_3_evaluation    FOODS_3_825    FOODS_3    FOODS   
30488    FOODS_3_826_WI_3_evaluation    FOODS_3_826    FOODS_3    FOODS   
30489    FOODS_3_827_WI_3_evaluation    FOODS_3_827    FOODS_3    FOODS   

      store_id state_id  d_1  d_2  d_3  d_4  d_5  d_6  d_7  d_8  d_9  d_

# merge the dataframe

In [13]:
sub = df_accuracy_result.merge(df_sales_train_evaluation[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]], on = "id")
sub["_all_"] = "Total"
print(f"sub.shape: {sub.shape}")
print(f"sub: {sub}")

sub.shape: (30490, 35)
sub:                                     id        F1        F2        F3  \
0          FOODS_3_419_CA_1_evaluation  1.695312  1.933594  1.494141   
1      HOUSEHOLD_2_496_TX_3_evaluation  0.264160  0.355225  0.341797   
2          FOODS_2_156_TX_1_evaluation  0.378174  0.391846  0.409180   
3          FOODS_3_749_TX_3_evaluation  0.494629  0.473877  0.509766   
4      HOUSEHOLD_2_312_CA_3_evaluation  1.900391  1.676758  1.666992   
...                                ...       ...       ...       ...   
30485      FOODS_2_375_CA_3_evaluation  0.971680  0.930664  0.937988   
30486      FOODS_2_235_WI_1_evaluation  1.680664  1.568359  1.576172   
30487      FOODS_3_281_TX_1_evaluation  2.486328  2.521484  2.511719   
30488      FOODS_2_043_WI_1_evaluation  1.856445  1.766602  1.516602   
30489      FOODS_2_373_CA_1_evaluation  0.646484  0.555176  0.558105   

             F4        F5        F6        F7        F8        F9       F10  \
0      1.420898  1.517578  2

# Different ratios for different aggregation levels
The higher the aggregation level, the more confident we are in the point prediction --> lower coef, relatively smaller range of quantiles

In [14]:
qs = np.array([0.005,0.025,0.165,0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

def get_ratios(coef=0.15):
    qs2 = np.log(qs/(1-qs))*coef
#     print(f"qs2: {qs2}")
    
#     累積密度分布(cumulative distribution function)
    ratios = stats.norm.cdf(qs2)
#     print(f" ratios: { ratios}")
    
#     ratios[4] is 0.5
    ratios /= ratios[4]
#     print(f" ratios: { ratios}")
    
    ratios = pd.Series(ratios, index=qs)
#     return ratios.round(3)
    return ratios


In [15]:
# coef between 0.05 and 0.24 is used, probably suboptimal values for now: <-グリッドサーチ手作りでベストなcoefの組み合わせ出せる？
level_coef_dict = {"id": get_ratios(coef=0.3), "item_id": get_ratios(coef=0.15),
                   "dept_id": get_ratios(coef=0.08), "cat_id": get_ratios(coef=0.07),
                   "store_id": get_ratios(coef=0.08), "state_id": get_ratios(coef=0.07), "_all_": get_ratios(coef=0.05),
                   ("state_id", "item_id"): get_ratios(coef=0.19),  ("state_id", "dept_id"): get_ratios(coef=0.1),
                    ("store_id","dept_id") : get_ratios(coef=0.11), ("state_id", "cat_id"): get_ratios(coef=0.08),
                    ("store_id","cat_id"): get_ratios(coef=0.1)
                  }

print(f"level_coef_dict: {level_coef_dict}")

level_coef_dict: {'id': 0.005    0.112288
0.025    0.271738
0.165    0.626651
0.250    0.741715
0.500    1.000000
0.750    1.258285
0.835    1.373349
0.975    1.728262
0.995    1.887712
dtype: float64, 'item_id': 0.005    0.427198
0.025    0.582639
0.165    0.807833
0.250    0.869108
0.500    1.000000
0.750    1.130892
0.835    1.192167
0.975    1.417361
0.995    1.572802
dtype: float64, 'dept_id': 0.005    0.671956
0.025    0.769457
0.165    0.896789
0.250    0.929965
0.500    1.000000
0.750    1.070035
0.835    1.103211
0.975    1.230543
0.995    1.328044
dtype: float64, 'cat_id': 0.005    0.710987
0.025    0.797604
0.165    0.909631
0.250    0.938701
0.500    1.000000
0.750    1.061299
0.835    1.090369
0.975    1.202396
0.995    1.289013
dtype: float64, 'store_id': 0.005    0.671956
0.025    0.769457
0.165    0.896789
0.250    0.929965
0.500    1.000000
0.750    1.070035
0.835    1.103211
0.975    1.230543
0.995    1.328044
dtype: float64, 'state_id': 0.005    0.710987
0.025    0.7

For the the lowest level (i.e. "id") (30490 series), the smallest and biggest quantiles are 20% and 180% of the point prediction. For categories ("cat_id": 3 series), the model will be way more confident: the smallest quantile will be 71%, the biggest will be 129% of the point prediction.
    
    

In [16]:
def quantile_coefs(quantiles, level):
#     特定レベルのquantile ratioを取得
    ratios = level_coef_dict[level]

#     各probablity intervalの閾値(quantiles)に対する倍率を取得
    quantile_values = ratios.loc[quantiles].values
#     print(f"quantile_values: {quantile_values}")
#     print(f"quantile_values[:, None]: {quantile_values[:, None]}")
    
    return quantile_values

def get_group_preds(pred, level, cols):
#     levelごとにcols(各Fの値)を合計
    df = pred.groupby(level)[cols].sum()
    print(f"df.shape: {df.shape}")
        
    q = np.repeat(qs, len(df))    
    print(f"q.shape: {q.shape}")
    print(f"q at get_group_preds: {q}")
    
#     quantileの数は9
    df = pd.concat([df]*9, axis=0, sort=False)
    df.reset_index(inplace = True)
    print(f"amplified df: {df}")

#     accuracyにおける予測値の予測区間を、倍率を掛けることにより計算。[:, None]で行持ちを列持ちに変換
    df[cols] *= quantile_coefs(q, level)[:, None]

    if level != "id":
#         uncertainty 用の提出ファイルに合わせるためのlabelの変更
        df["id"] = [f"{lev}_X_{q:.3f}_evaluation" for lev, q in zip(df[level].values, q)]
    else:
        df["id"] = [f"{lev.replace('_evaluation', '')}_{q:.3f}_evaluation" for lev, q in zip(df[level].values, q)]
    
    df = df[["id"]+list(cols)]
    return df

def get_couple_group_preds(pred, level1, level2):
    df = pred.groupby([level1, level2])[cols].sum()
    print(f"df.shape: {df.shape}")

    q = np.repeat(qs, len(df))
    df = pd.concat([df]*9, axis=0, sort=False)
    df.reset_index(inplace = True)
        
    df[cols] *= quantile_coefs(q, (level1, level2))[:, None]
    df["id"] = [f"{lev1}_{lev2}_{q:.3f}_evaluation" for lev1,lev2, q in 
                zip(df[level1].values,df[level2].values, q)]
    df = df[["id"]+list(cols)]
    return df

In [17]:
levels = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "_all_"]
couples = [("state_id", "item_id"),  ("state_id", "dept_id"),("store_id","dept_id"), ("state_id", "cat_id"),("store_id","cat_id")]
cols = [f"F{i}" for i in range(1, 29)]

df = []
for level in levels :
    df.append(get_group_preds(sub, level, cols))
    
for level1,level2 in couples:
    df.append(get_couple_group_preds(sub, level1, level2))

print(f"appended df: {df}")

# 縦方向に連結
df = pd.concat(df, axis=0, sort=False)

# inplace=Trueで元のオブジェクトを直接変更
df.reset_index(drop=True, inplace=True)

print(f"df: {df}")

df.shape: (30490, 28)
q.shape: (274410,)
q at get_group_preds: [0.005 0.005 0.005 ... 0.995 0.995 0.995]
amplified df:                                      id        F1        F2        F3  \
0           FOODS_1_001_CA_1_evaluation  0.804688  0.761230  0.768555   
1           FOODS_1_001_CA_2_evaluation  0.887695  0.960938  0.868652   
2           FOODS_1_001_CA_3_evaluation  0.771973  0.749512  0.791992   
3           FOODS_1_001_CA_4_evaluation  0.294434  0.288330  0.298096   
4           FOODS_1_001_TX_1_evaluation  0.453613  0.530273  0.513184   
...                                 ...       ...       ...       ...   
274405  HOUSEHOLD_2_516_TX_2_evaluation  0.225464  0.210327  0.195923   
274406  HOUSEHOLD_2_516_TX_3_evaluation  0.163330  0.155151  0.148804   
274407  HOUSEHOLD_2_516_WI_1_evaluation  0.089722  0.093628  0.087036   
274408  HOUSEHOLD_2_516_WI_2_evaluation  0.066101  0.066895  0.063599   
274409  HOUSEHOLD_2_516_WI_3_evaluation  0.092529  0.090027  0.085632   

    

df.shape: (9147, 28)
df.shape: (21, 28)
df.shape: (70, 28)
df.shape: (9, 28)
df.shape: (30, 28)
appended df: [                                           id        F1        F2        F3  \
0           FOODS_1_001_CA_1_0.005_evaluation  0.090357  0.085477  0.086300   
1           FOODS_1_001_CA_2_0.005_evaluation  0.099678  0.107902  0.097539   
2           FOODS_1_001_CA_3_0.005_evaluation  0.086683  0.084161  0.088931   
3           FOODS_1_001_CA_4_0.005_evaluation  0.033061  0.032376  0.033473   
4           FOODS_1_001_TX_1_0.005_evaluation  0.050935  0.059543  0.057624   
...                                       ...       ...       ...       ...   
274405  HOUSEHOLD_2_516_TX_2_0.995_evaluation  0.425611  0.397037  0.369846   
274406  HOUSEHOLD_2_516_TX_3_0.995_evaluation  0.308320  0.292881  0.280899   
274407  HOUSEHOLD_2_516_WI_1_0.995_evaluation  0.169369  0.176743  0.164299   
274408  HOUSEHOLD_2_516_WI_2_0.995_evaluation  0.124780  0.126278  0.120056   
274409  HOUSEHOLD_2_5

# substitute the calculation result to the submission format

In [18]:
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent

# Reading competition sample submission and merging our predictions
submission_df = pd.read_csv(os.path.sep.join([str(parent_dir), _OUTPUT_UNCERTAINTY_DIR, _SAMPLE_SUBMISSION_CSV_FILE]))
submission_df = reduce_mem_usage(submission_df)

submission_ids_df = submission_df[["id"]]

# submission_df = pd.read_csv(ORIGINAL+_SAMPLE_SUBMISSION_CSV_FILE)[['id']]
my_submission_df = submission_ids_df.merge(df, on=['id'], how='left').fillna(0)
print(f"my_submission_df:{my_submission_df}")

Mem. usage decreased to 26.47 Mb (84.5% reduction)
my_submission_df:                                       id        F1        F2        F3  \
0                Total_X_0.005_validation  0.000000  0.000000  0.000000   
1                   CA_X_0.005_validation  0.000000  0.000000  0.000000   
2                   TX_X_0.005_validation  0.000000  0.000000  0.000000   
3                   WI_X_0.005_validation  0.000000  0.000000  0.000000   
4                 CA_1_X_0.005_validation  0.000000  0.000000  0.000000   
...                                   ...       ...       ...       ...   
771115  FOODS_3_823_WI_3_0.995_evaluation  0.912517  0.986256  0.997316   
771116  FOODS_3_824_WI_3_0.995_evaluation  0.410633  0.403950  0.402337   
771117  FOODS_3_825_WI_3_0.995_evaluation  1.342967  1.086725  1.196411   
771118  FOODS_3_826_WI_3_0.995_evaluation  2.066528  2.389135  2.027815   
771119  FOODS_3_827_WI_3_0.995_evaluation  2.612195  2.545830  2.009381   

              F4        F5     

# export train/test result as csv

In [19]:
parent_dir = pathlib.Path(os.path.abspath(os.curdir)).parent

VER = 5

_EXPORT_FILE_NAME = 'submission_v'+str(VER)+'_evaluation.csv'
print("csv data export start")
my_submission_df.to_csv(os.path.sep.join([str(parent_dir), _OUTPUT_DIR, _EXPORT_FILE_NAME]), index=False)
print('csv data export finished. Size:', my_submission_df.shape)

csv data export start
csv data export finished. Size: (771120, 29)
