In [1]:
import os
import gc
import re
import json
import pickle
import datetime
from tqdm import tqdm
from typing import Union

import numpy as np
import pandas as pd
pd.options.display.max_columns = None

from typing import Union

import seaborn
import matplotlib.pyplot as plt
plt.style.use("seaborn-darkgrid")

from scipy.stats import linregress

from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

# custom funcs
from script import WRMSSEEvaluator
from script import cache_result
from script import reduce_mem_usage
from script import load_pickle, dump_pickle
from script import get_groups

In [4]:
# Read the pickled training frame and immediately narrow it to the
# identifier, day-of-week, and sales/price columns used by the cells below.
df = pd.read_pickle('features/melted_and_merged_train.pkl')[
    [
        'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
        'dayofweek', 'sales', 'sell_price',
    ]
]

In [5]:
# Sanity check on the load: print the (rows, columns) shape, then show the
# leading rows as the cell's rich output.
print(df.shape)
df.head(n=5)

(35093990, 9)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,dayofweek,sales,sell_price
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,4,0,
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,4,0,3.970703
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,4,0,
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,4,1,4.640625
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,4,1,3.080078


In [6]:
# Per-row revenue: units sold times the unit price.
# Rows whose sell_price is missing come out as NaN, because NaN propagates
# through the element-wise product.
df['total_sales'] = df['sales'].mul(df['sell_price'])

In [7]:
# Preview the frame with the new total_sales column appended.
df.head(n=5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,dayofweek,sales,sell_price,total_sales
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,4,0,,
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,4,0,3.970703,0.0
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,4,0,,
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,4,1,4.640625,4.640625
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,4,1,3.080078,3.080078


In [13]:
group = 'store_id'
# Distinct group keys; no longer needed for the computation below, but
# retained in case later cells reference `store_ids`.
store_ids = df[group].unique()

# Revenue of each series (`id`) summed over all of its rows and broadcast
# back to every row of that series. 'sum' skips NaN revenue, so rows with
# a missing price still receive their series' total.
total_sales_series = df.groupby('id')['total_sales'].transform('sum')

# Revenue of the whole group (store) the row belongs to, broadcast per row.
# This replaces the per-store Python loop, which rebuilt a boolean mask over
# the full frame and divided the *entire* series by each store's scalar
# total on every iteration, only to discard the non-matching rows on
# assignment.
group_total_sales = df.groupby(group)['total_sales'].transform('sum')

# Share of the store's revenue contributed by the row's series.
# Cast to float64 to match the dtype of the column the loop version created
# (it was initialised with the Python float 0.0).
# NOTE(review): rows with a missing `store_id` would now get NaN instead of
# the 0.0 placeholder the loop left behind; the visible data shows
# integer-coded store ids, so this is not expected to occur - confirm.
df['total_sales_ratio'] = (total_sales_series / group_total_sales).astype('float64')

In [14]:
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,dayofweek,sales,sell_price,total_sales,total_sales_ratio
0,HOBBIES_1_001_CA_1_evaluation,1437,3,1,0,0,4,0,,,0.000352
1,HOBBIES_1_002_CA_1_evaluation,1438,3,1,0,0,4,0,3.970703,0.0,8.6e-05
2,HOBBIES_1_003_CA_1_evaluation,1439,3,1,0,0,4,0,,,6.2e-05
3,HOBBIES_1_004_CA_1_evaluation,1440,3,1,0,0,4,1,4.640625,4.640625,0.000732
4,HOBBIES_1_005_CA_1_evaluation,1441,3,1,0,0,4,1,3.080078,3.080078,0.000234
