In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings(action = 'ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv


In [2]:
df_item_categories = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
df_items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
df_sales = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
df_shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
df_test = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/test.csv')
df_submission = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv')

## Common method

In [3]:
def downcast(df: pd.DataFrame, verbose = True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast = 'integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast = 'float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage decreased {:5.2f}Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
        
    return df

## Preprocessing

### Downcase all dataframe

In [4]:
df_list = [df_item_categories, df_items, df_sales, df_shops, df_test]
for df in df_list:
    df = downcast(df)

Memory usage decreased  0.00Mb to  0.00 Mb (39.9% reduction)
Memory usage decreased  0.51Mb to  0.23 Mb (54.2% reduction)
Memory usage decreased 134.39Mb to 61.60 Mb (54.2% reduction)
Memory usage decreased  0.00Mb to  0.00 Mb (38.6% reduction)
Memory usage decreased  4.90Mb to  1.43 Mb (70.8% reduction)


### Remove outlier

In [5]:
df_sales = df_sales.query('item_price > 0 and item_price < 50000')
df_sales = df_sales.query('item_cnt_day > 0 and item_cnt_day < 1000')

In [6]:
df_shops['city'] = df_shops['shop_name'].apply(lambda x: x.split()[0])
df_shops.city.unique()

array(['!Якутск', 'Адыгея', 'Балашиха', 'Волжский', 'Вологда', 'Воронеж',
       'Выездная', 'Жуковский', 'Интернет-магазин', 'Казань', 'Калуга',
       'Коломна', 'Красноярск', 'Курск', 'Москва', 'Мытищи', 'Н.Новгород',
       'Новосибирск', 'Омск', 'РостовНаДону', 'СПб', 'Самара', 'Сергиев',
       'Сургут', 'Томск', 'Тюмень', 'Уфа', 'Химки', 'Цифровой', 'Чехов',
       'Якутск', 'Ярославль'], dtype=object)

In [7]:
df_shops.loc[df_shops['city'] == '!Якутск', 'city'] = 'Якутск'

In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_shops['city'] = label_encoder.fit_transform(df_shops['city'])
df_shops = df_shops.drop('shop_name', axis = 1)
df_shops.head()

Unnamed: 0,shop_id,city
0,0,29
1,1,29
2,2,0
3,3,1
4,4,2


### Create items features

In [9]:
df_items = df_items.drop(['item_name'], axis = 1)
df_items['first_sale_month'] = df_sales.groupby('item_id')['date_block_num'].min()
df_items.head()

Unnamed: 0,item_id,item_category_id,first_sale_month
0,0,40,20.0
1,1,76,15.0
2,2,40,19.0
3,3,40,18.0
4,4,40,20.0


In [10]:
df_items.isna().sum()

item_id               0
item_category_id      0
first_sale_month    368
dtype: int64

In [11]:
df_items['first_sale_month'] = df_items['first_sale_month'].fillna(34)
df_items.isna().sum()

item_id             0
item_category_id    0
first_sale_month    0
dtype: int64

In [12]:
df_item_categories['main_category'] = df_item_categories.item_category_name.apply(lambda x : x.split()[0])
df_item_categories['main_category'].value_counts()

main_category
Игры          14
Книги         13
Подарки       12
Игровые        8
Аксессуары     7
Программы      6
Музыка         6
Кино           5
Карты          5
Чистые         2
Служебные      2
Доставка       1
Билеты         1
PC             1
Элементы       1
Name: count, dtype: int64

In [13]:
df_item_categories['main_category'] = df_item_categories.main_category.apply(lambda x : 'etc' if len(df_item_categories[df_item_categories.main_category == x]) < 5 else x)
df_item_categories['main_category'].value_counts()

main_category
Игры          14
Книги         13
Подарки       12
etc            8
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Кино           5
Карты          5
Name: count, dtype: int64

### Labeling main_category feature 

In [14]:
label_encoder = LabelEncoder()
df_item_categories['main_category'] = label_encoder.fit_transform(df_item_categories['main_category'])
df_item_categories = df_item_categories.drop('item_category_name', axis = 1)

### Merge all dataset

In [15]:
from itertools import product
import gc

train = []
for i in df_sales['date_block_num'].unique():
    all_shop = df_sales.loc[df_sales['date_block_num']== i, 'shop_id'].unique()
    all_item = df_sales.loc[df_sales['date_block_num']== i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))
    
idx_features = ['date_block_num', 'shop_id', 'item_id']
df_train = pd.DataFrame(np.vstack(train), columns = idx_features)

group = df_sales.groupby(idx_features).agg({'item_cnt_day': 'sum', 'item_price':'mean'}).reset_index()
group.rename(columns = {'item_cnt_day':'monthly_sales_cnt', 'item_price':'mean_item_price'}, inplace = True)
df_train = df_train.merge(group, on=idx_features, how = 'left')

group = df_sales.groupby(idx_features).agg({'item_cnt_day': 'count'})
group = group.reset_index()
group.rename(columns = {'item_cnt_day': 'item_sale_cnt'})
df_train = df_train.merge(group, on=idx_features, how = 'left')

df_test['date_block_num'] = 34
df_all = pd.concat([df_train, df_test.drop('ID', axis = 1)],
                   ignore_index = True,
                   keys = idx_features)
df_all = df_all.fillna(0)

df_all = df_all.merge(df_shops, on = 'shop_id', how = 'left')
df_all = df_all.merge(df_items, on = 'item_id', how = 'left')
df_all = df_all.merge(df_item_categories, on = 'item_category_id', how = 'left')

df_all = downcast(df_all)
del df_shops, df_items, df_item_categories
gc.collect()



Memory usage decreased 645.66Mb to 201.11 Mb (68.9% reduction)


0

In [16]:
def add_mean_features(df, mean_features, idx_features):
    assert (idx_features[0] == 'date_block_num') and len(idx_features) in [2,3]
    
    if len(idx_features) == 2:
        feature_name = 'mean_sales_by_' + idx_features[1] 
    else:
        feature_name = 'mean_sales_by_' + idx_features[1] + '_' + idx_features[2]
    print(feature_name)
    group = df.groupby(idx_features).agg({'monthly_sales_cnt': 'mean'})
    group = group.reset_index()
    group = group.rename(columns = {'monthly_sales_cnt': feature_name})
    df = df.merge(group, on = idx_features, how = 'left')
    df = downcast(df, verbose=False)
    
    mean_features.append(feature_name)
    
    del group
    gc.collect()
    
    return df, mean_features

In [17]:
df_all

Unnamed: 0,date_block_num,shop_id,item_id,monthly_sales_cnt,mean_item_price,item_cnt_day,city,item_category_id,first_sale_month,main_category
0,0,59,22154,1,999.0,1,30,37,0,5
1,0,59,2552,0,0.0,0,30,58,0,7
2,0,59,2554,0,0.0,0,30,58,0,7
3,0,59,2555,0,0.0,0,30,56,0,7
4,0,59,2564,0,0.0,0,30,59,0,7
...,...,...,...,...,...,...,...,...,...,...
11098655,34,45,18454,0,0.0,0,20,55,23,7
11098656,34,45,16188,0,0.0,0,20,64,32,8
11098657,34,45,15757,0,0.0,0,20,55,0,7
11098658,34,45,19648,0,0.0,0,20,40,23,5


In [18]:
item_mean_features = []
df_all, item_mean_features = add_mean_features(df = df_all,
                                              mean_features = item_mean_features,
                                              idx_features=['date_block_num', 'item_id'])
df_all, item_mean_features = add_mean_features(df = df_all,
                                              mean_features = item_mean_features,
                                              idx_features=['date_block_num', 'item_id', 'city'])

shop_mean_features = []
df_all, shop_mean_features = add_mean_features(df = df_all,
                                              mean_features = shop_mean_features,
                                              idx_features=['date_block_num', 'item_id', 'item_category_id'])

mean_sales_by_item_id
mean_sales_by_item_id_city
mean_sales_by_item_id_item_category_id


In [19]:
def add_lag_features(df, lag_features_to_clip, idx_features, lag_feature, nlag=3, clip=False):
    df_temp = df[idx_features + [lag_feature]].copy()
    for i in range(1, nlag + 1):
        lag_feature_name = lag_feature + '_lag_' + str(i)
        df_temp.columns = idx_features + [lag_feature_name]
        df_temp['date_block_num'] += 1
        df = df.merge(df_temp.drop_duplicates(), on = idx_features, how = 'left')
        df[lag_feature_name] = df[lag_feature_name].fillna(0)
        
        if clip:
            lag_features_to_clip.append(lag_feature_name)
    
    df = downcast(df, False)
    del df_temp
    gc.collect()
    
    return df, lag_features_to_clip

In [20]:
lag_features_to_clip = []
idx_features = ['date_block_num', 'shop_id', 'item_id']

df_all, lag_features_to_clip = add_lag_features(df = df_all,
                                                lag_features_to_clip = lag_features_to_clip, 
                                                idx_features = idx_features,
                                                lag_feature = 'monthly_sales_cnt',
                                                nlag=3,
                                                clip=True)

df_all, lag_features_to_clip = add_lag_features(df = df_all,
                                                lag_features_to_clip = lag_features_to_clip, 
                                                idx_features = idx_features,
                                                lag_feature = 'item_cnt_day',
                                                nlag=3)

df_all, lag_features_to_clip = add_lag_features(df = df_all,
                                                lag_features_to_clip = lag_features_to_clip, 
                                                idx_features = idx_features,
                                                lag_feature = 'mean_item_price',
                                                nlag=3)

In [21]:
for item_mean_feature in item_mean_features:
    df_all, lag_features_to_clip = add_lag_features(df = df_all,
                                                lag_features_to_clip = lag_features_to_clip, 
                                                idx_features = idx_features,
                                                lag_feature = item_mean_feature,
                                                nlag=3,
                                                clip = True)
df_all = df_all.drop(item_mean_features, axis = 1)

for shop_mean_feature in shop_mean_features:
    df_all, lag_features_to_clip = add_lag_features(df = df_all,
                                                lag_features_to_clip = lag_features_to_clip, 
                                                idx_features = ['date_block_num', 'shop_id', 'item_category_id'],
                                                lag_feature = shop_mean_feature,
                                                nlag=3,
                                                clip = True)
df_all = df_all.drop(shop_mean_features, axis = 1)

In [22]:
df_all = df_all.drop(df_all[df_all.date_block_num< 3].index)

In [23]:
# Split train and test
df_train = df_all[df_all['date_block_num'] < 34]
df_test = df_all[df_all['date_block_num'] == 34]
df_test.drop(['monthly_sales_cnt'], axis = 1, inplace = True)

In [24]:
df_train

Unnamed: 0,date_block_num,shop_id,item_id,monthly_sales_cnt,mean_item_price,item_cnt_day,city,item_category_id,first_sale_month,main_category,...,item_cnt_day_lag_3,mean_item_price_lag_1,mean_item_price_lag_2,mean_item_price_lag_3,mean_sales_by_item_id_lag_1,mean_sales_by_item_id_lag_2,mean_sales_by_item_id_lag_3,mean_sales_by_item_id_city_lag_1,mean_sales_by_item_id_city_lag_2,mean_sales_by_item_id_city_lag_3
1122386,3,25,8092,3,149.0,3,13,40,0,5,...,4,149.0,0.0,149.0,0.586957,0.630435,1.444444,0.666667,0.666667,2.000000
1122387,3,25,7850,3,199.0,3,13,30,0,3,...,5,199.0,199.0,199.0,1.869565,1.456522,3.088889,2.916667,2.083333,2.916667
1122388,3,25,8051,3,30.0,2,13,66,0,8,...,0,0.0,0.0,0.0,0.043478,0.130435,0.133333,0.000000,0.333333,0.333333
1122389,3,25,8088,1,199.0,1,13,55,0,7,...,0,199.0,199.0,0.0,0.130435,0.043478,0.066667,0.166667,0.083333,0.000000
1122390,3,25,8089,1,199.0,1,13,55,0,7,...,0,0.0,199.0,0.0,0.478261,0.260870,0.177778,0.666667,0.250000,0.083333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10884455,33,21,7635,0,0.0,0,13,64,33,8,...,0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10884456,33,21,7638,0,0.0,0,13,64,24,8,...,0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10884457,33,21,7640,0,0.0,0,13,64,33,8,...,0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
10884458,33,21,7632,0,0.0,0,13,64,23,8,...,0,0.0,0.0,0.0,0.023810,0.000000,0.023256,0.142857,0.000000,0.125000


In [25]:
df_train = downcast(df_train)
df_test = downcast(df_test)

df_valid = df_train[df_train.date_block_num == 33].reset_index(drop = True)
df_train = df_train[df_train.date_block_num < 33].reset_index(drop = True)

Memory usage decreased 819.27Mb to 819.27 Mb (0.0% reduction)
Memory usage decreased 17.57Mb to 15.53 Mb (11.6% reduction)


In [26]:
params = {
    'device_type': 'gpu',
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.005,
    'feature_fraction': 0.75,
    'bagging_fraction': 0.75,
    'bagging_freq': 5,
    'force_col_wise': True,
    'random_state': 42,
    "early_stopping_round": 150
}

cat_features = ['shop_id', 'item_id', 'item_category_id', 'main_category', 'city']

X_train = df_train

dtrain = lgb.Dataset(df_train.drop('monthly_sales_cnt', axis = 1), df_train['monthly_sales_cnt'])
dvalid = lgb.Dataset(df_valid.drop('monthly_sales_cnt', axis = 1), df_valid['monthly_sales_cnt'])

In [None]:
lgb_model = lgb.train(params = params,
                      train_set = dtrain, 
                      valid_sets = (dtrain, dvalid),
                      num_boost_round = 1500,
                      categorical_feature= cat_features)

In [None]:
preds = lgb_model.predict(df_test).clip(0, 20)
df_submission['item_cnt_month'] = preds
df_submission.to_csv('/kaggle/working/submission.csv', index=False)