In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import warnings
warnings.filterwarnings(action = 'ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv


In [3]:
# All competition files live in one directory; keep it in a single place so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/kaggle/input/competitive-data-science-predict-future-sales'

df_item_categories = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))
df_items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
df_sales = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
df_shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
df_test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
df_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

## Common method

In [4]:
def downcast(df: pd.DataFrame, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that holds them.

    The frame is modified IN PLACE (columns are reassigned on ``df`` itself) and
    the same object is returned for convenience.

    Per-column rules:
      * ``bool``                                   -> ``int8``
      * integer columns, and float columns whose
        values are all whole numbers               -> smallest integer dtype
      * any other numeric column                   -> smallest float dtype
      * non-numeric columns (object, category,
        datetime, string, ...)                     -> left untouched

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the memory usage before and after.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        if column.dtype.name == 'bool':
            df[col] = column.astype('int8')
        elif not pd.api.types.is_numeric_dtype(column):
            # Rounding / to_numeric is meaningless (or raises) for text,
            # categorical and datetime data, so skip those columns.
            continue
        elif pd.api.types.is_integer_dtype(column) or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast = 'integer')
        else:
            # Columns containing NaN or fractional values end up here.
            df[col] = pd.to_numeric(column, downcast = 'float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-byte starting size to avoid ZeroDivisionError.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Memory usage decreased {:5.2f}Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, reduction))

    return df

## Preprocessing

### Downcast all dataframes

In [5]:
# downcast() reassigns columns on the frame it receives and returns that same
# object, so calling it is enough; rebinding the loop variable is unnecessary.
df_list = [
    df_item_categories,
    df_items,
    df_sales,
    df_shops,
    df_test,
]
for df in df_list:
    downcast(df)

Memory usage decreased  0.00Mb to  0.00 Mb (39.9% reduction)
Memory usage decreased  0.51Mb to  0.23 Mb (54.2% reduction)
Memory usage decreased 134.39Mb to 61.60 Mb (54.2% reduction)
Memory usage decreased  0.00Mb to  0.00 Mb (38.6% reduction)
Memory usage decreased  4.90Mb to  1.43 Mb (70.8% reduction)


### Remove outliers

In [6]:
# Keep only rows whose price lies strictly inside (0, 50000) and whose daily
# count lies strictly inside (0, 1000).
valid_price = (df_sales['item_price'] > 0) & (df_sales['item_price'] < 50000)
valid_count = (df_sales['item_cnt_day'] > 0) & (df_sales['item_cnt_day'] < 1000)
df_sales = df_sales[valid_price & valid_count]

In [7]:
# The first whitespace-separated token of the shop name is used as the city.
df_shops['city'] = [name.split()[0] for name in df_shops['shop_name']]
df_shops['city'].unique()

array(['!Якутск', 'Адыгея', 'Балашиха', 'Волжский', 'Вологда', 'Воронеж',
       'Выездная', 'Жуковский', 'Интернет-магазин', 'Казань', 'Калуга',
       'Коломна', 'Красноярск', 'Курск', 'Москва', 'Мытищи', 'Н.Новгород',
       'Новосибирск', 'Омск', 'РостовНаДону', 'СПб', 'Самара', 'Сергиев',
       'Сургут', 'Томск', 'Тюмень', 'Уфа', 'Химки', 'Цифровой', 'Чехов',
       'Якутск', 'Ярославль'], dtype=object)

In [8]:
# '!Якутск' is the same city as 'Якутск' with a stray leading '!'; merge them.
df_shops['city'] = df_shops['city'].replace('!Якутск', 'Якутск')

In [9]:
from sklearn.preprocessing import LabelEncoder

# Replace the city names with integer labels and drop the raw shop name,
# leaving only shop_id and the encoded city.
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(df_shops['city'])
df_shops = df_shops.assign(city = city_codes).drop(columns = 'shop_name')
df_shops.head()

Unnamed: 0,shop_id,city
0,0,29
1,1,29
2,2,0
3,3,1
4,4,2


### Create item features

In [10]:
df_items = df_items.drop(['item_name'], axis = 1)

# Earliest date_block_num in which each item appears in the sales data.
# The groupby result is indexed by item_id, so look it up through the item_id
# column explicitly instead of relying on df_items' row index happening to
# equal item_id. Items absent from df_sales get NaN.
first_sale_by_item = df_sales.groupby('item_id')['date_block_num'].min()
df_items['first_sale_month'] = df_items['item_id'].map(first_sale_by_item)
df_items.head()

Unnamed: 0,item_id,item_category_id,first_sale_month
0,0,40,20.0
1,1,76,15.0
2,2,40,19.0
3,3,40,18.0
4,4,40,20.0


In [13]:
# Count the missing values in each column of df_items.
df_items.isna().sum()

item_id               0
item_category_id      0
first_sale_month    368
dtype: int64

In [15]:
# Fill the missing first-sale months with 34.
# NOTE(review): 34 is presumably the date_block_num of the test period, i.e.
# "first seen in the month being predicted" — confirm.
df_items['first_sale_month'] = df_items['first_sale_month'].fillna(34)
# Re-check that no missing values remain.
df_items.isna().sum()

item_id             0
item_category_id    0
first_sale_month    0
dtype: int64

In [1]:
from itertools import product

# Build the full (month, shop, item) grid: for every month, pair every shop
# that sold something that month with every item sold that month, so that
# combinations with no sales become explicit rows.
train = []
for i in df_sales['date_block_num'].unique():
    month_sales = df_sales[df_sales['date_block_num'] == i]
    all_shop = month_sales['shop_id'].unique()
    all_item = month_sales['item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

idx_features = ['date_block_num', 'shop_id', 'item_id']
df_train = pd.DataFrame(np.vstack(train), columns = idx_features)

# Monthly sales count per (month, shop, item).
group = (
    df_sales
    .groupby(idx_features)
    .agg({'item_cnt_day': 'sum'})
    .reset_index()
    .rename(columns = {'item_cnt_day': 'monthly_sales_cnt'})
)

df_train = df_train.merge(group, on = idx_features, how = 'left')

# The test rows are tagged with month 34 and stacked under the training grid.
# `keys` is not passed: it labels the concatenated pieces in the result index,
# which ignore_index=True discards anyway.
df_test['date_block_num'] = 34
df_all = pd.concat([df_train, df_test.drop('ID', axis = 1)],
                   ignore_index = True)
# Grid rows without a matching sale (and the test rows) have no count; use 0.
df_all = df_all.fillna(0)

df_all = df_all.merge(df_shops, on = 'shop_id', how = 'left')
df_all = df_all.merge(df_items, on = 'item_id', how = 'left')
df_all = df_all.merge(df_item_categories, on = 'item_category_id', how = 'left')

# Drop the free-text columns. 'shop_name' and 'item_name' may already have been
# removed from df_shops / df_items by earlier cells, so a missing column must
# not raise here.
df_all = df_all.drop(['shop_name', 'item_name', 'item_category_name'],
                     axis = 1, errors = 'ignore')

# Split train and test
df_train = df_all[df_all['date_block_num'] < 34]
# Non-inplace drop: the boolean-mask selection is a derived frame, and mutating
# it in place triggers SettingWithCopyWarning.
df_test = df_all[df_all['date_block_num'] == 34].drop(['monthly_sales_cnt'], axis = 1)

print(df_train.head())
print(df_test.head())


NameError: name 'df_sales' is not defined

In [None]:
# Shrink dtypes on both frames before splitting.
df_train = downcast(df_train)
df_test = downcast(df_test)

# Month 33 is held out for validation; everything earlier is used for training.
# df_valid must be taken before df_train is overwritten.
is_valid_month = df_train['date_block_num'] == 33
df_valid = df_train.loc[is_valid_month].reset_index(drop = True)
df_train = df_train.loc[df_train['date_block_num'] < 33].reset_index(drop = True)

In [None]:
params = {
    'device_type': 'gpu',
    'metric': 'rmse',
    'num_leaves': 255,
    'learning_rate': 0.01,
    # NOTE(review): force_col_wise is documented as a CPU-only option; it is
    # likely ignored with device_type='gpu' — confirm.
    'force_col_wise': True,
    'random_state': 42,
}

cat_features = ['shop_id', 'item_category_id']
X_cols = ['date_block_num', 'shop_id', 'item_id', 'item_category_id']
y_col = 'monthly_sales_cnt'

# Categorical features are declared on the Dataset rather than passed to
# lgb.train(), so the cell does not depend on train() accepting that argument.
# The validation set references the training set so both share the same
# feature binning.
dtrain = lgb.Dataset(df_train[X_cols], df_train[y_col],
                     categorical_feature = cat_features)
dvalid = lgb.Dataset(df_valid[X_cols], df_valid[y_col],
                     categorical_feature = cat_features,
                     reference = dtrain)

lgb_model = lgb.train(params = params,
                      train_set = dtrain,
                      valid_sets = [dtrain, dvalid],
                      num_boost_round = 500)

# Predictions are clipped to the [0, 20] range before being written out.
preds = lgb_model.predict(df_test[X_cols]).clip(0, 20)

# The predictions are written positionally into the sample submission, so the
# row counts must match.
if len(preds) != len(df_submission):
    raise ValueError('Prediction count {} does not match submission rows {}'.format(len(preds), len(df_submission)))

df_submission['item_cnt_month'] = preds
df_submission.to_csv('/kaggle/working/submission.csv', index=False)