In [1]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings(action='ignore')  # Silence every warning message

# Directory that holds the CSV files
data_path = '../Data/'


def _load_csv(file_name):
    """Read the CSV file called ``file_name`` from ``data_path``."""
    return pd.read_csv(data_path + file_name)


sales_train = _load_csv('sales_train.csv')
shops = _load_csv('shops.csv')
items = _load_csv('items.csv')
item_categories = _load_csv('item_categories.csv')
test = _load_csv('test.csv')
submission = _load_csv('sample_submission.csv')



Data Downcasting


In [2]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and the same object is returned, so
    callers may use either the return value or the frame they passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    * ``bool`` columns become ``int8``.
    * Integer columns, and float columns that hold only whole numbers,
      are downcast with ``downcast='integer'``.
    * Other numeric columns are downcast with ``downcast='float'``.
    * Non-numeric columns (object/string, datetime, timedelta,
      categorical, ...) are left untouched.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == 'bool':
            df[col] = column.astype('int8')
        elif not pd.api.types.is_numeric_dtype(column):
            # Rounding or to_numeric on these dtypes either raises or silently
            # changes their meaning (e.g. datetimes turned into integers),
            # so they are deliberately skipped.
            continue
        elif dtype_name.startswith('int') or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast='integer')
        else:
            df[col] = pd.to_numeric(column, downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))

    return df

# downcast() rewrites the columns of the frame it is given, so calling it is
# enough; the returned object is the very same frame.
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    downcast(df)

54.2% compressed
38.5% compressed
54.2% compressed
39.8% compressed
70.8% compressed


### Feature Engineering I - handling sales_train, shops, items, item_categories

In [3]:
# Keep only rows whose price lies strictly between 0 and 50,000 and whose
# daily count lies strictly between 0 and 1,000.
price = sales_train['item_price']
daily_count = sales_train['item_cnt_day']
valid_price = (price > 0) & (price < 50000)
valid_count = (daily_count > 0) & (daily_count < 1000)
sales_train = sales_train[valid_price & valid_count]

In [4]:
# Old shop_id -> shop_id it is replaced with, applied to both the sales
# history and the test set so the two stay consistent.
shop_id_fixes = {0: 57, 1: 58, 10: 11, 39: 40}

for frame in (sales_train, test):
    for old_id, new_id in shop_id_fixes.items():
        frame.loc[frame['shop_id'] == old_id, 'shop_id'] = new_id

In [5]:
# Leak to improve performance: keep only the sales rows whose shop and item
# both occur in the test set.
unique_test_shop_id = test['shop_id'].unique()
unique_test_item_id = test['item_id'].unique()
in_test_shops = sales_train['shop_id'].isin(unique_test_shop_id)
in_test_items = sales_train['item_id'].isin(unique_test_item_id)
sales_train = sales_train[in_test_shops & in_test_items]

Shops: create derived features and encode

In [6]:
def _first_word(shop_name):
    """Return the first whitespace-separated token of ``shop_name``."""
    return shop_name.split()[0]


# The first word of the shop name is used as the city feature
shops['city'] = shops['shop_name'].map(_first_word)
shops['city'].unique()

array(['!Якутск', 'Адыгея', 'Балашиха', 'Волжский', 'Вологда', 'Воронеж',
       'Выездная', 'Жуковский', 'Интернет-магазин', 'Казань', 'Калуга',
       'Коломна', 'Красноярск', 'Курск', 'Москва', 'Мытищи', 'Н.Новгород',
       'Новосибирск', 'Омск', 'РостовНаДону', 'СПб', 'Самара', 'Сергиев',
       'Сургут', 'Томск', 'Тюмень', 'Уфа', 'Химки', 'Цифровой', 'Чехов',
       'Якутск', 'Ярославль'], dtype=object)

In [7]:
# '!Якутск' is the same city as 'Якутск' with a stray leading '!'
shops['city'] = shops['city'].replace('!Якутск', 'Якутск')

In [8]:
from sklearn.preprocessing import LabelEncoder

# Replace each city name with an integer label
label_encoder = LabelEncoder()
city_codes = label_encoder.fit_transform(shops['city'])
shops['city'] = city_codes

In [9]:
# shop_name is dropped now that the city has been derived from it
shops = shops.drop(columns='shop_name')

shops.head()

Unnamed: 0,shop_id,city
0,0,29
1,1,29
2,2,0
3,3,1
4,4,2


Items: create derived features

In [10]:
# The free-text item_name column is not used as a feature
items = items.drop(columns='item_name')

In [11]:
# First month block (date_block_num) in which each item appears in the
# remaining sales history.
first_sale_by_item = sales_train.groupby('item_id')['date_block_num'].min()

# Look the value up through the item_id column. Assigning the grouped Series
# directly would align it on the row index of `items`, which is only correct
# while the row position happens to equal item_id. Items with no sales get NaN.
items['first_sale_date'] = items['item_id'].map(first_sale_by_item)

items.head()

Unnamed: 0,item_id,item_category_id,first_sale_date
0,0,40,
1,1,76,
2,2,40,
3,3,40,
4,4,40,


In [12]:
# Show the items that still have no first_sale_date
items[items['first_sale_date'].isna()]

Unnamed: 0,item_id,item_category_id,first_sale_date
0,0,40,
1,1,76,
2,2,40,
3,3,40,
4,4,40,
...,...,...,...
22160,22160,40,
22161,22161,37,
22165,22165,31,
22168,22168,62,


In [13]:
# Items without any recorded sale get 34, the same date_block_num that this
# notebook assigns to the test rows.
has_first_sale = items['first_sale_date'].notna()
items['first_sale_date'] = items['first_sale_date'].where(has_first_sale, 34)

Create item_categories derived feature and encode

In [14]:
# Use the first word of item_category_name as a coarse category
item_categories['category'] = item_categories['item_category_name'].map(
    lambda name: name.split(maxsplit=1)[0]
)

In [15]:
# Count how many item categories fall under each coarse category
item_categories['category'].value_counts()

category
Игры          14
Книги         13
Подарки       12
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Карты          5
Кино           5
Служебные      2
Чистые         2
PC             1
Билеты         1
Доставка       1
Элементы       1
Name: count, dtype: int64

In [16]:
def make_etc(x, counts=None, min_count=5):
    """Return ``x`` if it is a frequent category, otherwise ``'etc'``.

    Parameters
    ----------
    x : str
        Category value to check.
    counts : pandas.Series, optional
        Occurrences per category, indexed by category value. When omitted
        they are computed from ``item_categories['category']`` on every call,
        which is slow when the function is applied row by row.
    min_count : int, default 5
        Smallest number of occurrences for ``x`` to be kept.

    Returns
    -------
    str
        ``x`` itself, or ``'etc'`` when it occurs fewer than ``min_count`` times.
    """
    if counts is None:
        counts = item_categories['category'].value_counts()
    return x if counts.get(x, 0) >= min_count else 'etc'

# Replace with 'etc' if category count is less than 5.
# The counts are computed once up front instead of rescanning the frame per row.
category_counts = item_categories['category'].value_counts()
item_categories['category'] = item_categories['category'].apply(make_etc, counts=category_counts)

In [17]:
# Preview the categories after rare ones were replaced with 'etc'
item_categories.head()

Unnamed: 0,item_category_name,item_category_id,category
0,PC - Гарнитуры/Наушники,0,etc
1,Аксессуары - PS2,1,Аксессуары
2,Аксессуары - PS3,2,Аксессуары
3,Аксессуары - PS4,3,Аксессуары
4,Аксессуары - PSP,4,Аксессуары


In [18]:
# Replace each coarse category name with an integer label
label_encoder = LabelEncoder()
category_codes = label_encoder.fit_transform(item_categories['category'])
item_categories['category'] = category_codes

# item_category_name is dropped now that the category has been derived from it
item_categories = item_categories.drop(columns='item_category_name')

In [19]:
from itertools import product

def _month_grid(block_num):
    """Every (month, shop, item) triple for shops and items sold in ``block_num``."""
    month_sales = sales_train[sales_train['date_block_num'] == block_num]
    shop_ids = month_sales['shop_id'].unique()
    item_ids = month_sales['item_id'].unique()
    return np.array(list(product([block_num], shop_ids, item_ids)))


# Create date_block_num, shop_id, item_id combinations, one grid per month
idx_features = ['date_block_num', 'shop_id', 'item_id'] # base features
train = pd.DataFrame(
    np.vstack([_month_grid(block_num)
               for block_num in sales_train['date_block_num'].unique()]),
    columns=idx_features,
)

In [20]:
# Preview the (date_block_num, shop_id, item_id) grid
train.head()

Unnamed: 0,date_block_num,shop_id,item_id
0,0,59,22154
1,0,59,2574
2,0,59,2607
3,0,59,2614
4,0,59,2808


In [21]:
from itertools import product

# The (date_block_num, shop_id, item_id) grid was already built by the
# identical cell just above. Rebuilding it here would only repeat the
# expensive cartesian product, so this cell just checks the grid is in place.
assert set(idx_features) <= set(train.columns)

In [22]:
# Per (month, shop, item): total units sold and mean price
group = (
    sales_train
    .groupby(idx_features, as_index=False)
    .agg(item_cnt_month=('item_cnt_day', 'sum'),
         item_price_mean=('item_price', 'mean'))
)

# Left join so grid rows without any sale are kept (their values are NaN)
train = train.merge(group, on=idx_features, how='left')

In [23]:
# Preview the grid with the monthly count and mean price merged in
train.head()

Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean
0,0,59,22154,1.0,999.0
1,0,59,2574,2.0,399.0
2,0,59,2607,,
3,0,59,2614,,
4,0,59,2808,15.0,999.0


In [24]:
import gc

# The aggregated frame has been merged into train; drop the name and ask the
# garbage collector to reclaim the memory.
del group
gc.collect()

0

In [25]:
# item_count: number of sales records (rows of sales_train) per
# (month, shop, item) - not the number of units sold.
group = (
    sales_train
    .groupby(idx_features, as_index=False)
    .agg(item_count=('item_cnt_day', 'count'))
)

train = train.merge(group, on=idx_features, how='left')

In [26]:
# Garbage collection: neither the aggregate nor the raw sales history is
# referenced by the cells below, so both names are dropped.
del group, sales_train
gc.collect()
# Preview train with the item_count column added
train.head()


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_count
0,0,59,22154,1.0,999.0,1.0
1,0,59,2574,2.0,399.0,2.0
2,0,59,2607,,,
3,0,59,2614,,,
4,0,59,2808,15.0,999.0,9.0


Concatenate test data, Merge remaining data

In [31]:
display(test.head())
print()
# Tag the test rows with date_block_num 34
test['date_block_num'] = 34

# Stack train and test into one frame with a fresh 0..n-1 index; columns are
# matched by name. `keys` is intentionally not passed: it labels the
# concatenated objects (two here, not three) and is discarded anyway when
# ignore_index=True.
all_data = pd.concat([train, test.drop('ID', axis=1)], ignore_index=True)

# Grid rows without sales, and all test rows, have NaN in the aggregated
# columns; replace those with 0.
all_data = all_data.fillna(0)

display(all_data.head())
display(all_data.tail())

Unnamed: 0,ID,shop_id,item_id,date_block_num
0,0,5,5037,34
1,1,5,5320,34
2,2,5,5233,34
3,3,5,5232,34
4,4,5,5268,34





Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_count
0,0,59,22154,1.0,999.0,1.0
1,0,59,2574,2.0,399.0,2.0
2,0,59,2607,0.0,0.0,0.0
3,0,59,2614,0.0,0.0,0.0
4,0,59,2808,15.0,999.0,9.0


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_count
2952842,34,45,18454,0.0,0.0,0.0
2952843,34,45,16188,0.0,0.0,0.0
2952844,34,45,15757,0.0,0.0,0.0
2952845,34,45,19648,0.0,0.0,0.0
2952846,34,45,969,0.0,0.0,0.0


In [32]:
# Attach the shop, item and item-category attributes to every row
all_data = (
    all_data
    .merge(shops, on='shop_id', how='left')
    .merge(items, on='item_id', how='left')
    .merge(item_categories, on='item_category_id', how='left')
)

# Shrink the numeric dtypes of the merged frame
all_data = downcast(all_data)

# The lookup tables are merged in now; release them
del shops, items, item_categories
gc.collect();

68.9% compressed


Feature summary

In [34]:
def resumetable(df):
    """Print the shape of ``df`` and return a per-column summary table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with its dtype, number of nulls, number
        of unique values and the values found in the first three rows.
        Rows are taken by position, so any index works; when ``df`` has fewer
        than three rows the missing entries are NaN.
    """
    print(f'Data Shape: {df.shape}')
    summary = df.dtypes.to_frame(name='Dtypes')
    summary['Null'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values

    sample_labels = ['First_values', 'Second_values', 'Third_values']
    for position, label in enumerate(sample_labels):
        if position < len(df):
            # iloc is positional; loc[0] would need an index label equal to 0
            summary[label] = df.iloc[position].values
        else:
            summary[label] = np.nan

    return summary

In [35]:
# Summarise dtype, null count, unique count and first rows of every feature
resumetable(all_data)

Data Shape: (2952847, 10)


Unnamed: 0,Dtypes,Null,Uniques,First_values,Second_values,Third_values
date_block_num,int8,0,35,0.0,0.0,0.0
shop_id,int8,0,42,59.0,59.0,59.0
item_id,int16,0,5100,22154.0,2574.0,2607.0
item_cnt_month,int16,0,383,1.0,2.0,0.0
item_price_mean,float64,0,23262,999.0,399.0,0.0
item_count,int8,0,40,1.0,2.0,0.0
city,int8,0,28,30.0,30.0,30.0
item_category_id,int8,0,62,37.0,55.0,55.0
first_sale_date,int8,0,35,0.0,0.0,0.0
category,int8,0,10,5.0,7.0,7.0


In [37]:
# Descriptive statistics, transposed so each feature is one row
all_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
date_block_num,2952847.0,21.943367,9.438631,0.0,15.0,24.0,30.0,34.0
shop_id,2952847.0,31.332922,17.822284,2.0,16.0,34.0,47.0,59.0
item_id,2952847.0,10836.313386,6233.255724,30.0,5128.0,10911.0,16057.0,22167.0
item_cnt_month,2952847.0,0.550419,5.01159,0.0,0.0,0.0,0.0,1305.0
item_price_mean,2952847.0,185.656114,794.959022,0.0,0.0,0.0,0.0,42990.0
item_count,2952847.0,0.415856,1.333602,0.0,0.0,0.0,0.0,57.0
city,2952847.0,15.334964,8.310242,0.0,10.0,15.0,22.0,30.0
item_category_id,2952847.0,46.228789,16.200762,0.0,37.0,47.0,57.0,83.0
first_sale_date,2952847.0,9.908104,10.319897,0.0,0.0,7.0,19.0,34.0
category,2952847.0,5.712392,1.956827,0.0,5.0,6.0,7.0,9.0
