In [None]:
# libraries to use
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from itertools import product
from sklearn.preprocessing import LabelEncoder


# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Directory holding the input CSV files (relative to this notebook).
data_path = '../../../Data/future_sales/'

# Read each CSV into its own DataFrame; the names on the left are bound
# in the same order as the file names listed below.
sales_train, shops, items, item_categories, test, submission = (
    pd.read_csv(data_path + file_name)
    for file_name in (
        'sales_train.csv',
        'shops.csv',
        'items.csv',
        'item_categories.csv',
        'test.csv',
        'sample_submission.csv',
    )
)

In [None]:
def downcast(df, verbose=True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and the same object is returned.

    Per column:

    * ``bool`` columns are converted to ``int8``.
    * Integer columns, and other numeric columns whose values are all whole
      numbers, are downcast with ``pd.to_numeric(..., downcast='integer')``.
    * Remaining numeric columns are downcast with ``downcast='float'``.
    * Non-numeric columns (object/string, category, datetime, timedelta, ...)
      are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    verbose : bool, default True
        When true, print the percentage by which memory usage changed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == 'bool':
            df[col] = column.astype('int8')
        elif not pd.api.types.is_numeric_dtype(column):
            # Only numeric data can be downcast. Without this guard, dtypes
            # such as category, datetime or the dedicated string dtype would
            # reach round()/to_numeric below and either raise or be converted.
            continue
        elif dtype_name.startswith('int') or (column.round() == column).all():
            # Whole-number columns can be stored as integers. A column that
            # contains NaN never satisfies the equality test, so it takes the
            # float branch.
            df[col] = pd.to_numeric(column, downcast='integer')
        else:
            df[col] = pd.to_numeric(column, downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the division in case the starting footprint is reported as 0.
        percent_mem_change = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"The percent memory change is: {percent_mem_change}")

    return df

# Shrink every loaded table. downcast() writes the converted columns back
# into the frame it receives, so calling it is enough; nothing needs to be
# rebound.
all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    downcast(df)

In [None]:
# Distribution of item prices. The column is passed as `x=` explicitly so the
# plot does not depend on how a given seaborn version interprets the first
# positional argument.
sns.boxplot(x=sales_train['item_price']);

In [None]:
# Distribution of daily item counts. The column is passed as `x=` explicitly
# so the plot does not depend on how a given seaborn version interprets the
# first positional argument.
sns.boxplot(x=sales_train['item_cnt_day']);

In [None]:
# Keep only rows whose price lies strictly between 0 and 100000 and whose
# daily count lies strictly between 0 and 900. Zero or negative prices and
# counts are dropped along with the extreme high values.
sales_train = sales_train.loc[
    sales_train['item_price'].gt(0)
    & sales_train['item_price'].lt(100000)
    & sales_train['item_cnt_day'].gt(0)
    & sales_train['item_cnt_day'].lt(900)
]

In [None]:
# Store the first whitespace-separated word of every shop name
# (presumably the city; confirm against the data).
shops["shop_first_name"] = [name.split()[0] for name in shops["shop_name"]]

In [None]:
# Show the distinct values of "shop_first_name".
shops["shop_first_name"].unique()

In [None]:
# Merge the '!Якутск' spelling into 'Якутск' so both share one label.
shops["shop_first_name"] = shops["shop_first_name"].replace('!Якутск', 'Якутск')

In [None]:
# Replace the text labels in "shop_first_name" with integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(shops["shop_first_name"])
shops["shop_first_name"] = label_encoder.transform(shops["shop_first_name"])

In [None]:
# The raw shop name is no longer needed once its first word is encoded.
shops = shops.drop(columns="shop_name")

In [None]:
# Preview the first rows of the shops table.
shops.head()

In [None]:
# Remove the free-text item name column.
items = items.drop(columns=["item_name"])

In [None]:
# Earliest month index (date_block_num) in which each item occurs in sales_train.
first_sale_block = sales_train.groupby("item_id")["date_block_num"].min()

# Look the value up through the item_id column rather than relying on the row
# index of `items` happening to equal item_id. Items absent from sales_train
# get NaN.
items["item_date_block"] = items["item_id"].map(first_sale_block)
items.head()

In [None]:
# Show the items that have no "item_date_block" value.
items.loc[items["item_date_block"].isna()]

In [None]:
# Items without a first-sale month receive 34 (presumably the index of the
# month right after the training period; confirm against the data).
items["item_date_block"] = items["item_date_block"].where(items["item_date_block"].notna(), 34)

### Item Category exploration

In [None]:
# Store the first whitespace-separated word of each category name.
item_categories["first_item_category_name"] = [
    category_name.split()[0]
    for category_name in item_categories["item_category_name"]
]

In [None]:
# Count how many categories share each first word.
item_categories["first_item_category_name"].value_counts()

In [None]:
def make_etc(x, categories=None, min_count=5):
    """Return ``x`` if it occurs often enough, otherwise the label ``'etc'``.

    Parameters
    ----------
    x : str
        Label to check.
    categories : pandas.Series, optional
        Labels in which occurrences of ``x`` are counted. When omitted, the
        module-level ``item_categories["first_item_category_name"]`` column
        is used.
    min_count : int, default 5
        Minimum number of occurrences required for ``x`` to be kept.

    Returns
    -------
    str
        ``x`` when it appears at least ``min_count`` times, else ``'etc'``.
    """
    if categories is None:
        categories = item_categories["first_item_category_name"]
    # Summing the boolean match mask counts the rows equal to x.
    return x if (categories == x).sum() >= min_count else 'etc'

# Collapse infrequent first words into the single label produced by make_etc.
item_categories["first_item_category_name"] = item_categories["first_item_category_name"].map(make_etc)

In [None]:
# Preview the first rows of the item categories table.
item_categories.head()

In [None]:
# Replace the text labels in "first_item_category_name" with integer codes,
# then remove the original category name column.
label_encoder = LabelEncoder()
label_encoder.fit(item_categories["first_item_category_name"])
item_categories["first_item_category_name"] = label_encoder.transform(
    item_categories["first_item_category_name"]
)
item_categories = item_categories.drop(columns="item_category_name")

In [None]:
# Column names of the (month, shop, item) grid.
idx_features = ["date_block_num", "shop_id", "item_id"]

# For every month, pair each shop seen that month with each item seen that
# month, whether or not that exact shop/item combination had a sale.
monthly_grids = []
for i in sales_train["date_block_num"].unique():
    month_sales = sales_train[sales_train["date_block_num"] == i]
    all_shop = month_sales["shop_id"].unique()
    all_item = month_sales["item_id"].unique()
    monthly_grids.append(np.array(list(product([i], all_shop, all_item))))

# Stack the per-month grids into a single frame.
train = pd.DataFrame(np.vstack(monthly_grids), columns=idx_features)