In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# On Kaggle the input files live in the read-only "../input/" directory; this copy of the
# notebook walks the relative directory '../../../Data/' instead (see the os.walk call below).
# Running this cell (by clicking run or pressing Shift+Enter) prints the path of every file found under that directory tree.

import os
for dirname, _, filenames in os.walk('../../../Data/'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a usable path
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show the full contents of each cell when a dataframe is displayed:
# a max_colwidth of None turns off truncation of long values (e.g. long item names).
pd.set_option("display.max_colwidth", None)

In [None]:
import gc
import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fuzzywuzzy import fuzz

### Below are some utility functions that will be used throughout the notebook. 

In [None]:
def reduce_mem_usage(df, silent=True, allow_categorical=True, float_dtype="float32"):
    """
    Downcast the columns of a dataframe (or a single series) to reduce memory usage.

    Numeric columns are converted to the smallest integer dtype that can hold their
    values; columns that remain floating point are cast to ``float_dtype``. Sparse and
    datetime columns are returned untouched.

    Parameters
    ----------
    df : pd.DataFrame or pd.Series
        Data to shrink. A dataframe is modified in place (each column is reassigned)
        and the same object is returned; a series is not modified, a new one is returned.
    silent : bool, default True
        If false, print the memory usage before and after and the percentage saved.
    allow_categorical : bool, default True
        If true, non-numeric columns are left as they are. If false, they are
        factorized into integer codes, which are then downcast like any other
        integer column.
    float_dtype : str or dtype, default "float32"
        Dtype used for columns that are still floating point after downcasting.

    Returns
    -------
    pd.DataFrame or pd.Series
        The downcast data.
    """
    def _downcast_numeric(series, allow_categorical=allow_categorical):
        """
        Downcast a numeric series into either the smallest possible int dtype or a specified float dtype.
        """
        dtype = series.dtype
        # isinstance check instead of the deprecated pd.api.types.is_sparse
        if isinstance(dtype, pd.SparseDtype):
            return series
        if not pd.api.types.is_numeric_dtype(dtype):
            if pd.api.types.is_datetime64_any_dtype(dtype) or allow_categorical:
                return series
            # Replace the values by integer codes, keeping the index and the name,
            # then downcast those codes like any other integer column
            codes, _ = series.factorize()
            return _downcast_numeric(
                pd.Series(data=codes, index=series.index, name=series.name)
            )
        # downcast="integer" only changes the dtype when every value fits an integer type
        series = pd.to_numeric(series, downcast="integer")
        if pd.api.types.is_float_dtype(series.dtype):
            series = series.astype(float_dtype)
        return series

    if not silent:
        start_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print(f"Memory usage of dataframe is: {start_mem:.2f} MB")
    if df.ndim == 1:
        df = _downcast_numeric(df)
    else:
        for col in df.columns:
            # Reassign the whole column. `df.loc[:, col] = ...` writes the new values
            # into the existing column in place on pandas >= 2.0, which keeps the
            # original (large) dtype and silently discards the downcast.
            df[col] = _downcast_numeric(df[col])
    if not silent:
        end_mem = np.sum(df.memory_usage()) / 1024 ** 2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        print(f"Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%")

    return df


def shrink_mem_new_cols(matrix, oldcols=None, allow_categorical=False):
    """
    Call reduce_mem_usage on the columns of ``matrix`` that have not been optimized yet.

    Parameters
    ----------
    matrix : pd.DataFrame
        Frame whose new columns are downcast; it is modified in place.
    oldcols : index-like of column labels, optional
        Columns that were already downcast and are skipped. If None, every column
        is processed.
    allow_categorical : bool, default False
        Passed through to reduce_mem_usage.

    Returns
    -------
    (pd.DataFrame, pd.Index)
        The same ``matrix`` object and its current columns, to be passed as
        ``oldcols`` on the next call.
    """
    if oldcols is not None:
        newcols = matrix.columns.difference(oldcols)
    else:
        newcols = matrix.columns
    for col in newcols:
        # Reassign each column individually: `matrix.loc[:, newcols] = ...` writes the
        # values into the existing columns in place on pandas >= 2.0 and therefore
        # keeps the old dtypes. Going column by column also avoids copying all the
        # new columns into a temporary frame.
        matrix[col] = reduce_mem_usage(matrix[col], allow_categorical=allow_categorical)
    oldcols = matrix.columns  # This is used to track which columns have already been downcast
    return matrix, oldcols


def list_if_not(s, dtype=str):
    """
    Return ``s`` wrapped in a list unless it already is a list.

    A value that compares equal to the empty string is returned unwrapped.
    Raises TypeError when the type of ``s`` is neither ``dtype`` nor ``list``.
    """
    kind = type(s)
    if kind is list:
        # Already a list: hand it back untouched
        return s
    if kind != dtype:
        raise TypeError
    if s != "":
        return [s]
    return s

### Load the required data.

In [None]:
# Directory holding the competition CSV files (relative to this notebook)
path = "../../../Data/future_sales/"
# Read the four tables in a fixed order and bind them to their usual names
items, shops, train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ("items.csv", "shops.csv", "sales_train.csv", "test.csv")
)

In [None]:
# Show the first rows and the shape of every loaded table, separated by a ruler line
for frame in (items, shops, train, test):
    print(f"{'_'*70}")
    display(frame.head())
    display(frame.shape)
print(f"{'_'*70}")
# Drop the loop variable so it does not keep an extra reference to the last table alive
del frame

### Convert date column to datetime to allow date operations.

In [None]:
# Parse the "date" column using the day.month.year pattern given in `format` and store the result back in place
train["date"] = pd.to_datetime(train["date"], format="%d.%m.%Y")

### Data Cleaning
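- Minor cleaning of the training dataframe: merge duplicate shops, keep only the shops that appear in the test set, and drop rows whose price or daily sales count is zero, negative or extremely large.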

In [None]:
# Merge some duplicate shops: each key is a shop_id that is relabelled to the shop_id it maps to
train["shop_id"] = train["shop_id"].replace({0: 57, 1: 58, 11: 10, 40: 39})
# Apply all row filters in a single pass:
#   - keep only shops that are in the test set
#   - keep only prices that are positive and below 50000
#   - keep only daily counts that are positive and below 1000 (zero and negative counts are dropped too)
# The explicit .copy() makes `train` an independent dataframe, so that adding columns
# to it later does not trigger pandas' SettingWithCopyWarning.
train = train.loc[
    train["shop_id"].isin(test["shop_id"].unique())
    & (train["item_price"] > 0)
    & (train["item_price"] < 50000)
    & (train["item_cnt_day"] > 0)
    & (train["item_cnt_day"] < 1000)
].copy()

### Preprocessing

In [None]:
def create_testlike_train(sales_train, test=None, test_block_num=34):
    """
    Convert the daily sales data into a monthly matrix that resembles the test set.

    The test set appears to be a cartesian product of shops and items, so for every
    month (``date_block_num``) found in ``sales_train`` one row is created for each
    combination of the shops and the items that have at least one row in that month.
    The monthly item count and revenue are then attached; combinations without sales
    get 0.

    Parameters
    ----------
    sales_train : pd.DataFrame
        Daily sales. The columns date_block_num, shop_id, item_id, item_price and
        item_cnt_day are read.
        NOTE: an ``item_revenue_day`` column (item_price * item_cnt_day) is added to
        this dataframe in place. This side effect is kept on purpose, because code
        outside this function may read that column.
    test : pd.DataFrame, optional
        Frame with shop_id and item_id columns. Its rows are appended to the matrix
        with ``date_block_num`` set to ``test_block_num`` and with count and revenue 0.
        The frame itself is not modified and no other column (such as "ID") is needed.
    test_block_num : int, default 34
        Month number assigned to the rows coming from ``test``.

    Returns
    -------
    pd.DataFrame
        Columns date_block_num, shop_id, item_id, item_cnt_month, item_revenue_month.
        The index is not reset after the test rows are appended, so the labels of the
        test rows repeat labels of earlier rows.
    """
    key_cols = ["date_block_num", "shop_id", "item_id"]

    # Cartesian product of the shops and items seen in each month
    index_parts = []
    for block in sales_train["date_block_num"].unique():
        # Select the rows of the month once and reuse them for both unique() calls
        month_sales = sales_train.loc[sales_train["date_block_num"] == block]
        combos = itertools.product(
            [block],
            month_sales["shop_id"].unique(),
            month_sales["item_id"].unique(),
        )
        index_parts.append(np.array(list(combos)))
    df = pd.DataFrame(data=np.concatenate(index_parts, axis=0), columns=key_cols)

    # Add revenue column to sales_train (documented in-place side effect)
    sales_train["item_revenue_day"] = sales_train["item_price"] * sales_train["item_cnt_day"]
    # Aggregate item_id / shop_id item_cnts and revenue at the month level
    sales_train_grouped = sales_train.groupby(key_cols).agg(
        item_cnt_month=pd.NamedAgg(column="item_cnt_day", aggfunc="sum"),
        item_revenue_month=pd.NamedAgg(column="item_revenue_day", aggfunc="sum"),
    )

    # Merge the grouped data with the index; the left join keeps every combination
    df = df.merge(sales_train_grouped, how="left", on=key_cols)

    if test is not None:
        # Work on a fresh frame holding only the key columns, so the caller's `test`
        # keeps its columns and dtypes
        test_index = test[["shop_id", "item_id"]].assign(date_block_num=test_block_num)
        df = pd.concat([df, test_index[key_cols]])

    # Fill empty item_cnt / revenue entries (unsold combinations and test rows) with 0
    df["item_cnt_month"] = df["item_cnt_month"].fillna(0)
    df["item_revenue_month"] = df["item_revenue_month"].fillna(0)

    return df

In [None]:
# Build the monthly shop/item matrix with the test rows appended
matrix = create_testlike_train(sales_train=train, test=test)
# The test table is not used directly any more, so release the name
del test

In [None]:
# Downcast the matrix columns; silent=False prints the memory usage before and after
matrix = reduce_mem_usage(matrix, silent=False)
# Remember which columns exist now; presumably passed to shrink_mem_new_cols later so that only newer columns get downcast
oldcols = matrix.columns

# Feature Engineering
- Predictor columns are added to the matrix

### Item name groups with fuzzywuzzy
Items in the items table are ordered alphabetically according to the item_name field, so that similar items are generally listed next to each other. For example, the first two items in the table below are the same game "Fuse" for two different consoles, followed by two different licensing options for the same internet security program. This ordering can be used to help group related items together.

In [None]:
# Display the first five rows of the items table whose item_id is greater than 3564
items.query("item_id>3564").head(5)

In [None]:
# import re

# from fuzzywuzzy import fuzz


# def add_item_name_groups(matrix, train, items, sim_thresh, feature_name="item_name_group"):
#     def partialmatchgroups(items, sim_thresh=sim_thresh):
#         def strip_brackets(string):
#             string = re.sub(r"\(.*?\)", "", string)
#             string = re.sub(r"\[.*?\]", "", string)
#             return string

#         items = items.copy()
#         items["nc"] = items.item_name.apply(strip_brackets)
#         items["ncnext"] = np.concatenate((items["nc"].to_numpy()[1:], np.array([""])))

#         def partialcompare(s):
#             return fuzz.partial_ratio(s["nc"], s["ncnext"])

#         items["partialmatch"] = items.apply(partialcompare, axis=1)
#         # Assign groups
#         grp = 0
#         for i in range(items.shape[0]):
#             items.loc[i, "partialmatchgroup"] = grp
#             if items.loc[i, "partialmatch"] < sim_thresh:
#                 grp += 1
#         items = items.drop(columns=["nc", "ncnext", "partialmatch"])
#         return items

#     items = partialmatchgroups(items)
#     items = items.rename(columns={"partialmatchgroup": feature_name})
#     items = items.drop(columns="partialmatchgroup", errors="ignore")

#     items[feature_name] = items[feature_name].apply(str)
#     items[feature_name] = items[feature_name].factorize()[0]
#     matrix = matrix.merge(items[["item_id", feature_name]], on="item_id", how="left")
#     train = train.merge(items[["item_id", feature_name]], on="item_id", how="left")
#     return matrix, train


# matrix, train = add_item_name_groups(matrix, train, items, 65)